In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import altair as alt



from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder


from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA


## Feature Engineering - Column Transformer and Pipeline Setup

In [2]:
# Raw student records; the path is relative to the notebook's working directory.
student_csv_path = '../data/student.csv'
student_df = pd.read_csv(student_csv_path)

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across kernel restarts.
train_df, test_df = train_test_split(
    student_df,
    test_size=0.2,
    random_state=123,
)

In [4]:
### Decode the integer 'Marital status' codes into readable labels
### Categorical variable
status_mapping = {
    1: 'single',
    2: 'married',
    3: 'widower',
    4: 'divorced',
    5: 'facto union',
    6: 'legally separated'
}

# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on already-decoded labels would silently blank the
# column. Decode only while the column still holds the numeric codes, which
# makes the cell safe to execute more than once.
for split_df in (train_df, test_df):
    if pd.api.types.is_numeric_dtype(split_df['Marital status']):
        split_df['Marital status'] = split_df['Marital status'].map(status_mapping)

In [5]:
### Decode the integer 'Course' codes into course names for easier interpretation
### Categorical variable

course_mapping = {
    33: 'Biofuel Production Technologies',
    171: 'Animation and Multimedia Design',
    8014: 'Social Service (evening attendance)',
    9003: 'Agronomy',
    9070: 'Communication Design',
    9085: 'Veterinary Nursing',
    9119: 'Informatics Engineering',
    9130: 'Equinculture',
    9147: 'Management',
    9238: 'Social Service',
    9254: 'Tourism',
    9500: 'Nursing',
    9556: 'Oral Hygiene',
    9670: 'Advertising and Marketing Management',
    9773: 'Journalism and Communication',
    9853: 'Basic Education',
    9991: 'Management (evening attendance)'
}

# Apply the mapping.
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on already-decoded names would silently blank the
# column. Decode only while the column still holds the numeric codes.
for split_df in (train_df, test_df):
    if pd.api.types.is_numeric_dtype(split_df['Course']):
        split_df['Course'] = split_df['Course'].map(course_mapping)


In [6]:
### Decode the daytime/evening attendance flag and give the column a clean name
### binary variable

# Kept under its own name so the course-code mapping defined earlier is not
# overwritten by this unrelated dictionary.
attendance_mapping = {
    0: 'evening',
    1: 'daytime',
}

# The raw header ends with a tab character; the clean name is the one listed
# in `binary_features`.
raw_attendance_col = 'Daytime/evening attendance\t'
attendance_col = 'Daytime evening attendance'

# Apply the mapping and rename.
# The presence check makes the cell re-runnable: once the raw column has been
# renamed there is nothing left to decode, and indexing it would raise KeyError.
for split_df in (train_df, test_df):
    if raw_attendance_col in split_df.columns:
        split_df[raw_attendance_col] = split_df[raw_attendance_col].map(attendance_mapping)
        split_df.rename(columns={raw_attendance_col: attendance_col}, inplace=True)



In [7]:
### Decode the integer 'Nacionality' codes into nationality names for easier interpretation
### Categorical variable

nation_mapping = {
    1: 'Portuguese',
    2: 'German',
    6: 'Spanish',
    11: 'Italian',
    13: 'Dutch',
    14: 'English',
    17: 'Lithuanian',
    21: 'Angolan',
    22: 'Cape Verdean',
    24: 'Guinean',
    25: 'Mozambican',
    26: 'Santomean',
    32: 'Turkish',
    41: 'Brazilian',
    62: 'Romanian',
    100: 'Moldova (Republic of)',
    101: 'Mexican',
    103: 'Ukrainian',
    105: 'Russian',
    108: 'Cuban',
    109: 'Colombian'
}

# Apply the mapping.
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on already-decoded names would silently blank the
# column. Decode only while the column still holds the numeric codes.
for split_df in (train_df, test_df):
    if pd.api.types.is_numeric_dtype(split_df['Nacionality']):
        split_df['Nacionality'] = split_df['Nacionality'].map(nation_mapping)


In [8]:
# Column groups consumed by the preprocessing column transformer.

# The per-semester "Curricular units" columns share one naming pattern, so
# they are generated: all metrics of the 1st semester, then all of the 2nd.
_semesters = ('1st', '2nd')
_unit_metrics = (
    'credited',
    'enrolled',
    'evaluations',
    'approved',
    'grade',
    'without evaluations',
)

numeric_features = (
    ['Previous qualification (grade)', 'Admission grade', 'Age at enrollment']
    + [
        f'Curricular units {semester} sem ({metric})'
        for semester in _semesters
        for metric in _unit_metrics
    ]
    + ['Unemployment rate', 'Inflation rate', 'GDP']
)

categorical_features = [
    'Marital status',
    'Application mode',
    'Course',
    'Nacionality',
    "Mother's occupation",
    "Father's occupation",
]

ordinal_features = [
    'Application order',
    'Previous qualification',
    "Mother's qualification",
    "Father's qualification",
]

binary_features = [
    'Daytime evening attendance',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

# Name of the label column.
target = "Target"


In [9]:
# Standardise the numeric columns.
numeric_transformer = StandardScaler()

# One-hot encode into dense integer indicator columns. Categories seen fewer
# than 10 times are pooled as infrequent, and categories unseen during fit
# are ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
    min_frequency=10,
    sparse_output=False,
    dtype=int,
)

# Encode each category as an integer; categories unseen during fit get -1.
ordinal_transformer = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    dtype=np.int64,
)

In [10]:
# (transformer, columns) pairs, in the order their outputs appear in the
# encoded matrix. Binary columns go through the one-hot encoder together
# with the categorical ones.
column_steps = [
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features + binary_features),
    (ordinal_transformer, ordinal_features),
]

preprocessor = make_column_transformer(*column_steps)
preprocessor

In [11]:
# Separate the predictors from the label. The label column name comes from
# the `target` constant defined alongside the feature lists, so it is spelled
# in one place only.
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_test, y_test = test_df.drop(columns=[target]), test_df[target]

In [12]:
# Drop the "<transformer>__" prefix so the encoded columns keep readable names.
preprocessor.set_params(verbose_feature_names_out=False)

# Fit the preprocessor on the training split and encode it.
X_train_encoded_values = preprocessor.fit_transform(X_train)

# Ask the fitted transformer for its output names instead of concatenating
# the feature lists by hand: the hand-built list must mirror the transformer
# order inside `preprocessor`, and silently mislabels columns when it doesn't.
column_name = preprocessor.get_feature_names_out().tolist()

# NOTE: the frame gets a fresh 0..n-1 index, so it lines up with y_train by
# row position only, not by index label.
X_train_enc = pd.DataFrame(X_train_encoded_values, columns=column_name)

X_train_enc.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),...,Gender_0,Gender_1,Scholarship holder_0,Scholarship holder_1,International_0,International_1,Application order,Previous qualification,Mother's qualification,Father's qualification
0,-0.0403,0.245415,-0.694386,-0.299361,-0.109831,-0.541173,0.416328,0.593152,-0.198645,-0.279561,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,12.0,23.0
1,-0.575696,-0.818901,-0.300945,-0.299361,-0.50972,-0.541173,-1.51345,-2.191899,-0.198645,-0.279561,...,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,12.0
2,0.342125,0.399461,-0.694386,-0.299361,0.290058,-0.305455,0.737958,0.66683,-0.198645,-0.279561,...,1.0,0.0,0.0,1.0,1.0,0.0,5.0,0.0,0.0,0.0
3,0.26564,0.238413,-0.694386,-0.299361,0.290058,-0.305455,0.416328,0.407482,-0.198645,-0.279561,...,1.0,0.0,0.0,1.0,1.0,0.0,5.0,0.0,12.0,23.0
4,-1.799457,-1.302045,-0.300945,-0.299361,-0.109831,1.580293,-0.870191,-0.025749,-0.198645,-0.279561,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,12.0,22.0
