In [99]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Choose the compute backend in order of preference: CUDA, then Apple MPS,
# then plain CPU. The MPS check only runs when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [131]:
# Load the training table (first CSV column becomes the index) and give every
# column a short alias with a type prefix:
#   c_* -> categorical feature, n_* -> numerical feature.
# The prefixes drive the feature-type split below, so a feature column left
# without a prefix would end up in neither typed list.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept here because other cells may reference it.
input = pd.read_csv('./data/train.csv', index_col=0).rename(columns={
    # application and family background
    'Marital status': 'c_marital_status',
    'Application mode': 'c_application_mode',
    'Application order': 'c_application_order',
    'Course': 'c_course',
    'Daytime/evening attendance': 'c_attendance',
    'Previous qualification': 'c_qualification',
    'Previous qualification (grade)': 'n_qualification',
    'Nacionality': 'c_nationality',
    "Mother's qualification": 'c_mqual',
    "Father's qualification": 'c_fqual',
    "Mother's occupation": 'c_mocup',
    "Father's occupation": 'c_focup',
    'Admission grade': 'n_grade',
    'Displaced': 'c_displaced',
    'Educational special needs': 'c_special_needs',
    'Debtor': 'c_debtor',
    'Tuition fees up to date': 'c_fees',
    'Gender': 'c_gender',
    'Scholarship holder': 'c_scholarship',
    'Age at enrollment': 'n_age',
    'International': 'c_international',
    # curricular units, 1st semester
    'Curricular units 1st sem (credited)': 'n_cu1cr',
    'Curricular units 1st sem (enrolled)': 'n_cu1en',
    'Curricular units 1st sem (evaluations)': 'n_cu1ev',
    'Curricular units 1st sem (approved)': 'n_cu1ap',
    'Curricular units 1st sem (grade)': 'n_cu1gr',
    'Curricular units 1st sem (without evaluations)': 'n_cu1wo',
    # curricular units, 2nd semester
    'Curricular units 2nd sem (credited)': 'n_cu2cr',
    'Curricular units 2nd sem (enrolled)': 'n_cu2en',
    'Curricular units 2nd sem (evaluations)': 'n_cu2ev',
    'Curricular units 2nd sem (approved)': 'n_cu2ap',
    'Curricular units 2nd sem (grade)': 'n_cu2gr',
    'Curricular units 2nd sem (without evaluations)': 'n_cu2wo',
    # economic indicators
    'Unemployment rate': 'n_unemployment_rate',
    'Inflation rate': 'n_inflation_rate',
    'GDP': 'n_gdp',
})

target = 'Target'
# Iterating a DataFrame yields its column labels; everything except the
# target column is treated as a feature.
features = [name for name in input if name != target]
categorical_features = [name for name in features if name.startswith('c_')]
numerical_features = [name for name in features if name.startswith('n_')]

# Drop rows that carry a rare level in any categorical column. A level is
# rare when its share of the rows, multiplied by the number of distinct levels
# in that column, is below 0.01. The filter is cumulative: each column is
# evaluated on the rows that survived the columns before it.
for c in categorical_features:
    temp = input[c].value_counts() / len(input)
    below_cutoff = temp[temp * len(temp) < 0.01]
    if below_cutoff.empty:
        continue
    # Build the membership mask once and reuse it for the count and the filter.
    rare_rows = input[c].isin(below_cutoff.index)
    print(f'dropping {int(rare_rows.sum())} records of category {c}')
    input = input[~rare_rows]

# Hold out 20% of the rows for validation. The split is stratified on the
# target column and uses a fixed seed so it is repeatable.
input_train, input_val, target_train, target_val = train_test_split(
    input.loc[:, features],
    input.loc[:, target],
    stratify=input.loc[:, target],
    test_size=0.2,
    random_state=42,
)

# One-hot-encode the categorical features.
# The encoder is fitted on the training split only. With the default
# handle_unknown='error', transform() raises ValueError as soon as it meets a
# category level that was absent at fit time. handle_unknown='ignore' encodes
# such a level as an all-zero group instead, so the fitted encoder can also be
# applied to data that still contains levels missing from the training split
# (for instance levels that the rare-category filter removed from it).
# When no unknown level occurs the output is unchanged.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_categorical_train_data = ohe.fit_transform(input_train[categorical_features])
encoded_categorical_val_data = ohe.transform(input_val[categorical_features])
# The encoder was fitted on a DataFrame, so it already knows the input column
# names and derives the "<column>_<level>" output names from them.
encoded_categorical_feature_names = ohe.get_feature_names_out()
# The frames below get a fresh positional 0..n-1 index, not the original row ids.
encoded_categorical_train_df = pd.DataFrame(encoded_categorical_train_data, columns=encoded_categorical_feature_names)
encoded_categorical_val_df = pd.DataFrame(encoded_categorical_val_data, columns=encoded_categorical_feature_names)

# Standardise the numerical features. The scaler is fitted on the training
# split only and then applied to both splits.
ss = StandardScaler().fit(input_train[numerical_features])
scaled_numerical_feature_names = ss.get_feature_names_out(numerical_features)
scaled_numerical_train_data = ss.transform(input_train[numerical_features])
scaled_numerical_val_data = ss.transform(input_val[numerical_features])
# Wrap both arrays in frames with a fresh positional index.
scaled_numerical_train_df, scaled_numerical_val_df = (
    pd.DataFrame(scaled, columns=scaled_numerical_feature_names)
    for scaled in (scaled_numerical_train_data, scaled_numerical_val_data)
)

# Place the encoded categorical columns and the scaled numerical columns side
# by side. Both frames of a split share the same positional index, so an
# inner index join pairs them row by row.
X_train = encoded_categorical_train_df.join(scaled_numerical_train_df, how='inner')
X_val = encoded_categorical_val_df.join(scaled_numerical_val_df, how='inner')

# One-hot-encode the target labels; the encoder is fitted on the training
# labels and reused for the validation labels.
target_ohe = OneHotEncoder(sparse_output=False)
encoded_train_target = target_ohe.fit_transform(target_train.to_frame())
encoded_val_target = target_ohe.transform(target_val.to_frame())
encoded_target_names = target_ohe.get_feature_names_out([target_train.name])
# Wrap both arrays in frames with a fresh positional index.
y_train, y_val = (
    pd.DataFrame(encoded, columns=encoded_target_names)
    for encoded in (encoded_train_target, encoded_val_target)
)

# Report the size of each split and the width of the feature matrix.
print(f'{X_train.shape[0]} data points in train set')
print(f'{X_val.shape[0]} data points in validation set')
print(f'{X_train.shape[1]} features')

dropping 167 records of category c_marital_status
dropping 16 records of category c_application_mode
dropping 4 records of category c_application_order
dropping 2 records of category c_course
dropping 48 records of category c_qualification
dropping 104 records of category c_nationality
dropping 146 records of category c_mqual
dropping 115 records of category c_fqual
dropping 128 records of category c_mocup
dropping 141 records of category c_focup
dropping 279 records of category c_special_needs
60294 data points in train set
15074 data points in validation set
159 features
