In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [2]:
# Model / training hyperparameters shared by the cells below.
n_embed = 30      # number of ordinal bins per numerical feature (also the numeric embedding table size)
n_heads = 32      # attention heads per transformer block
d_model = 128     # embedding width of every feature token
# The concatenated head outputs are fed to a Linear(d_model, d_model) projection,
# so d_model must split evenly across the heads; fail early instead of silently truncating.
if d_model % n_heads != 0:
    raise ValueError(f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")
head_size = d_model//n_heads
dropout = 0.3     # dropout probability used throughout the model
batch_size = 128  # rows drawn per call to get_batch

In [11]:
# Load the training CSV and rename columns to short names. The prefix encodes the
# feature type: 'c_' = categorical, 'n_' = numerical (used below to split the features).
# NOTE(review): the name `input` shadows the Python builtin of the same name.
input = pd.read_csv('./data/train.csv', index_col=0)
input = input.rename(columns={
    'Marital status': 'c_marital_status',
    'Application mode': 'c_application_mode',
    'Application order': 'c_application_order',
    'Course': 'c_course',
    'Daytime/evening attendance': 'c_attendance',
    'Previous qualification': 'c_qualification',
    'Previous qualification (grade)': 'n_qualification',
    'Nacionality': 'c_nationality',
    "Mother's qualification": 'c_mqual',
    "Father's qualification": 'c_fqual',
    "Mother's occupation": 'c_mocup',
    "Father's occupation": 'c_focup',
    'Admission grade': 'n_grade',
    'Displaced': 'c_displaced',
    'Educational special needs': 'c_special_needs',
    'Debtor': 'c_debtor',
    'Tuition fees up to date': 'c_fees',
    'Gender': 'c_gender',
    'Scholarship holder': 'c_scholarship',
    'Age at enrollment': 'n_age',
    'International': 'c_international',
    'Curricular units 1st sem (credited)': 'n_cu1cr',
    'Curricular units 1st sem (enrolled)': 'n_cu1en',
    'Curricular units 1st sem (evaluations)': 'n_cu1ev',
    'Curricular units 1st sem (approved)': 'n_cu1ap',
    'Curricular units 1st sem (grade)': 'n_cu1gr',
    'Curricular units 1st sem (without evaluations)': 'n_cu1wo',
    'Curricular units 2nd sem (credited)': 'n_cu2cr',
    'Curricular units 2nd sem (enrolled)': 'n_cu2en',
    'Curricular units 2nd sem (evaluations)': 'n_cu2ev',
    'Curricular units 2nd sem (approved)': 'n_cu2ap',
    'Curricular units 2nd sem (grade)': 'n_cu2gr',
    'Curricular units 2nd sem (without evaluations)': 'n_cu2wo',
    'Unemployment rate': 'n_unemployment_rate',
    'Inflation rate': 'n_inflation_rate',
    'GDP': 'n_gdp'
    })
target = 'Target'
features = [col for col in input.columns if col != target]
categorical_features = [f for f in features if f.startswith('c_')]
numerical_features = [f for f in features if f.startswith('n_')]

# remove categorical outliers
# A category value is dropped when (its share of rows) * (number of distinct values
# in that column) < 0.01. Columns are filtered one after another, so the shares for
# later columns are computed on the rows that survived the earlier columns.
for c in categorical_features:
    temp = input[c].value_counts()/len(input)
    below_cutoff = temp[len(temp)*temp<0.01]
    if len(below_cutoff.index)>0:
        print(f'dropping {len(input[input[c].isin(below_cutoff.index)])} records of category {c}')
        input = input[~input[c].isin(below_cutoff.index)]

# split train and validation data (1% validation, stratified on the target, fixed seed)
input_train, input_val, target_train, target_val = train_test_split(
    input[features],
    input[target],
    test_size=0.01,
    random_state=42,
    stratify=input[target]
    )

# one-hot-encode categorical features
# The encoder is fitted on the training split only; handle_unknown='ignore' lets
# transform() accept category values that were not present when fitting.
ohe = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore'
    )
encoded_categorical_train_data = ohe.fit_transform(input_train[categorical_features])
encoded_categorical_val_data = ohe.transform(input_val[categorical_features])
encoded_categorical_feature_names = ohe.get_feature_names_out(input_train[categorical_features].columns)
# These frames are built from bare arrays, so they get a fresh default index
# rather than the index of input_train / input_val.
encoded_categorical_train_df = pd.DataFrame(encoded_categorical_train_data, columns=encoded_categorical_feature_names)
encoded_categorical_val_df = pd.DataFrame(encoded_categorical_val_data, columns=encoded_categorical_feature_names)

# discretize numerical features into n_embed uniform-width ordinal bins
# (not scaling: each value is replaced by its bin number, fitted on the training split)
disc = KBinsDiscretizer(
    n_bins=n_embed,
    encode='ordinal',
    strategy='uniform',
    subsample=None
)
discretized_numerical_train_data = disc.fit_transform(input_train[numerical_features])
discretized_numerical_val_data = disc.transform(input_val[numerical_features])
discretized_numerical_feature_names = disc.get_feature_names_out(input_train[numerical_features].columns)
discretized_numerical_train_df = pd.DataFrame(discretized_numerical_train_data, columns=discretized_numerical_feature_names)
discretized_numerical_val_df = pd.DataFrame(discretized_numerical_val_data, columns=discretized_numerical_feature_names)

# merge categorical and numerical features
# Both sides carry the same default index, so the index merge pairs rows by position;
# categorical columns come first, numerical columns second.
X_train = pd.merge(encoded_categorical_train_df, discretized_numerical_train_df, left_index=True, right_index=True)
X_val = pd.merge(encoded_categorical_val_df, discretized_numerical_val_df, left_index=True, right_index=True)

# one-hot-encode target
# target_ohe is kept so predictions can be mapped back to labels with inverse_transform.
target_ohe = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore'
    )
encoded_train_target = target_ohe.fit_transform(pd.DataFrame(target_train))
encoded_val_target = target_ohe.transform(pd.DataFrame(target_val))
encoded_target_names = target_ohe.get_feature_names_out(pd.DataFrame(target_train).columns)
y_train = pd.DataFrame(encoded_train_target, columns=encoded_target_names)
y_val = pd.DataFrame(encoded_val_target, columns=encoded_target_names)

# summary of the prepared data
print(f'{len(X_train)} data points in train set')
print(f'{len(X_val)} data points in validation set')
print(f'{len(X_train.columns)} features')

dropping 167 records of category c_marital_status
dropping 16 records of category c_application_mode
dropping 4 records of category c_application_order
dropping 2 records of category c_course
dropping 48 records of category c_qualification
dropping 104 records of category c_nationality
dropping 146 records of category c_mqual
dropping 115 records of category c_fqual
dropping 128 records of category c_mocup
dropping 141 records of category c_focup
dropping 279 records of category c_special_needs
74614 data points in train set
754 data points in validation set
159 features


In [4]:
# Feature matrices per split as int32 tensors (the values are used as embedding indices).
xs = {
    split: torch.tensor(frame.values, dtype=torch.int32)
    for split, frame in (('train', X_train), ('val', X_val))
}

# One-hot targets per split as float32 tensors.
ys = {
    split: torch.tensor(frame.values, dtype=torch.float32)
    for split, frame in (('train', y_train), ('val', y_val))
}

def get_batch(split):
    """Draw a random batch of `batch_size` rows from one split.

    Row positions are drawn independently with `torch.randint`, so a row can
    appear more than once in a batch.

    Args:
        split: either 'train' or 'val'.

    Returns:
        Tuple `(x, y)` of feature and target tensors, both moved to `device`.
    """
    assert split in ['train', 'val']
    features, labels = xs[split], ys[split]
    idx = torch.randint(len(features), (batch_size,))
    return features[idx].to(device), labels[idx].to(device)

In [5]:
class Head(nn.Module):
    """Single (unmasked) self-attention head.

    Args:
        head_size: output width of the key/query/value projections.
        dropout: dropout probability applied to the attention weights.

    The projections read inputs of width `d_model` (module-level setting).
    """
    def __init__(self, head_size, dropout):
        super().__init__()
        self.key = nn.Linear(d_model, head_size)
        self.query = nn.Linear(d_model, head_size)
        self.value = nn.Linear(d_model, head_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over the second-to-last axis of `x`.

        Args:
            x: tensor of shape (B, T, d_model).

        Returns:
            Tensor of shape (B, T, head_size).
        """
        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        v = self.value(x) # (B, T, head_size)
        # Scale by the width of the vectors that enter the dot product (head_size),
        # not by the input width: the dot product sums head_size terms, so this is
        # the factor that keeps the scores at roughly unit variance.
        scale = k.shape[-1] ** -0.5
        # Scores are k @ q^T (row i = key of token i against every query); this is
        # ordinary attention with the roles of the two projections swapped.
        w = k @ q.transpose(-2, -1) * scale # (B, T, T)
        w = F.softmax(w, dim=-1) # (B, T, T), each row sums to 1
        w = self.dropout(w)
        out = w @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
        return out
    
class MultiHeadAttention(nn.Module):
    """Runs several attention heads side by side and mixes their outputs.

    Args:
        head_size: output width of each head.
        n_heads: number of heads.
        d_model: input and output width of the final projection; the
            concatenated head outputs (n_heads * head_size wide) must match it.
        dropout: dropout probability, used inside each head and after the projection.
    """
    def __init__(self, head_size, n_heads, d_model, dropout):
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size, dropout) for _ in range(n_heads))
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, concatenate on the last axis, project, drop out."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(merged))
    
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * d_model, ReLU, project back, dropout.

    Args:
        d_model: input and output width.
        dropout: dropout probability applied after the second linear layer.
    """
    def __init__(self, d_model, dropout):
        super().__init__()
        hidden = 4 * d_model
        layers = [
            nn.Linear(d_model, hidden),
            nn.ReLU(),
            nn.Linear(hidden, d_model),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the MLP applied to the last axis of `x`."""
        return self.net(x)
    
class Block(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    Args:
        head_size: output width of each attention head.
        d_model: token width (also the LayerNorm width).
        n_heads: number of attention heads.
        dropout: dropout probability passed to both sub-layers.
    """
    def __init__(self, head_size, d_model, n_heads, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(head_size, n_heads, d_model, dropout)
        self.ff = FeedForward(d_model, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        # LayerNorm is applied to the sub-layer input; the residual path stays un-normalized.
        attended = x + self.attention(self.ln1(x))
        return attended + self.ff(self.ln2(attended))

In [6]:
class Model(nn.Module):
    """Transformer classifier over tabular feature tokens.

    Each input column becomes one token: one-hot categorical columns (values 0/1)
    go through `embedcat`, discretized numerical columns (bin numbers below
    `n_embed`) go through `embednum`. The tokens pass through four `Block`s, are
    flattened per row and mapped to 3 logits.

    Sizes come from module-level settings: `d_model`, `n_embed`, `head_size`,
    `n_heads`, `dropout`, and the lengths of `encoded_categorical_feature_names`
    and `discretized_numerical_feature_names`.
    """
    def __init__(self):
        super().__init__()
        # Column counts are captured once so forward() does not depend on globals.
        self.n_cat = len(encoded_categorical_feature_names)
        self.n_num = len(discretized_numerical_feature_names)
        self.embedcat = nn.Embedding(2, d_model)
        self.embednum = nn.Embedding(n_embed, d_model)
        self.blocks = nn.Sequential(
            Block(head_size, d_model, n_heads, dropout),
            Block(head_size, d_model, n_heads, dropout),
            Block(head_size, d_model, n_heads, dropout),
            Block(head_size, d_model, n_heads, dropout)
        )
        self.linear = nn.Linear(d_model*(self.n_cat+self.n_num), 3)

    def forward(self, x, y=None):
        """Compute logits and, when targets are given, the loss.

        Args:
            x: integer tensor of shape (B, n_cat + n_num); categorical columns
                first, numerical columns after them.
            y: optional float tensor of shape (B, 3) with one-hot targets.

        Returns:
            Tuple `(logits, loss)`. `logits` always has shape (B, 3), including
            for B == 1 and B == 0; `loss` is None when `y` is None.
        """
        xcat = x[:, :self.n_cat]
        xnum = x[:, self.n_cat:self.n_cat+self.n_num]
        ecat = self.embedcat(xcat)
        enum = self.embednum(xnum)
        out = torch.cat([ecat, enum], dim=1)  # (B, n_cat + n_num, d_model)
        # flatten() keeps the batch axis well-defined even for an empty batch,
        # where view(-1, N) cannot infer the batch size.
        out = self.blocks(out).flatten(start_dim=1)
        # No squeeze(): it would drop the batch axis for a single-row batch and
        # break both the loss below and callers that index dim=1 of the logits.
        out = self.linear(out)

        if y is None:
            loss = None
        else:
            # NOTE(review): this scores the 3 outputs as independent sigmoids while
            # callers pick the class by argmax; F.cross_entropy may fit better.
            # Left unchanged so loss values stay comparable.
            loss = F.binary_cross_entropy_with_logits(out, y)
        return out, loss

In [7]:
# build the model on the selected device and put it in training mode
m = Model().to(device)
m.train()

# group trainable parameters: tensors with 2+ dimensions (weight matrices and
# embedding tables) get weight decay, 1-D tensors (biases, LayerNorm) do not
param_dict = {name: param for name, param in m.named_parameters() if param.requires_grad}
decay_params = [param for param in param_dict.values() if param.dim() >= 2]
nodecay_params = [param for param in param_dict.values() if param.dim() < 2]
optim_groups = [
    {'params': decay_params, 'weight_decay': 0.1},
    {'params': nodecay_params, 'weight_decay': 0.0},
]

# AdamW over the two parameter groups
optimizer = optim.AdamW(optim_groups, lr=5e-4)

# exponential decay: the learning rate is multiplied by gamma on every lr_schedule.step()
lr_schedule = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

In [8]:
train_losses, val_losses, val_acc_scores = [], [], []
n_eval = 100
for i in tqdm(range(1001)):
    # one optimisation step on a random training batch
    m.train()
    x, y = get_batch('train')
    logits, loss = m(x, y)
    optimizer.zero_grad()
    loss.backward()
    train_losses.append(loss.item())
    optimizer.step()

    # score one random validation batch after every step
    m.eval()
    x, y = get_batch('val')
    with torch.no_grad():
        logits, loss = m(x, y)
        val_losses.append(loss.item())
        # compare class indices; for one-hot targets this equals row-wise one-hot equality
        predicted_class = F.softmax(logits, dim=1).argmax(dim=1)
        true_class = y.argmax(dim=1)
        score = accuracy_score(true_class.tolist(), predicted_class.tolist())
        val_acc_scores.append(score)

    # every n_eval steps, report means over the last n_eval recorded values
    if i % n_eval == 0:
        tqdm.write(f"step {i+1}: train loss {np.mean(train_losses[-n_eval:]):.5f}, val loss {np.mean(val_losses[-n_eval:]):.5f}, validation accuracy score {np.mean(val_acc_scores[-n_eval:]):.5f}, current learning rate {lr_schedule.get_last_lr()[0]:.7f}")
    # the schedule advances once per training step
    lr_schedule.step()

  0%|          | 0/1001 [00:00<?, ?it/s]

step 1: train loss 0.83305, val loss 2.80231, validation accuracy score 0.51562, current learning rate 0.0005000
step 101: train loss 0.42741, val loss 0.38140, validation accuracy score 0.77758, current learning rate 0.0001830
step 201: train loss 0.30297, val loss 0.27888, validation accuracy score 0.82875, current learning rate 0.0000670
step 301: train loss 0.28962, val loss 0.27820, validation accuracy score 0.82281, current learning rate 0.0000245
step 401: train loss 0.28807, val loss 0.27622, validation accuracy score 0.82711, current learning rate 0.0000090
step 501: train loss 0.29018, val loss 0.27672, validation accuracy score 0.82563, current learning rate 0.0000033
step 601: train loss 0.28846, val loss 0.27308, validation accuracy score 0.82719, current learning rate 0.0000012
step 701: train loss 0.28921, val loss 0.26921, validation accuracy score 0.83320, current learning rate 0.0000004
step 801: train loss 0.28918, val loss 0.27250, validation accuracy score 0.83078,

In [12]:
# short column names for the test set; 'c_' marks categorical, 'n_' numerical columns
test_column_aliases = {
    'Marital status': 'c_marital_status',
    'Application mode': 'c_application_mode',
    'Application order': 'c_application_order',
    'Course': 'c_course',
    'Daytime/evening attendance': 'c_attendance',
    'Previous qualification': 'c_qualification',
    'Previous qualification (grade)': 'n_qualification',
    'Nacionality': 'c_nationality',
    "Mother's qualification": 'c_mqual',
    "Father's qualification": 'c_fqual',
    "Mother's occupation": 'c_mocup',
    "Father's occupation": 'c_focup',
    'Admission grade': 'n_grade',
    'Displaced': 'c_displaced',
    'Educational special needs': 'c_special_needs',
    'Debtor': 'c_debtor',
    'Tuition fees up to date': 'c_fees',
    'Gender': 'c_gender',
    'Scholarship holder': 'c_scholarship',
    'Age at enrollment': 'n_age',
    'International': 'c_international',
    'Curricular units 1st sem (credited)': 'n_cu1cr',
    'Curricular units 1st sem (enrolled)': 'n_cu1en',
    'Curricular units 1st sem (evaluations)': 'n_cu1ev',
    'Curricular units 1st sem (approved)': 'n_cu1ap',
    'Curricular units 1st sem (grade)': 'n_cu1gr',
    'Curricular units 1st sem (without evaluations)': 'n_cu1wo',
    'Curricular units 2nd sem (credited)': 'n_cu2cr',
    'Curricular units 2nd sem (enrolled)': 'n_cu2en',
    'Curricular units 2nd sem (evaluations)': 'n_cu2ev',
    'Curricular units 2nd sem (approved)': 'n_cu2ap',
    'Curricular units 2nd sem (grade)': 'n_cu2gr',
    'Curricular units 2nd sem (without evaluations)': 'n_cu2wo',
    'Unemployment rate': 'n_unemployment_rate',
    'Inflation rate': 'n_inflation_rate',
    'GDP': 'n_gdp',
}
input_test = pd.read_csv('./data/test.csv', index_col=0).rename(columns=test_column_aliases)

# one-hot-encode categorical features with the already-fitted encoder
encoded_categorical_test_data = ohe.transform(input_test[categorical_features])
encoded_categorical_test_df = pd.DataFrame(
    encoded_categorical_test_data,
    columns=encoded_categorical_feature_names,
)

# discretize numerical features with the already-fitted discretizer
discretized_numerical_test_data = disc.transform(input_test[numerical_features])
discretized_numerical_test_df = pd.DataFrame(
    discretized_numerical_test_data,
    columns=discretized_numerical_feature_names,
)

# join the two frames on their (default) index: categorical columns first, numerical second
X_test = encoded_categorical_test_df.merge(
    discretized_numerical_test_df,
    left_index=True,
    right_index=True,
)

print(f'{len(X_test)} data points in test set')
print(f'{len(X_test.columns)} features')

51012 data points in test set
159 features


In [84]:
# Predict the test set in chunks and write the submission file.
chunksize = 100
pred = []
m.eval()
with torch.no_grad():
    # Step by row offset: the final partial chunk is included, and no empty chunk
    # is produced when len(X_test) is an exact multiple of chunksize.
    for start in tqdm(range(0, len(X_test), chunksize)):
        x = X_test.iloc[start:start + chunksize]
        x = torch.tensor(x.values, dtype=torch.int32).to(device)
        logits, _ = m(x)
        # Keep an explicit batch axis so a one-row chunk is still (1, n_classes).
        logits = logits.reshape(len(x), -1)
        # argmax of the logits selects the same class as argmax of their softmax;
        # the one-hot rows are what target_ohe.inverse_transform expects.
        pred += F.one_hot(torch.argmax(logits, dim=1), num_classes=3).tolist()
pd.DataFrame(target_ohe.inverse_transform(pred), index=input_test.index, columns=['Target']).to_csv('./data/submission.csv')

  0%|          | 0/511 [00:00<?, ?it/s]