In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
from dataclasses import dataclass
from importlib import import_module

# Choose the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [2]:
@dataclass
class Model_Parameters:
    d_model: int = 32 # dimension of model
    n_embed: int = 16 # dimension of embedding
    n_heads: int = 8 # number of heads
    head_size: int = n_embed//n_heads # head size
    dropout: float = 0.3 # dropout rate
    n_in_cat: int = 141 # number of categorical columns in input tensor
    n_in_num: int = 18 # number of numberical columns in input tensor

@dataclass
class Optim_Parameters:
    """Optimizer and learning-rate-schedule settings.

    Attributes:
        lr: initial learning rate.
        gamma: decay factor (gamma) of the exponential learning rate schedule.
        wd: weight decay.
    """
    lr: float = 3e-4
    gamma: float = 0.9999
    wd: float = 0.1

@dataclass
class Train_Parameters:
    """Settings that control batching, the validation split and loop length.

    Attributes:
        batch_size: number of examples per batch.
        test_size: relative size of the validation split.
        n_eval: evaluate model performance every ``n_eval`` steps.
        max_steps: maximum number of steps in training.
    """
    batch_size: int = 256
    test_size: float = 0.1
    n_eval: int = 100
    max_steps: int = 1001

# One default-constructed instance per configuration group, shared by the cells below.
mparam, oparam, tparam = Model_Parameters(), Optim_Parameters(), Train_Parameters()

In [3]:
# Short aliases for the raw CSV headers. The prefix encodes how a column is
# treated further down: 'c_' = categorical, 'n_' = numerical.
column_aliases = {
    'Marital status': 'c_marital_status',
    'Application mode': 'c_application_mode',
    'Application order': 'c_application_order',
    'Course': 'c_course',
    'Daytime/evening attendance': 'c_attendance',
    'Previous qualification': 'c_qualification',
    'Previous qualification (grade)': 'n_qualification',
    'Nacionality': 'c_nationality',
    "Mother's qualification": 'c_mqual',
    "Father's qualification": 'c_fqual',
    "Mother's occupation": 'c_mocup',
    "Father's occupation": 'c_focup',
    'Admission grade': 'n_grade',
    'Displaced': 'c_displaced',
    'Educational special needs': 'c_special_needs',
    'Debtor': 'c_debtor',
    'Tuition fees up to date': 'c_fees',
    'Gender': 'c_gender',
    'Scholarship holder': 'c_scholarship',
    'Age at enrollment': 'n_age',
    'International': 'c_international',
    'Curricular units 1st sem (credited)': 'n_cu1cr',
    'Curricular units 1st sem (enrolled)': 'n_cu1en',
    'Curricular units 1st sem (evaluations)': 'n_cu1ev',
    'Curricular units 1st sem (approved)': 'n_cu1ap',
    'Curricular units 1st sem (grade)': 'n_cu1gr',
    'Curricular units 1st sem (without evaluations)': 'n_cu1wo',
    'Curricular units 2nd sem (credited)': 'n_cu2cr',
    'Curricular units 2nd sem (enrolled)': 'n_cu2en',
    'Curricular units 2nd sem (evaluations)': 'n_cu2ev',
    'Curricular units 2nd sem (approved)': 'n_cu2ap',
    'Curricular units 2nd sem (grade)': 'n_cu2gr',
    'Curricular units 2nd sem (without evaluations)': 'n_cu2wo',
    'Unemployment rate': 'n_unemployment_rate',
    'Inflation rate': 'n_inflation_rate',
    'GDP': 'n_gdp',
}
input = pd.read_csv('./data/train.csv', index_col=0).rename(columns=column_aliases)

# Every column except the label is a feature; the prefix decides its group.
target = 'Target'
features = [name for name in input.columns if name != target]
categorical_features = [name for name in features if name.startswith('c_')]
numerical_features = [name for name in features if name.startswith('n_')]

# remove categorical outliers: a level is dropped when its relative frequency,
# multiplied by the number of levels of that feature, is below 0.01.
# Frequencies are recomputed per feature on the already-filtered frame.
for feature in categorical_features:
    level_share = input[feature].value_counts() / len(input)
    rare_levels = level_share[level_share * len(level_share) < 0.01].index
    if len(rare_levels) == 0:
        continue
    is_rare = input[feature].isin(rare_levels)
    print(f'dropping {int(is_rare.sum())} records of category {feature}')
    input = input[~is_rare]

# split train and validation data, stratified on the label with a fixed seed
split_options = dict(
    test_size=tparam.test_size,
    random_state=42,
    stratify=input[target],
)
input_train, input_val, target_train, target_val = train_test_split(
    input[features], input[target], **split_options
)

# one-hot-encode categorical features; the encoder is fitted on the training
# split only and then reused for the validation split
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_categoricals = input_train[categorical_features]
encoded_categorical_train_data = ohe.fit_transform(train_categoricals)
encoded_categorical_val_data = ohe.transform(input_val[categorical_features])
encoded_categorical_feature_names = ohe.get_feature_names_out(train_categoricals.columns)
encoded_categorical_train_df, encoded_categorical_val_df = (
    pd.DataFrame(encoded, columns=encoded_categorical_feature_names)
    for encoded in (encoded_categorical_train_data, encoded_categorical_val_data)
)

# discretize numerical features into mparam.n_embed ordinal bins
# (strategy='uniform'); bin edges are fitted on the training split only
disc = KBinsDiscretizer(
    n_bins=mparam.n_embed, encode='ordinal', strategy='uniform', subsample=None
)
train_numericals = input_train[numerical_features]
discretized_numerical_train_data = disc.fit_transform(train_numericals)
discretized_numerical_val_data = disc.transform(input_val[numerical_features])
discretized_numerical_feature_names = disc.get_feature_names_out(train_numericals.columns)
discretized_numerical_train_df, discretized_numerical_val_df = (
    pd.DataFrame(binned, columns=discretized_numerical_feature_names)
    for binned in (discretized_numerical_train_data, discretized_numerical_val_data)
)

# merge categorical and numerical features column-wise, matching rows on the index
on_row_index = dict(left_index=True, right_index=True)
X_train = pd.merge(encoded_categorical_train_df, discretized_numerical_train_df, **on_row_index)
X_val = pd.merge(encoded_categorical_val_df, discretized_numerical_val_df, **on_row_index)

# one-hot-encode target (fitted on the training labels, reused for validation)
target_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
target_train_frame = pd.DataFrame(target_train)
encoded_train_target = target_ohe.fit_transform(target_train_frame)
encoded_val_target = target_ohe.transform(pd.DataFrame(target_val))
encoded_target_names = target_ohe.get_feature_names_out(target_train_frame.columns)
y_train, y_val = (
    pd.DataFrame(encoded, columns=encoded_target_names)
    for encoded in (encoded_train_target, encoded_val_target)
)

# report the size of both splits and the width of the feature matrix
n_train, n_val, n_features = len(X_train), len(X_val), len(X_train.columns)
print(f'{n_train} data points in train set')
print(f'{n_val} data points in validation set')
print(f'{n_features} features')

dropping 167 records of category c_marital_status
dropping 16 records of category c_application_mode
dropping 4 records of category c_application_order
dropping 2 records of category c_course
dropping 48 records of category c_qualification
dropping 104 records of category c_nationality
dropping 146 records of category c_mqual
dropping 115 records of category c_fqual
dropping 128 records of category c_mocup
dropping 141 records of category c_focup
dropping 279 records of category c_special_needs
67831 data points in train set
7537 data points in validation set
159 features


In [4]:
# Full-split tensors keyed by split name. Features are cast to int32 and
# labels to float32; no device is given here, get_batch moves each batch.
feature_frames = {'train': X_train, 'val': X_val}
label_frames = {'train': y_train, 'val': y_val}

xs = {
    split: torch.tensor(frame.values, dtype=torch.int32)
    for split, frame in feature_frames.items()
}

ys = {
    split: torch.tensor(frame.values, dtype=torch.float32)
    for split, frame in label_frames.items()
}

def get_batch(split):
    """Return one random (features, labels) batch of the given split on `device`.

    `split` must be 'train' or 'val'. `tparam.batch_size` row indices are drawn
    with `torch.randint`, so a row can appear more than once in a batch.
    """
    assert split in ['train', 'val']
    features, labels = xs[split], ys[split]
    rows = torch.randint(len(features), (tparam.batch_size,))
    return features[rows].to(device), labels[rows].to(device)

In [5]:
# Seed torch's global RNG so weight initialisation and the torch.randint batch
# sampling repeat across runs (same seed value as the train/validation split).
torch.manual_seed(42)

# the input widths are only known after encoding, so overwrite the dataclass defaults
mparam.n_in_cat = len(encoded_categorical_feature_names)
mparam.n_in_num = len(discretized_numerical_feature_names)

# instantiate model: the class is looked up by name in the module
# architectures.<architecture>, which must expose a class of the same name
architecture = 'Transformer'
module_name = f"architectures.{architecture}"
module = import_module(module_name)
model_class = getattr(module, architecture)
m = model_class(mparam)
m = m.to(device)
print(f'Number of params in {architecture} model: {sum(p.numel() for p in m.parameters())}')
m.train()

# define weight decaying parameters: trainable tensors with at least two
# dimensions get weight decay, tensors with fewer dimensions get none
param_dict = {pn: p for pn, p in m.named_parameters() if p.requires_grad}
decay_params = [p for p in param_dict.values() if p.dim() >= 2]
nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
optim_groups = [
    {'params': decay_params, 'weight_decay': oparam.wd},
    {'params': nodecay_params, 'weight_decay': 0.0}
    ]

# instantiate optimizer
optimizer = optim.AdamW(
    optim_groups,
    lr=oparam.lr
    )

# instantiate learning rate schedule
lr_schedule = optim.lr_scheduler.ExponentialLR(
    optimizer,
    gamma=oparam.gamma
    )

Number of params in Transformer model: 58275


In [6]:
# Per-step histories; the periodic report averages the last n_eval entries.
train_losses, val_losses, val_acc_scores = [], [], []
print(f'training for {tparam.max_steps} steps, evaluating every {tparam.n_eval} steps.')
# len(xs['train']) counts rows (examples), not batches. Double quotes outside
# keep this line valid on Python versions before 3.12.
print(f"number of examples in training set: {len(xs['train'])}")
for i in tqdm(range(tparam.max_steps)):
    # one optimisation step on a random training batch
    m.train()
    x, y = get_batch('train')
    logits, loss = m(x, y)
    optimizer.zero_grad()
    loss.backward()
    train_losses.append(loss.item())
    optimizer.step()

    # score one random validation batch without tracking gradients
    m.eval()
    x, y = get_batch('val')
    with torch.no_grad():
        logits, loss = m(x, y)
        val_losses.append(loss.item())
        # y is one-hot, so comparing argmax positions gives the share of
        # correctly predicted rows without fixing the number of classes
        score = (logits.argmax(dim=1) == y.argmax(dim=1)).float().mean().item()
        val_acc_scores.append(score)
    if i%tparam.n_eval==0:
        tqdm.write(f"step {i+1}: train loss {np.mean(train_losses[-tparam.n_eval:]):.5f}, val loss {np.mean(val_losses[-tparam.n_eval:]):.5f}, val acc {np.mean(val_acc_scores[-tparam.n_eval:]):.5f}, current lr {lr_schedule.get_last_lr()[0]:.7f}")
    lr_schedule.step()

training for 1001 steps, evaluating every 100 steps.
number of batches in training set: 67831


  0%|          | 0/1001 [00:00<?, ?it/s]

step 1: train loss 0.69254, val loss 0.68637, val acc 0.46094, current lr 0.0003000
step 101: train loss 0.39553, val loss 0.38450, val acc 0.75137, current lr 0.0002970
step 201: train loss 0.30501, val loss 0.30076, val acc 0.81320, current lr 0.0002941


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f58a6900>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f1653d90>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    r

step 301: train loss 0.29608, val loss 0.29272, val acc 0.81637, current lr 0.0002911


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x1104cd610>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f5506e50>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    r

step 401: train loss 0.28358, val loss 0.28657, val acc 0.82203, current lr 0.0002882


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f36ab590>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f57672a0>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    r

step 501: train loss 0.28743, val loss 0.28289, val acc 0.82266, current lr 0.0002854


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f58c33f0>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f587f480>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    r

step 601: train loss 0.29271, val loss 0.29023, val acc 0.81426, current lr 0.0002825


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f36653d0>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f165e220>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    r

step 701: train loss 0.28856, val loss 0.28500, val acc 0.82215, current lr 0.0002797


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f5d28a30>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    retainedReferences = 1


step 801: train loss 0.28583, val loss 0.28458, val acc 0.82129, current lr 0.0002769


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f5d14a30>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3f5d50740>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    r

step 901: train loss 0.28253, val loss 0.28734, val acc 0.81988, current lr 0.0002742


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x324487980>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3244533a0>
    label = <none> 
    device = <AGXG13XDevice: 0x14e41ea00>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x166b56200>
        label = <none> 
        device = <AGXG13XDevice: 0x14e41ea00>
            name = Apple M1 Pro 
    r

step 1001: train loss 0.28412, val loss 0.28544, val acc 0.82121, current lr 0.0002714


In [None]:
# Same header-to-alias mapping as used for the training file
# ('c_' = categorical, 'n_' = numerical).
test_column_aliases = {
    'Marital status': 'c_marital_status',
    'Application mode': 'c_application_mode',
    'Application order': 'c_application_order',
    'Course': 'c_course',
    'Daytime/evening attendance': 'c_attendance',
    'Previous qualification': 'c_qualification',
    'Previous qualification (grade)': 'n_qualification',
    'Nacionality': 'c_nationality',
    "Mother's qualification": 'c_mqual',
    "Father's qualification": 'c_fqual',
    "Mother's occupation": 'c_mocup',
    "Father's occupation": 'c_focup',
    'Admission grade': 'n_grade',
    'Displaced': 'c_displaced',
    'Educational special needs': 'c_special_needs',
    'Debtor': 'c_debtor',
    'Tuition fees up to date': 'c_fees',
    'Gender': 'c_gender',
    'Scholarship holder': 'c_scholarship',
    'Age at enrollment': 'n_age',
    'International': 'c_international',
    'Curricular units 1st sem (credited)': 'n_cu1cr',
    'Curricular units 1st sem (enrolled)': 'n_cu1en',
    'Curricular units 1st sem (evaluations)': 'n_cu1ev',
    'Curricular units 1st sem (approved)': 'n_cu1ap',
    'Curricular units 1st sem (grade)': 'n_cu1gr',
    'Curricular units 1st sem (without evaluations)': 'n_cu1wo',
    'Curricular units 2nd sem (credited)': 'n_cu2cr',
    'Curricular units 2nd sem (enrolled)': 'n_cu2en',
    'Curricular units 2nd sem (evaluations)': 'n_cu2ev',
    'Curricular units 2nd sem (approved)': 'n_cu2ap',
    'Curricular units 2nd sem (grade)': 'n_cu2gr',
    'Curricular units 2nd sem (without evaluations)': 'n_cu2wo',
    'Unemployment rate': 'n_unemployment_rate',
    'Inflation rate': 'n_inflation_rate',
    'GDP': 'n_gdp',
}
input_test = pd.read_csv('./data/test.csv', index_col=0).rename(columns=test_column_aliases)


# one-hot-encode categorical features with the already-fitted encoder
encoded_categorical_test_data = ohe.transform(input_test[categorical_features])
encoded_categorical_test_df = pd.DataFrame(
    encoded_categorical_test_data,
    columns=encoded_categorical_feature_names,
)

# discretize numerical features with the already-fitted bin edges
discretized_numerical_test_data = disc.transform(input_test[numerical_features])
discretized_numerical_test_df = pd.DataFrame(
    discretized_numerical_test_data,
    columns=discretized_numerical_feature_names,
)

# merge categorical and numerical features column-wise, matching rows on the index
X_test = pd.merge(
    encoded_categorical_test_df,
    discretized_numerical_test_df,
    left_index=True,
    right_index=True,
)

n_test_rows, n_test_features = len(X_test), len(X_test.columns)
print(f'{n_test_rows} data points in test set')
print(f'{n_test_features} features')

In [None]:
# Predict the test set in batches and write the decoded labels to the submission file.
pred = []
m.eval()
with torch.no_grad():
    # Step through the rows by start offset. Unlike len//batch_size + 1 this
    # never yields an empty final batch when the row count is an exact
    # multiple of the batch size.
    for start in tqdm(range(0, len(X_test), tparam.batch_size)):
        chunk = X_test.iloc[start:start + tparam.batch_size]
        x = torch.tensor(chunk.values, dtype=torch.int32).to(device)
        logits, _ = m(x)
        # argmax of the logits equals argmax of their softmax; the one-hot width
        # follows the model output instead of a hard-coded class count
        pred += F.one_hot(torch.argmax(logits, dim=1), num_classes=logits.shape[1]).tolist()
# map the one-hot rows back to label values with the encoder fitted on the training targets
pd.DataFrame(target_ohe.inverse_transform(pred), index=input_test.index, columns=['Target']).to_csv('./data/submission.csv')