In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.regularizers import L1L2

from mappings import qualification_mapping, profession_mapping

In [2]:
# Load the labelled training rows from the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
def qualification_occupation_engineering(data: pd.DataFrame) -> pd.DataFrame:
    """Add grouped qualification and occupation columns to a student frame.

    Each raw code column is passed through ``qualification_mapping`` or
    ``profession_mapping`` (imported from ``mappings``) and the result is
    stored under a new ``*_categories`` column. The raw columns are kept.

    Args:
        data: Frame containing 'Previous qualification', "Mother's
            qualification", "Father's qualification", "Mother's occupation"
            and "Father's occupation".

    Returns:
        A new frame with the five derived columns added; ``data`` itself is
        left unmodified.

    Raises:
        KeyError: If one of the source columns is missing from ``data``.
    """
    # Copy first so calling this helper never alters the caller's frame as a
    # side effect.
    engineered = data.copy()

    # derived column -> (raw source column, lookup applied to it)
    derived_columns = {
        'qualification_categories': ('Previous qualification', qualification_mapping),
        'mother_qualification_categories': ("Mother's qualification", qualification_mapping),
        'father_qualification_categories': ("Father's qualification", qualification_mapping),
        'mother_profession_categories': ("Mother's occupation", profession_mapping),
        'father_profession_categories': ("Father's occupation", profession_mapping),
    }
    for new_column, (source_column, lookup) in derived_columns.items():
        # NOTE(review): if the lookups are plain dicts, codes absent from them
        # become NaN here -- confirm against mappings.py.
        engineered[new_column] = engineered[source_column].map(lookup)

    return engineered

In [4]:
def flip_application_order(data: pd.DataFrame) -> pd.DataFrame:
    """Reverse the 'Application order' scale by replacing each value v with 9 - v.

    Args:
        data: Frame containing a numeric 'Application order' column.

    Returns:
        A new frame with the flipped column in its original position;
        ``data`` itself is left unmodified, so calling the function twice on
        the same input always yields the same result.

    Raises:
        KeyError: If 'Application order' is missing from ``data``.
    """
    # assign() builds a new frame instead of writing into the caller's one.
    return data.assign(**{'Application order': 9 - data['Application order']})

In [5]:
# Feature groups consumed by the ColumnTransformer below. Columns that appear
# in none of these lists (and are not dropped) reach the model through the
# transformer's remainder='passthrough' setting.

# Nominal features -> one-hot encoded.
categorical_columns = [
    "Marital status",
    "Application mode",
    "Course",
    "qualification_categories",
    "mother_profession_categories",
    "father_profession_categories",
    "mother_qualification_categories",
    "father_qualification_categories",
    'Daytime/evening attendance',
    'Displaced',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
]

# Ordered feature -> ordinal encoded.
ordinal_columns = [
    "Application order",
]

# Continuous / count features -> standardised.
numeric_columns = [
    "Previous qualification (grade)",
    "Admission grade",
    "Age at enrollment",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

# Removed before modelling: the label, the row id, unused raw columns and the
# raw code columns that the *_categories features are derived from.
drop_columns = [
    'Target',
    'id',
    'Nacionality',
    'International',
    'Educational special needs',
    "Previous qualification",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

In [6]:
# Column-wise preprocessing:
#   * categorical_columns -> one-hot indicators
#   * ordinal_columns     -> ordinal integer codes
#   * numeric_columns     -> standardised with StandardScaler
# Columns not named in any of the three lists pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        # Categories not seen during fit are encoded as an all-zero row.
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        # Values not seen during fit are encoded as -1 instead of raising at
        # transform time, so this step is as tolerant as the one-hot step.
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_columns),
        ('scaler', StandardScaler(), numeric_columns),
    ],
    remainder='passthrough',
)

In [7]:
# Replace the string class labels with integer codes. The fitted encoder is
# kept around because a later cell reads its classes_ to show the mapping.
label_encoder = LabelEncoder()
label_encoder.fit(df['Target'])
df['Target'] = label_encoder.transform(df['Target'])

In [9]:
# Feature engineering on the training frame: flip the application-order scale
# first, then add the grouped qualification / occupation columns.
df = df.pipe(flip_application_order).pipe(qualification_occupation_engineering)

In [10]:
# Separate the encoded label from the model inputs.
y_data = df['Target']
X_data = df.drop(columns=drop_columns)

In [11]:
# Hold out 20% of the rows with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing on the training part only, then apply the fitted
# transformer to the held-out part.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [12]:
# Sanity check: feature matrix and label vector must have the same row count.
for split_part in (X_train, y_train):
    print(split_part.shape)

(61214, 132)
(61214,)


In [13]:
# Early stopping on the validation loss: stop once it has failed to improve by
# at least 0.001 for 10 consecutive epochs and roll back to the best epoch.
# Loss is a quantity to minimise, so the callback must run in 'min' mode; with
# mode='max' a rising loss counts as progress and restore_best_weights brings
# back the highest-loss epoch.
callback = EarlyStopping(monitor='val_loss', mode='min', min_delta=0.001, patience=10, restore_best_weights=True)

# ReLU baseline: 64 -> 32 -> 16 hidden units with light L1/L2 weight penalties,
# dropout after the first hidden layer, and a 3-unit softmax output.
tf_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu', kernel_regularizer=L1L2(l1=1e-8, l2=1e-7)),
    Dropout(0.2),
    Dense(32, activation='relu', kernel_regularizer=L1L2(l1=1e-8, l2=1e-7)),
    Dense(16, activation='relu', kernel_regularizer=L1L2(l1=1e-8, l2=1e-7)),
    Dense(3, activation='softmax'),
])

# Targets are integer class ids, hence the sparse categorical cross-entropy.
tf_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the val_* numbers are an optimistic estimate of unseen-data accuracy.
tf_model.fit(X_train, y_train, epochs=100, verbose=2, validation_data=(X_test, y_test), callbacks=[callback], batch_size=32)

Epoch 1/100
1913/1913 - 4s - 2ms/step - accuracy: 0.8019 - loss: 0.5067 - val_accuracy: 0.8243 - val_loss: 0.4604
Epoch 2/100
1913/1913 - 3s - 1ms/step - accuracy: 0.8208 - loss: 0.4661 - val_accuracy: 0.8272 - val_loss: 0.4493
Epoch 3/100
1913/1913 - 2s - 1ms/step - accuracy: 0.8215 - loss: 0.4600 - val_accuracy: 0.8289 - val_loss: 0.4509
Epoch 4/100
1913/1913 - 2s - 1ms/step - accuracy: 0.8238 - loss: 0.4556 - val_accuracy: 0.8263 - val_loss: 0.4479
Epoch 5/100
1913/1913 - 3s - 1ms/step - accuracy: 0.8236 - loss: 0.4519 - val_accuracy: 0.8262 - val_loss: 0.4519
Epoch 6/100
1913/1913 - 2s - 1ms/step - accuracy: 0.8260 - loss: 0.4495 - val_accuracy: 0.8296 - val_loss: 0.4453
Epoch 7/100
1913/1913 - 3s - 1ms/step - accuracy: 0.8269 - loss: 0.4479 - val_accuracy: 0.8235 - val_loss: 0.4501
Epoch 8/100
1913/1913 - 3s - 2ms/step - accuracy: 0.8284 - loss: 0.4458 - val_accuracy: 0.8298 - val_loss: 0.4442
Epoch 9/100
1913/1913 - 3s - 2ms/step - accuracy: 0.8287 - loss: 0.4432 - val_accuracy: 

<keras.src.callbacks.history.History at 0x1f7224696d0>

In [14]:
from xgboost import XGBClassifier

# Gradient-boosted tree baseline fitted on the same preprocessed split as the
# neural network and scored by plain accuracy on the held-out rows.
xgb_model = XGBClassifier(random_state=42, n_jobs=-1)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc}")

Accuracy: 0.8329848405645582


In [18]:
from tensorflow.keras.layers import LeakyReLU

# This run gets its own early-stopping callback instead of borrowing `callback`
# from the earlier ReLU cell, so its stopping rule is visible here and always
# minimises val_loss (mode='min'). restore_best_weights then rolls the model
# back to the lowest-loss epoch before it is saved and used for predictions.
leaky_callback = EarlyStopping(monitor='val_loss', mode='min', min_delta=0.001, patience=10, restore_best_weights=True)

# Wider variant of the dense network (128 -> 64 -> 32 -> 16) using LeakyReLU
# activations as separate layers, with a 3-unit softmax output.
leaky_tf_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128),
    LeakyReLU(negative_slope=0.1),
    Dense(64),
    LeakyReLU(negative_slope=0.1),
    Dropout(0.2),
    Dense(32),
    LeakyReLU(negative_slope=0.1),
    Dense(16),
    LeakyReLU(negative_slope=0.1),
    Dense(3, activation='softmax'),
])

# Targets are integer class ids, hence the sparse categorical cross-entropy.
leaky_tf_model.compile(loss=SparseCategoricalCrossentropy(), optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the val_* numbers are an optimistic estimate of unseen-data accuracy.
leaky_tf_model.fit(X_train, y_train, epochs=100, verbose=2, validation_data=(X_test, y_test), callbacks=[leaky_callback], batch_size=32)

Epoch 1/100
1913/1913 - 4s - 2ms/step - accuracy: 0.8108 - loss: 0.4888 - val_accuracy: 0.8275 - val_loss: 0.4534
Epoch 2/100
1913/1913 - 5s - 3ms/step - accuracy: 0.8236 - loss: 0.4609 - val_accuracy: 0.8274 - val_loss: 0.4567
Epoch 3/100
1913/1913 - 5s - 3ms/step - accuracy: 0.8236 - loss: 0.4551 - val_accuracy: 0.8285 - val_loss: 0.4494
Epoch 4/100
1913/1913 - 3s - 1ms/step - accuracy: 0.8265 - loss: 0.4499 - val_accuracy: 0.8295 - val_loss: 0.4454
Epoch 5/100
1913/1913 - 3s - 2ms/step - accuracy: 0.8278 - loss: 0.4462 - val_accuracy: 0.8297 - val_loss: 0.4475
Epoch 6/100
1913/1913 - 3s - 2ms/step - accuracy: 0.8284 - loss: 0.4431 - val_accuracy: 0.8287 - val_loss: 0.4452
Epoch 7/100
1913/1913 - 5s - 3ms/step - accuracy: 0.8293 - loss: 0.4399 - val_accuracy: 0.8266 - val_loss: 0.4499
Epoch 8/100
1913/1913 - 5s - 3ms/step - accuracy: 0.8307 - loss: 0.4365 - val_accuracy: 0.8270 - val_loss: 0.4440
Epoch 9/100
1913/1913 - 3s - 2ms/step - accuracy: 0.8316 - loss: 0.4330 - val_accuracy: 

<keras.src.callbacks.history.History at 0x1f7256943e0>

In [19]:
# Persist the trained LeakyReLU network to a .keras file in the working directory.
leaky_model_path = 'leaky_tf_128_8336.keras'
leaky_tf_model.save(leaky_model_path)

In [20]:
# Load the rows that predictions have to be submitted for.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [21]:
# Keep the ids for the submission files, then run the test rows through the
# same feature engineering and the already-fitted preprocessor.
# NOTE(review): `test` is rebound from the raw frame to the transformed
# matrix, so this cell cannot be re-run without reloading test.csv first.
ids = test['id']
test = test.pipe(flip_application_order).pipe(qualification_occupation_engineering)
test = preprocessor.transform(test)

In [22]:
# Softmax outputs of the LeakyReLU network: one row per test sample, one
# column per encoded class.
pred = leaky_tf_model.predict(x=test)

[1m1595/1595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


In [23]:
# Inspect the predicted probabilities; column i is the probability of the
# class that the label encoder mapped to integer i.
pred

array([[0.98737204, 0.0099334 , 0.00269461],
       [0.00742142, 0.0269845 , 0.96559405],
       [0.05165423, 0.23641491, 0.71193093],
       ...,
       [0.95373994, 0.02020093, 0.02605912],
       [0.7631964 , 0.2151306 , 0.02167292],
       [0.9357442 , 0.02914673, 0.03510908]], dtype=float32)

In [31]:
# Hard class predictions (encoded integers) from the XGBoost baseline.
xgb_pred = xgb_model.predict(X=test)

In [26]:
# Show which integer code the encoder assigned to each class label.
class_codes = label_encoder.transform(label_encoder.classes_)
mapping = {label: code for label, code in zip(label_encoder.classes_, class_codes)}
print(mapping)

{'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}


In [34]:
# Decode the integer predictions with the fitted label encoder rather than a
# hand-written {0: ..., 1: ..., 2: ...} table, so the submitted labels cannot
# drift out of sync with the encoding that was used for training.
submission_xgb = pd.DataFrame({"id": ids, "Target": label_encoder.inverse_transform(xgb_pred)})

# Quick look at the predicted class balance before writing the file.
print(submission_xgb['Target'].value_counts())

Target
Graduate    26393
Dropout     15336
Enrolled     9283
Name: count, dtype: int64


In [36]:
# Write the XGBoost submission without the DataFrame index column.
submission_xgb.to_csv(path_or_buf='submission_xgb.csv', index=False)

In [37]:
# Most probable class per row of the softmax output.
predicted_classes = np.argmax(pred, axis=1)

# Decode with the fitted label encoder rather than a hand-written table so the
# labels always match the encoding used for training. Wrapped in a Series with
# the default 0..n-1 index, as before.
predicted_labels = pd.Series(label_encoder.inverse_transform(predicted_classes))

In [40]:
# Pair every test id with the label predicted by the Keras model. Both inputs
# are Series, so pandas combines them by index rather than by position; this
# relies on `ids` and `predicted_labels` sharing the same default index.
submission_keras = pd.DataFrame({"id": ids, "Target": predicted_labels})
# Bare expression so the notebook renders the frame for a visual check.
submission_keras

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout


In [41]:
# Write the Keras submission without the DataFrame index column.
submission_keras.to_csv(path_or_buf='submission_keras.csv', index=False)