In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

In [26]:
# Step 1: Load the data
data = pd.read_csv('./data/train.csv')

data.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [27]:
# Step 2: Handle missing values and scale numerical features
numerical_features = [
    'Marital status', 'Application mode', 'Application order', 'Course', 
    'Daytime/evening attendance', 'Previous qualification', 'Previous qualification (grade)', 
    'Nacionality', "Mother's qualification", "Father's qualification", 
    "Mother's occupation", "Father's occupation", 'Admission grade', 
    'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 
    'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 
    'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 
    'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 
    'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 
    'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 
    'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 
    'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 
    'Unemployment rate', 'Inflation rate', 'GDP'
]

# Step 3: Encode the target labels
label_encoder = LabelEncoder()
data['Target'] = label_encoder.fit_transform(data['Target'])

# Step 3: Split the data into features and target
X = data[numerical_features]
y = data['Target']

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Step 5: Preprocess the data
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [29]:
# Step 6: Train a robust classifier (XGBoost)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

In [30]:
# Optional: Hyperparameter tuning using GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3]
}

In [31]:
grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [32]:
# Step 7: Evaluate the model
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      0.83      0.87      5028
           1       0.66      0.61      0.64      3017
           2       0.86      0.92      0.89      7259

    accuracy                           0.83     15304
   macro avg       0.81      0.79      0.80     15304
weighted avg       0.83      0.83      0.83     15304



In [33]:
# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.83


In [49]:
# Step 9: Load the test data
test_data = pd.read_csv('./data/test.csv')

# Step 10: Preprocess the test data
test_final = test_data[numerical_features]

# Step 11: Make predictions on the test data
test_predictions = best_model.predict(test_final)

# Step 12: Convert numerical predictions back to original labels
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

# Step 13: Create the submission DataFrame
submission = pd.DataFrame({
    'id': test_data['id'],
    'Target': test_predictions_labels
})

# Step 14: Save the submission DataFrame to a CSV file
submission.to_csv('./data/submission.csv', index=False)

In [50]:
sample = pd.read_csv('./data/sample_submission.csv')
sample.head()

Unnamed: 0,id,Target
0,76518,Graduate
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Graduate


In [51]:
sub = pd.read_csv('./data/submission.csv')
sub.head()

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
