In [2]:
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load data
# Both preprocessed CSVs are read from the same directory. Keeping that
# location in one place means only DATA_DIR has to be edited when the
# notebook is run from another machine or folder.
DATA_DIR = Path(r'C:\Users\Local User\Documents\MITx\Machine Learning\Kaggle\Classification with an Academic Success Dataset\used final')

train_data = pd.read_csv(DATA_DIR / 'train_data_preprocess.csv')
test_data = pd.read_csv(DATA_DIR / 'test_data_preprocess.csv')


In [4]:
# Preview the first five rows to sanity-check the loaded columns and the
# still-unencoded 'Target' labels.
train_data.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [17]:
# Encode the string class labels in 'Target' as integers.
# The encoder is fitted once on the explicit list of classes; LabelEncoder
# stores its classes sorted, so the mapping is Dropout=0, Enrolled=1,
# Graduate=2.
label_encoder = LabelEncoder()
label_encoder.fit(['Graduate', 'Enrolled', 'Dropout'])

# Use transform() instead of fit_transform(): re-fitting would throw away the
# explicit fit above. The dtype guard keeps the cell safe to re-run -- once
# 'Target' is already numeric it is left alone, so label_encoder.classes_
# keeps the original string labels for a later inverse_transform().
if not pd.api.types.is_numeric_dtype(train_data['Target']):
    train_data['Target'] = label_encoder.transform(train_data['Target'])

In [6]:
# Define the objective function for Optuna
def objective(trial):
    """Score one Optuna trial by cross-validated accuracy of an XGBoost model.

    Parameters
    ----------
    trial
        Optuna trial object used to sample the tunable hyperparameters.

    Returns
    -------
    float
        Mean accuracy over 3 cross-validation folds, computed on the
        module-level ``train_data`` frame (every column except 'Target' is
        used as a feature, 'Target' is the label).
    """
    # Hyperparameters explored by the study; suggestions are requested in the
    # same order as before.
    searched = {
        'gamma': trial.suggest_float('gamma', 0, 1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # Settings held constant for every trial.
    constant = {
        'grow_policy': 'depthwise',
        'tree_method': 'hist',
        'enable_categorical': True,
        'random_state': 42,
    }

    classifier = XGBClassifier(**constant, **searched)

    target = train_data['Target']
    features = train_data.drop(columns='Target')

    # Mean of the per-fold accuracies is the value the study maximizes.
    fold_accuracy = cross_val_score(classifier, features, target, cv=3, scoring='accuracy')
    return fold_accuracy.mean()

In [7]:
# Create a study object and optimize
# Tunable knobs for the search are named here instead of being buried in the
# calls below.
SAMPLER_SEED = 42  # seed passed to the TPE sampler
N_TRIALS = 5       # increase to 50

sampler = TPESampler(seed=SAMPLER_SEED)
study = optuna.create_study(direction='maximize', sampler=sampler)  # maximize CV accuracy
study.optimize(objective, n_trials=N_TRIALS)

[I 2024-06-27 06:03:17,510] A new study created in memory with name: no-name-3561804a-5574-4d98-b1a0-b6788d9bae06
[I 2024-06-27 06:04:22,809] Trial 0 finished with value: 0.8202906505658799 and parameters: {'gamma': 0.3745401188473625, 'n_estimators': 956, 'learning_rate': 0.22227824312530747, 'max_depth': 7, 'reg_lambda': 16.445845403801215, 'min_child_weight': 1.6443457513284063, 'subsample': 0.5290418060840998, 'colsample_bytree': 0.9330880728874675}. Best is trial 0 with value: 0.8202906505658799.
[I 2024-06-27 06:05:26,544] Trial 1 finished with value: 0.8313338038108681 and parameters: {'gamma': 0.6011150117432088, 'n_estimators': 737, 'learning_rate': 0.01596950334578271, 'max_depth': 10, 'reg_lambda': 83.41182143924175, 'min_child_weight': 2.202157195714934, 'subsample': 0.5909124836035503, 'colsample_bytree': 0.5917022549267169}. Best is trial 1 with value: 0.8313338038108681.
[I 2024-06-27 06:06:00,651] Trial 2 finished with value: 0.8309809456598448 and parameters: {'gamma':

In [8]:
# Fetch the best trial and print its tuned parameters as a dict-style listing
# that can be pasted back into code.
best_trial = study.best_trial

param_lines = [f"    '{key}': {value}," for key, value in best_trial.params.items()]
for line in ["Best trial params:", "{", *param_lines, "}"]:
    print(line)

Best trial params:
{
    'gamma': 0.45606998421703593,
    'n_estimators': 807,
    'learning_rate': 0.06790539682592432,
    'max_depth': 7,
    'reg_lambda': 59.64904231734221,
    'min_child_weight': 0.5598590859279775,
    'subsample': 0.8037724259507192,
    'colsample_bytree': 0.5852620618436457,
}


In [9]:
# Train the final model with the best parameters
# Build a fresh dict rather than adding keys to best_trial.params itself, so
# the trial object keeps only the parameters that were actually tuned.
# The fixed settings mirror the ones used inside the objective function.
best_params = {
    **best_trial.params,
    'grow_policy': 'depthwise',
    'tree_method': 'hist',
    'enable_categorical': True,
    'random_state': 42,
}

best_model = XGBClassifier(**best_params)

# Fit on the full training set: every column except 'Target' is a feature.
X = train_data.drop('Target', axis=1)
y = train_data['Target']

best_model.fit(X, y)

In [10]:
# Save the 'id' column, then drop it so only feature columns reach the model.
# Guarded so the cell can be re-run: after the first run 'id' is already gone
# from test_data, and an unguarded test_data['id'] would raise KeyError.
if 'id' in test_data.columns:
    test_id = test_data['id'].copy()
    test_data = test_data.drop(columns=['id'])

# Predict using the trained model.
# predict_proba returns a 2-D array with one probability column per class;
# switch to best_model.predict(...) if hard class labels are needed instead.
predictions = best_model.predict_proba(test_data)

# Column index 1 is the SECOND column, i.e. the probability of encoded class 1.
# NOTE(review): 'Target' has three classes, so there is no single "positive"
# class; with LabelEncoder's sorted ordering class 1 would be 'Enrolled' --
# confirm this is the column downstream code expects.
# If predict() is used above, the result is 1-D and this column selection
# must be removed.
positive_class_probabilities = predictions[:, 1]
