# Imports

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [4]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Data Cleaning and Pre-processing

In [6]:
df = pd.read_csv('data/df_exam.csv')
df.head(10)

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,1,1,3,1,0,0,1,1,4.0,0,0,87,93,91
1,0,2,1,1,0,3,1,1,0.0,0,1,76,78,75
2,1,1,1,1,0,3,2,1,1.0,0,1,73,84,79
3,1,1,1,1,1,2,0,0,1.0,1,1,85,93,89
4,0,1,1,0,0,3,1,1,1.0,1,2,41,43,39
5,0,3,0,0,1,0,1,0,3.0,1,2,65,64,68
6,0,3,1,1,0,1,1,1,1.0,0,1,40,52,43
7,1,1,0,1,0,3,2,0,1.0,1,1,66,82,74
8,0,0,1,1,1,0,1,1,1.0,1,2,80,73,71
9,1,0,3,1,0,1,1,1,2.0,1,0,48,53,58


In [7]:
x = df.drop('MathScore', axis=1)
y = df['MathScore'].apply(lambda x: 1 if x >= 52 else 0)

X_train, X_temp, y_train, y_temp = train_test_split(x, y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Training data shape:", X_train.shape, y_train.shape)
print("Validation data shape:", X_val.shape, y_val.shape)
print("Test data shape:", X_test.shape, y_test.shape)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

Training data shape: (11545, 13) (11545,)
Validation data shape: (3849, 13) (3849,)
Test data shape: (3849, 13) (3849,)


# Models


## ML Models

In [13]:
classification_models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB(),
}

classification_params = {
    'Decision Tree': {'max_depth': [3,]},
    'Random Forest': {'n_estimators': [50], 'max_depth': [3]},
    'Gradient Boosting': {'n_estimators': [50], 'learning_rate': [0.01]},
    'Bagging': {'n_estimators': [50]},
    'K-Nearest Neighbors': {'n_neighbors': [5]},
    'Support Vector Machine': {'C': [0.1], 'kernel': ['linear']},
}

classification_results = []

for model_name in classification_models:
    model = classification_models[model_name]

    if model_name in classification_params:
        grid_search = GridSearchCV(model, classification_params[model_name], cv=5)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        training_time = grid_search.refit_time_
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        classification_results.append([model_name, best_params, training_time, accuracy])


results_df = pd.DataFrame(classification_results, columns=['Model', 'Best Parameters', 'Training Time (s)', 'Accuracy'])

results_df

Unnamed: 0,Model,Best Parameters,Training Time (s),Accuracy
0,Decision Tree,{'max_depth': 3},0.006995,0.903092
1,Random Forest,"{'max_depth': 3, 'n_estimators': 50}",0.08763,0.906469
2,Gradient Boosting,"{'learning_rate': 0.01, 'n_estimators': 50}",0.270572,0.830346
3,Bagging,{'n_estimators': 50},0.469294,0.924396
4,K-Nearest Neighbors,{'n_neighbors': 5},0.003075,0.889062
5,Support Vector Machine,"{'C': 0.1, 'kernel': 'linear'}",0.397708,0.931151
