# 0. Initial Setup

In [13]:
import warnings

warnings.simplefilter('ignore')

In [14]:
import numpy as np
import pandas as pd
import math

from datetime import datetime

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Constants
TARGET_VAR = 'GradeClass'
NUMERIC_FEATURES = ['StudyTimeWeekly', 'Absences']

# 1. Helper Functions

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns

def check_corr(df):
  corr_matrix = df.corr()
  fig, ax = plt.subplots(figsize=(12, 9))
  ax = sns.heatmap(corr_matrix,
                  annot=True,
                  linewidths=0.2,
                  fmt=".2f",
                  cmap="YlGnBu");
  bottom, top = ax.get_ylim()
  ax.set_ylim(bottom + 0.5, top - 0.5)

In [17]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def get_cleaned_data():
    # https://www.kaggle.com/datasets/rabieelkharoua/students-performance-dataset
    df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/CS260D/Project/Student_performance_data.csv', sep=',', encoding='utf-8')

    df.drop(['StudentID', 'GPA'], axis=1, inplace=True)
    return df

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(model_name, clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print('\nTrain Result(', model_name,'):\n================================================')
        print(f'Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%')
        print('_______________________________________________')
        print(f'CLASSIFICATION REPORT:\n{clf_report}')
        print('_______________________________________________')
        print(f'Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n')
        print('================================================\n\n')

    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print('\nTest Result(', model_name,'):\n================================================')
        print(f'Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%')
        print('_______________________________________________')
        print(f'CLASSIFICATION REPORT:\n{clf_report}')
        print('_______________________________________________')
        print(f'Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n')
        print('================================================\n\n')

In [19]:
def get_model(model_name):
    match model_name:
        case 'Logistic Regression':
            return LogisticRegression()
        case 'SVM':
            return SVC()
        case 'Random Forest':
            return RandomForestClassifier()
        case _:
            return None

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

def train_model(df, data_selection):
    print(TARGET_VAR, ',', data_selection)
    X = df.drop(TARGET_VAR, axis=1)
    y = df[TARGET_VAR]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model_list = {'Logistic Regression': None, 'SVM': None, 'Random Forest': None}
    model_names = ['Logistic Regression', 'SVM', 'Random Forest']
    log_reg_param_grid = {
        'penalty':['l1', 'l2'],
        # 'C' : [0.1, 1],
        'C' : [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'newton-cg', 'liblinear'],
        # 'max_iter'  : [50, 100]
        'max_iter'  : [100, 200, 300, 500, 1000]
    }

    svm_param_grid = {'C': [0.01, 0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001]
    }

    rf_param_grid = {
        # 'n_estimators': [50, 100],
        'n_estimators': [100, 200, 300, 500, 1000],
        'max_features': ['sqrt', 'log2'],
        # 'max_depth' : [5, 6]
        'max_depth' : [5, 6, 7, 8]
    }
    params_grid = {'Logistic Regression': log_reg_param_grid, 'SVM': svm_param_grid, 'Random Forest': rf_param_grid}

    df_results = pd.DataFrame(columns=['Model', 'Data Selection', 'Params', 'Training Accuracy %', 'Training Data Size', 'Testing Accuracy %', 'Testing Data Size'])
    for model_name in model_names:
        clf = get_model(model_name)
        gs_cv = GridSearchCV(estimator=clf, param_grid=params_grid[model_name], cv= 5)
        gs_cv.fit(X_train, y_train)

        # print(gs_cv.best_estimator_)
        # print(gs_cv.best_params_)
        # print(gs_cv.best_score_)

        # print_score(model_name, gs_cv.best_estimator_, X_train, y_train, X_test, y_test, train=True)
        # print_score(model_name, gs_cv.best_estimator_, X_train, y_train, X_test, y_test, train=False)

        test_score = accuracy_score(y_test, gs_cv.best_estimator_.predict(X_test)) * 100
        train_score = accuracy_score(y_train, gs_cv.best_estimator_.predict(X_train)) * 100

        model_list[model_name] = gs_cv.best_estimator_
        df_results = pd.concat([df_results, pd.DataFrame(data=[[model_name, data_selection, gs_cv.best_params_, train_score, X_train.shape[0], test_score, X_test.shape[0]]], columns=df_results.columns)], ignore_index=True)

    print(data_selection, ':')
    print('Ran', df_results.shape[0], 'models')
    # display(df_results)

    return model_list, df_results

# 2. Train without Data Selection

In [21]:
def train_wo_data_selection(df):
    model_list, df_results = train_model(df, 'W/O Data Selection')
    return df_results

# 3. Train with Random Data Selection

In [22]:
def train_with_rand_data_selection(df_results, df):
    df_random20 = pd.DataFrame(columns=df.columns)
    df_random50 = pd.DataFrame(columns=df.columns)
    df_random80 = pd.DataFrame(columns=df.columns)

    for i in df[TARGET_VAR].unique():
        df_tmp = df[df[TARGET_VAR]==i]
        df_tmp20 = df_tmp.sample(n=int(df_tmp.shape[0]*0.2))
        df_tmp50 = df_tmp.sample(n=int(df_tmp.shape[0]*0.5))
        df_tmp80 = df_tmp.sample(n=int(df_tmp.shape[0]*0.8))
        df_random20 = pd.concat([df_random20, df_tmp20], ignore_index=True)
        df_random50 = pd.concat([df_random50, df_tmp50], ignore_index=True)
        df_random80 = pd.concat([df_random80, df_tmp80], ignore_index=True)

    model_list, df_results_tmp = train_model(df_random20, 'Select Random 20%')
    df_results = pd.concat([df_results, df_results_tmp], ignore_index=True)

    model_list, df_results_tmp = train_model(df_random50, 'Select Random 50%')
    df_results = pd.concat([df_results, df_results_tmp], ignore_index=True)

    model_list, df_results_tmp = train_model(df_random80, 'Select, Random 80%')
    df_results = pd.concat([df_results, df_results_tmp], ignore_index=True)

    return df_results

# 4. Train with Data Selection - High vs Low Feature Variance Average (Numeric or not)

In [23]:
from scipy.stats import zscore

def train_with_data_selection_by_zscore(df_results, df):
    df_zscore = df.apply(zscore).apply(abs)
    series_row_avg = df[NUMERIC_FEATURES].mean(axis=1)
    df_row_avg = pd.DataFrame({'Avg': series_row_avg.values})
    df = df.join(df_row_avg)

    df_dict = {}
    df_low_zscore20 = df[df['Avg'] <= df['Avg'].quantile(0.2)]
    df_low_zscore20.drop(['Avg'], axis=1, inplace=True)
    df_dict['Keep '+'Low Z-Score 20%'] = df_low_zscore20
    df_low_zscore50 = df[df['Avg'] <= df['Avg'].quantile(0.5)]
    df_low_zscore50.drop(['Avg'], axis=1, inplace=True)
    df_dict['Keep '+'Low Z-Score 50%'] = df_low_zscore50
    df_low_zscore80 = df[df['Avg'] <= df['Avg'].quantile(0.8)]
    df_low_zscore80.drop(['Avg'], axis=1, inplace=True)
    df_dict['Keep '+'Low Z-Score 80%'] = df_low_zscore80

    df_high_zscore20 = df[df['Avg'] > df['Avg'].quantile(0.8)]
    df_high_zscore20.drop(['Avg'], axis=1, inplace=True)
    df_dict['Keep '+'High Z-Score 20%'] = df_high_zscore20
    df_high_zscore50 = df[df['Avg'] > df['Avg'].quantile(0.5)]
    df_high_zscore50.drop(['Avg'], axis=1, inplace=True)
    df_dict['Keep '+'High Z-Score 50%'] = df_high_zscore50
    df_high_zscore80 = df[df['Avg'] > df['Avg'].quantile(0.2)]
    df_high_zscore80.drop(['Avg'], axis=1, inplace=True)
    df_dict['Keep '+'High Z-Score 80%'] = df_high_zscore80

    for k, v in df_dict.items():
        model_list, df_results_tmp = train_model(v, k)
        df_results = pd.concat([df_results, df_results_tmp], ignore_index=True)
    return df_results

# 5. Run For Stduent Performance Dataset

In [24]:
df = get_cleaned_data()

df_results = train_wo_data_selection(df)
df_results = train_with_rand_data_selection(df_results, df)
df_results = train_with_data_selection_by_zscore(df_results, df)

GradeClass , W/O Data Selection
W/O Data Selection :
Ran 3 models
GradeClass , Select Random 20%
Select Random 20% :
Ran 3 models
GradeClass , Select Random 50%
Select Random 50% :
Ran 3 models
GradeClass , Select, Random 80%
Select, Random 80% :
Ran 3 models
GradeClass , Keep Low Z-Score 20%
Keep Low Z-Score 20% :
Ran 3 models
GradeClass , Keep Low Z-Score 50%
Keep Low Z-Score 50% :
Ran 3 models
GradeClass , Keep Low Z-Score 80%
Keep Low Z-Score 80% :
Ran 3 models
GradeClass , Keep High Z-Score 20%
Keep High Z-Score 20% :
Ran 3 models
GradeClass , Keep High Z-Score 50%
Keep High Z-Score 50% :
Ran 3 models
GradeClass , Keep High Z-Score 80%
Keep High Z-Score 80% :
Ran 3 models


In [25]:
df_results

Unnamed: 0,Model,Data Selection,Params,Training Accuracy %,Training Data Size,Testing Accuracy %,Testing Data Size
0,Logistic Regression,W/O Data Selection,"{'C': 100, 'max_iter': 1000, 'penalty': 'l2', ...",73.894863,1674,68.941504,718
1,SVM,W/O Data Selection,"{'C': 100, 'gamma': 0.001}",78.136201,1674,74.373259,718
2,Random Forest,W/O Data Selection,"{'max_depth': 8, 'max_features': 'sqrt', 'n_es...",89.307049,1674,69.359331,718
3,Logistic Regression,Select Random 20%,"{'C': 0.1, 'max_iter': 100, 'penalty': 'l1', '...",70.870871,333,58.741259,143
4,SVM,Select Random 20%,"{'C': 100, 'gamma': 0.001}",83.783784,333,69.93007,143
5,Random Forest,Select Random 20%,"{'max_depth': 8, 'max_features': 'log2', 'n_es...",98.198198,333,62.237762,143
6,Logistic Regression,Select Random 50%,"{'C': 1, 'max_iter': 300, 'penalty': 'l2', 'so...",73.652695,835,72.423398,359
7,SVM,Select Random 50%,"{'C': 100, 'gamma': 0.001}",78.562874,835,72.423398,359
8,Random Forest,Select Random 50%,"{'max_depth': 8, 'max_features': 'log2', 'n_es...",93.293413,835,67.966574,359
9,Logistic Regression,"Select, Random 80%","{'C': 10, 'max_iter': 200, 'penalty': 'l2', 's...",73.223635,1337,72.299652,574


In [26]:
df_results.to_csv('/content/drive/My Drive/Colab Notebooks/CS260D/Project/Student Performance Result.csv', encoding='utf-8')