In [280]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import make_scorer, get_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
from scikeras.wrappers import KerasClassifier





In [281]:
df = pd.read_csv(r"..\oblig3\student_performance.csv", delimiter=";")


In [282]:
encoder = OneHotEncoder(sparse_output=False)

targets = encoder.fit_transform(df[['Target']])
features = pd.DataFrame(StandardScaler().fit_transform(df.drop('Target', axis=1)), columns=df.drop('Target', axis=1).columns)


In [283]:
df_targets = pd.DataFrame(targets, columns=encoder.get_feature_names_out(['Target']))

In [284]:
print(targets.shape)

(4424, 3)


In [285]:

from numpy import average


global_random_state = 15

scoring = {
    'accuracy': get_scorer('accuracy'),
    'precision': make_scorer(precision_score, average='macro', zero_division=1.0),
    'recall': make_scorer(recall_score, average='macro', zero_division=1.0),
    'f1': make_scorer(f1_score, average='macro'),
    'roc_auc': make_scorer(roc_auc_score, multi_class='ovr', average='macro', response_method='predict')   
}

def evaluate(estimator, X, y):
    scores = {}
    for (name,scorer) in scoring.items():
        scores[name] = scorer(estimator, X, y) 
    return scores

def train(features, targets, estimator, params, scoring=scoring, refit='f1', random_state=global_random_state, outer_splits=5, inner_splits=4):

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=global_random_state)
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=global_random_state)

    scores_train = []
    scores_test = []
    estimators = []
    cv_results = []

    # Loop through all test folds
    for (train_index, test_index) in outer_cv.split(features, targets):
        
        grid = GridSearchCV(
            estimator,
            params, 
            scoring=scoring, 
            refit=refit,
            error_score='raise', 
            cv=inner_cv)
        grid.fit(features.iloc[train_index], targets[train_index])        
        
        evaluation_train = evaluate(grid, features.iloc[train_index], targets[train_index])
        evaluation_test = evaluate(grid, features.iloc[test_index], targets[test_index])
        
        scores_train.append(evaluation_train)
        scores_test.append(evaluation_test)
        
        estimators.append(grid.best_estimator_)
        cv_results.append(pd.DataFrame(grid.cv_results_))
        print("*")

    return estimators, pd.DataFrame(scores_train), pd.DataFrame(scores_test), pd.concat(cv_results, names=['test_split'], keys=range(outer_splits))

def print_estimators(estimators):
    for estimatior in estimators:
        print(estimatior)

def create_model(optimizer="adam", kernel_initializer='glorot_uniform'):
    model = Sequential()
    model.add(Input(shape=(36,)))
    model.add(Dense(100, activation='relu', kernel_initializer=kernel_initializer))
    model.add(Dense(50, activation='relu', kernel_initializer=kernel_initializer))
    model.add(Dense(20, activation='relu', kernel_initializer=kernel_initializer))
    model.add(Dense(3, activation='softmax', kernel_initializer=kernel_initializer))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [286]:
param_grid = {
    #'batch_size': [10, 20, 40],
    #'epochs': [10, 20, 40],
    #'optimizer': ['SGD', 'Adam', 'RMSprop'],
    #'model__kernel_initializer': ['glorot_uniform', 'normal', 'uniform'],     
    'model__optimizer': ['adam', 'sgd']
    #'model__kernel_initializer': ['glorot_uniform', 'he_normal']
}

kearas_model = KerasClassifier(model=create_model, verbose=1)

In [287]:
kearas_estimators, kearas_scores_train, kearas_scores_test, kearas_cv_results = train(features, targets, kearas_model, param_grid)

[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6978 - loss: 0.7501
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6797 - loss: 0.7905
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6673 - loss: 0.8307  
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6667 - loss: 0.8013
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5629 - loss: 0.9937
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5674 - l

In [291]:
kearas_scores_test



Unnamed: 0,accuracy,precision,recall,f1,roc_auc
0,0.714124,0.616647,0.605863,0.603684,0.722351
1,0.751412,0.697612,0.650812,0.654811,0.753754
2,0.770621,0.695828,0.663537,0.672716,0.767223
3,0.758192,0.714996,0.627238,0.624395,0.741209
4,0.730769,0.668174,0.623109,0.595804,0.735007


In [288]:



grid = GridSearchCV(estimator=kearas_model, verbose=1, scoring=scoring, return_train_score=True, cv=5, param_grid=param_grid, refit='f1', error_score='raise')

In [289]:
grid_result = grid.fit(features, df_targets)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6982 - loss: 0.7332
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 764us/step
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6844 - loss: 0.7560
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 790us/step
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7030 - loss: 0.7413
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 799us/step
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6287 - loss: 0.8463
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [290]:
test = pd.DataFrame(grid_result.cv_results_)
evaluate(grid, features, targets)

[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 935us/step
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 744us/step
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 742us/step
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 758us/step
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 815us/step


{'accuracy': 0.7646925858951176,
 'precision': 0.7119948046370185,
 'recall': 0.6721228687872497,
 'f1': 0.6834109523379018,
 'roc_auc': 0.7692931968168505}