# Deep learning practice on Student performance dataset (Multi-class)

Retrieved from https://www.kaggle.com/datasets/rabieelkharoua/students-performance-dataset/data

## Importing libraries

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid
from sklearn.model_selection import train_test_split

In [5]:
student_perf = pd.read_csv('Student_performance_data.csv')
student_perf.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [6]:
student_perf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


In [7]:
# Drop ID column
student_perf.drop(['StudentID'], axis=1, inplace=True)
student_perf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                2392 non-null   int64  
 1   Gender             2392 non-null   int64  
 2   Ethnicity          2392 non-null   int64  
 3   ParentalEducation  2392 non-null   int64  
 4   StudyTimeWeekly    2392 non-null   float64
 5   Absences           2392 non-null   int64  
 6   Tutoring           2392 non-null   int64  
 7   ParentalSupport    2392 non-null   int64  
 8   Extracurricular    2392 non-null   int64  
 9   Sports             2392 non-null   int64  
 10  Music              2392 non-null   int64  
 11  Volunteering       2392 non-null   int64  
 12  GPA                2392 non-null   float64
 13  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(11)
memory usage: 261.8 KB


In [8]:
# Convert GradeClass from float to object/category
student_perf['GradeClass'] = student_perf['GradeClass'].astype('object')
student_perf['GradeClass'].unique()

array([2.0, 1.0, 4.0, 3.0, 0.0], dtype=object)

In [9]:
# Assign GradeClass to y, the dependent variable
Y = np.asarray(student_perf['GradeClass']).astype('float32').reshape(-1,1)
print(f'First 5 elements of y are {Y[:5]} \nShape of y is {Y.shape}')

First 5 elements of y are [[2.]
 [1.]
 [4.]
 [3.]
 [4.]] 
Shape of y is (2392, 1)


In [10]:
# Assign other variables to matrix X as features, excluding StudentID and GPA
features = ['Age','Gender','Ethnicity','ParentalEducation','StudyTimeWeekly','Absences','Tutoring','ParentalSupport','Extracurricular','Sports','Music','Volunteering']
X = np.asarray(student_perf[features]).astype('float32')
print(f'First element of X is {X[0]} \nShape of X is {X.shape}')

First element of X is [17.        1.        0.        2.       19.833723  7.        1.
  2.        0.        0.        1.        0.      ] 
Shape of X is (2392, 12)


In [11]:
# Split dataset into training, testing and cross-validation set
X_train, X_, Y_train, Y_ = train_test_split(X, Y, test_size=0.2)
X_cv, X_test, Y_cv, Y_test = train_test_split(X_, Y_, test_size=0.5)
print(f'The training set has {Y_train.shape[0]} entries while the testing set and cross-validation set has {Y_test.shape[0]} entries.')

The training set has 1913 entries while the testing set and cross-validation set has 240 entries.


## Developing multi-class neural network

### Simple neural network

In [14]:
# Function to calculate accuracy on training and cross-validation sets
def train_and_cv_accuracy(model, X_train, Y_train, X_cv, Y_cv):
    prediction_array = model.predict(X_train)
    prediction_probs = tf.nn.softmax(prediction_array)
    prediction_classes = []
    for test in prediction_probs:
        prediction_classes.append(np.argmax(test))
    Yhat = np.asarray(prediction_classes).astype('float32').reshape(-1,1)
    train_acc = np.mean(Yhat == Y_train)
    
    prediction_array = model.predict(X_cv)
    prediction_probs = tf.nn.softmax(prediction_array)
    prediction_classes = []
    for test in prediction_probs:
        prediction_classes.append(np.argmax(test))
    Yhat = np.asarray(prediction_classes).astype('float32').reshape(-1,1)
    cv_acc = np.mean(Yhat == Y_cv)

    return pd.DataFrame({'Training accuracy':[train_acc], 'Cross-validation accuracy':[cv_acc]})

In [15]:
# Simple neural network with 2 layers
simple_mc_nn = Sequential(
    [
    tf.keras.Input(shape = (12,)),
    Dense(units=120, activation='relu', name='L1'),
    Dense(units=5, activation='linear', name='L2')
    ], name='SimpleNN'
)
simple_mc_nn.summary()

In [16]:
# Checking dimensions of parameters w and b in each layer
layer1, layer2 = simple_mc_nn.layers
W1, b1 = layer1.get_weights()
W2, b2 = layer2.get_weights()
print(f'W1 shape is {W1.shape}, b1 shape is {b1.shape}')
print(f'W2 shape is {W2.shape}, b2 shape is {b2.shape}')

W1 shape is (12, 120), b1 shape is (120,)
W2 shape is (120, 5), b2 shape is (5,)


In [17]:
# Specify loss and fit model to training set
simple_mc_nn.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
)

simple_mc_nn.fit(X_train, Y_train, epochs=500, verbose=0)

<keras.src.callbacks.history.History at 0x161ca4510>

In [18]:
# Make predictions on training, cv and testing set
accs = train_and_cv_accuracy(simple_mc_nn, X_train, Y_train, X_cv, Y_cv)
print(f'Simple neural network \nTraining accuracy: {accs["Training accuracy"][0]:0.3f}, Cross-validation accuracy: {accs["Cross-validation accuracy"][0]:0.3f}')

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472us/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 444us/step
Simple neural network 
Training accuracy: 0.841, Cross-validation accuracy: 0.720


In [19]:
# Add some regularization effect to reduce over-fitting
simple_mc_nn_R = Sequential(
    [
    tf.keras.Input(shape = (12,)),
    Dense(units=120, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.1), name='L1'),
    Dense(units=5, activation='linear', name='L2')
    ], name='RegularizedSimpleNN'
)

simple_mc_nn_R.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
)

simple_mc_nn_R.fit(X_train, Y_train, epochs=500, verbose=0)

<keras.src.callbacks.history.History at 0x16444f090>

In [20]:
accs = train_and_cv_accuracy(simple_mc_nn_R, X_train, Y_train, X_cv, Y_cv)
print(f'Regularized Simple neural network \nTraining accuracy: {accs["Training accuracy"][0]:0.3f}, Cross-validation accuracy: {accs["Cross-validation accuracy"][0]:0.3f}')

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 532us/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 414us/step
Regularized Simple neural network 
Training accuracy: 0.749, Cross-validation accuracy: 0.757


In [21]:
# Iterate to explore different regularization values
lambdas = [0.001, 0.01, 0.05]
models=[None] * len(lambdas)
best_lambda = 0
best_cv_acc = 0

for i in range(len(lambdas)):
    lambda_ = lambdas[i]
    models[i] =  Sequential(
        [
            Dense(120, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(lambda_)),
            Dense(5, activation = 'linear')
        ]
    )
    
    models[i].compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(0.01),
    )

    models[i].fit(X_train, Y_train, epochs=500, verbose=0)
    accs = train_and_cv_accuracy(models[i], X_train, Y_train, X_cv, Y_cv)
    if accs['Cross-validation accuracy'][0] > best_cv_acc:
        best_lambda = lambda_
        best_cv_acc = np.round(accs['Cross-validation accuracy'][0],3)

print(f'Best regularization lambda is {best_lambda} with cross-validation accuracy of {best_cv_acc}')

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490us/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 618us/step
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 445us/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 412us/step
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 438us/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 387us/step
Best regularization lambda is 0.001 with cross-validation accuracy of 0.774


# Deep learning practice on Alzheimers Diseases dataset (Binary)

Retrieved from https://www.kaggle.com/datasets/rabieelkharoua/alzheimers-disease-dataset?select=alzheimers_disease_data.csv

In [23]:
alz_d = pd.read_csv('alzheimers_disease_data.csv')
alz_d.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [24]:
alz_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [25]:
alz_d.drop(['PatientID','DoctorInCharge'], axis=1, inplace=True)
alz_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        2149 non-null   int64  
 1   Gender                     2149 non-null   int64  
 2   Ethnicity                  2149 non-null   int64  
 3   EducationLevel             2149 non-null   int64  
 4   BMI                        2149 non-null   float64
 5   Smoking                    2149 non-null   int64  
 6   AlcoholConsumption         2149 non-null   float64
 7   PhysicalActivity           2149 non-null   float64
 8   DietQuality                2149 non-null   float64
 9   SleepQuality               2149 non-null   float64
 10  FamilyHistoryAlzheimers    2149 non-null   int64  
 11  CardiovascularDisease      2149 non-null   int64  
 12  Diabetes                   2149 non-null   int64  
 13  Depression                 2149 non-null   int64

In [26]:
# Function for Z normalization for continuous variables
def z_norms(var_name):
    values = np.asarray(alz_d[var_name])
    mu, sigma = np.mean(values), np.std(values)
    values = (values - mu) / sigma
    alz_d[var_name] = values
    return None

In [27]:
# Normalization for continuous variables
cont_vars = ['Age','BMI','AlcoholConsumption','PhysicalActivity','DietQuality','SleepQuality','SystolicBP','DiastolicBP',
             'CholesterolTotal','CholesterolLDL','CholesterolHDL','CholesterolTriglycerides','MMSE','FunctionalAssessment','ADL']
for var in cont_vars:
    z_norms(var)

alz_d.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,-0.212368,0,0,2,-0.655225,0,0.565923,0.492525,-1.253593,1.119918,...,0.497506,0,0,-1.104434,0,0,0,1,0,0
1,1.567757,0,0,0,-0.114751,0,-0.954895,0.945093,-1.538442,0.056836,...,0.704907,0,0,-0.810601,0,0,0,0,1,0
2,-0.212368,0,3,1,-1.366428,0,1.653006,1.023896,-1.088855,1.48738,...,0.281813,0,0,0.724491,0,1,0,1,0,0
3,-0.101111,1,0,1,0.851625,1,0.37693,1.227995,0.839804,0.760833,...,1.343346,0,1,0.508044,0,0,0,0,0,0
4,1.567757,0,0,0,-0.961607,0,1.461793,0.486696,-1.443293,-0.824566,...,0.333665,0,0,-1.684679,0,0,1,1,0,0


In [28]:
# Convert Diagnosis into categorical variable
alz_d['Diagnosis'] = alz_d['Diagnosis'].astype('object')
alz_d['Diagnosis'].unique()

array([0, 1], dtype=object)

In [29]:
# Assign Diagnosis to dependent variable
Y = np.asarray(alz_d['Diagnosis']).astype('float32').reshape(-1,1)
print(f'First 5 elements of y are {Y[:5]} \nShape of y is {Y.shape}')

First 5 elements of y are [[0.]
 [0.]
 [0.]
 [0.]
 [0.]] 
Shape of y is (2149, 1)


In [30]:
# Assign other variables to matrix X as features, excluding StudentID and GPA
X = alz_d.drop(['Diagnosis'], axis=1)
X = np.asarray(X).astype('float32')
print(f'First element of X is {X[0]} \nShape of X is {X.shape}')

First element of X is [-0.21236841  0.          0.          2.         -0.65522534  0.
  0.5659231   0.4925249  -1.2535934   1.1199181   0.          0.
  1.          1.          0.          0.          0.29815874 -1.0147502
  0.4036769  -1.5726606  -1.1144291  -0.64819944  0.7790368   0.49750587
  0.          0.         -1.1044345   0.          0.          0.
  1.          0.        ] 
Shape of X is (2149, 32)


In [31]:
# Split dataset into training, testing and cross-validation set
X_train, X_, Y_train, Y_ = train_test_split(X, Y, test_size=0.2)
X_cv, X_test, Y_cv, Y_test = train_test_split(X_, Y_, test_size=0.5)
print(f'The training set has {Y_train.shape[0]} entries while the testing set and cross-validation set has {Y_test.shape[0]} entries.')

The training set has 1719 entries while the testing set and cross-validation set has 215 entries.


# Slightly complex neural network

In [33]:
# Neural network with multiple layers and units
complex_mc_nn = Sequential(
    [
    tf.keras.Input(shape = (32,)),
    Dense(units=120, activation='relu', name='L1'),
    Dense(units=50, activation='sigmoid', name='L2'),
    Dense(units=10, activation='relu', name='L3'),
    Dense(units=1, activation='sigmoid', name='L4')
    ], name='ComplexNN'
)
complex_mc_nn.summary()

In [34]:
# Checking dimensions of parameters w and b in each layer
layer1, layer2, layer3, layer4 = complex_mc_nn.layers
W1, b1 = layer1.get_weights()
W2, b2 = layer2.get_weights()
W3, b3 = layer3.get_weights()
W4, b4 = layer4.get_weights()
print(f'W1 shape is {W1.shape}, b1 shape is {b1.shape}')
print(f'W2 shape is {W2.shape}, b2 shape is {b2.shape}')
print(f'W3 shape is {W3.shape}, b3 shape is {b3.shape}')
print(f'W4 shape is {W4.shape}, b4 shape is {b4.shape}')

W1 shape is (32, 120), b1 shape is (120,)
W2 shape is (120, 50), b2 shape is (50,)
W3 shape is (50, 10), b3 shape is (10,)
W4 shape is (10, 1), b4 shape is (1,)


In [35]:
# Specify loss and fit model to training set
complex_mc_nn.compile(
    loss = tf.keras.losses.BinaryCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
)

complex_mc_nn.fit(X_train, Y_train, epochs=500, verbose=0) 

<keras.src.callbacks.history.History at 0x1645e5fd0>

In [36]:
accs = train_and_cv_accuracy(complex_mc_nn, X_train, Y_train, X_cv, Y_cv)
print(f'Complex neural network \nTraining accuracy: {accs["Training accuracy"][0]:0.3f}, Cross-validation accuracy: {accs["Cross-validation accuracy"][0]:0.3f}')

[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 653us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419us/step
Complex neural network 
Training accuracy: 0.646, Cross-validation accuracy: 0.674


In [37]:
# Iterate to explore different regularization values
lambdas = [0.001, 0.01, 0.05]
models=[None] * len(lambdas)
best_lambda = 0
best_cv_acc = 0

for i in range(len(lambdas)):
    lambda_ = lambdas[i]
    models[i] =  Sequential(
        [
            tf.keras.Input(shape = (32,)),
            Dense(units=120, activation='relu', name='L1', kernel_regularizer=tf.keras.regularizers.l2(lambda_)),
            Dense(units=50, activation='sigmoid', name='L2', kernel_regularizer=tf.keras.regularizers.l2(lambda_)),
            Dense(units=10, activation='relu', name='L3', kernel_regularizer=tf.keras.regularizers.l2(lambda_)),
            Dense(units=1, activation='sigmoid', name='L4')
        ]
    )
    
    models[i].compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(0.01),
    )

    models[i].fit(X_train, Y_train, epochs=500, verbose=0)
    accs = train_and_cv_accuracy(models[i], X_train, Y_train, X_cv, Y_cv)
    if accs['Cross-validation accuracy'][0] > best_cv_acc:
        best_lambda = lambda_
        best_cv_acc = np.round(accs['Cross-validation accuracy'][0],3)

print(f'Best regularization lambda is {best_lambda} with cross-validation accuracy of {best_cv_acc}')

[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 720us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 470us/step
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 429us/step
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 436us/step
Best regularization lambda is 0.05 with cross-validation accuracy of 0.674
