In [1]:
# Import dependencies.
import matplotlib
from matplotlib import style
style.use('fivethirtyeight')
import matplotlib.pyplot as plt
import pandas as pd

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, text, inspect, func

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [3]:
engine = create_engine("sqlite:///open_university.sqlite", echo=False)

In [4]:
dataset = engine.execute(text("""
SELECT sI.id_student, sI.code_module, sI.code_presentation, sVle.date, sVle.sum_click, sI.num_of_prev_attempts, sI.final_result
FROM studentVle as sVle
OUTER LEFT JOIN studentInfo as sI ON sI.id_student = sVle.id_student
LIMIT 100000
""")).fetchall()

df = pd.DataFrame(dataset, columns=['id_student', 'code_module', 'code_presentation', 'day', 'clicks', 'num_of_prev_attempts', 'final_result'])
display(df.count())
display(df.head())

id_student              13006575
code_module             13006575
code_presentation       13006575
day                     13006575
clicks                  13006575
num_of_prev_attempts    13006575
final_result            13006575
dtype: int64

Unnamed: 0,id_student,code_module,code_presentation,day,clicks,num_of_prev_attempts,final_result
0,28400,AAA,2013J,-10,4,0,Pass
1,28400,AAA,2013J,-10,1,0,Pass
2,28400,AAA,2013J,-10,1,0,Pass
3,28400,AAA,2013J,-10,11,0,Pass
4,28400,AAA,2013J,-10,1,0,Pass


In [5]:
#df = df.astype({'gender':'string', 'imd_band':'string', 'highest_education':'string', 'age_band':'string', 'region':'string', 'final_result':'string'})
#df.dtypes


In [6]:
display(df.nunique())
#display(df.value_counts("gender"))
#display(df.value_counts("imd_band"))
#display(df.value_counts("highest_education"))
#print(f"A-Level is equivilent to high school cert, HE Qualification is level 1 ro 2 of University, ")
#display(df.value_counts("age_band"))
#display(df.value_counts("num_of_prev_attempts"))
#display(df.value_counts("region"))
display(df.value_counts("final_result"))

id_student              26074
code_module                 7
code_presentation           4
day                       295
clicks                    498
num_of_prev_attempts        7
final_result                4
dtype: int64

final_result
Pass           7163176
Distinction    2421423
Withdrawn      1830536
Fail           1591440
Name: count, dtype: int64

In [7]:
# distinction is a higher form of pass
# withdrawn would be a fail if the student would not complete the course both fail and withdrawn is a form of failure.
# Possible to remove withdrawn
df_edited = df.copy()
df_edited["final_result"] = df_edited["final_result"].replace({'Distinction': 'Pass', 'Withdrawn': 'Fail'})
df_edited["final_result"] = df_edited["final_result"].replace({'Pass': '1', 'Fail': '0'})
df_edited = df_edited.astype({'id_student': 'float32', 'day': 'float32', 'clicks': 'float32', 'num_of_prev_attempts': 'float32', 'final_result': 'float32'})
display(df_edited["final_result"].unique())
display(df_edited.dtypes)

array([1., 0.], dtype=float32)

id_student              float32
code_module              object
code_presentation        object
day                     float32
clicks                  float32
num_of_prev_attempts    float32
final_result            float32
dtype: object

In [8]:
df_edited = pd.get_dummies(df_edited)
df_edited.head()

Unnamed: 0,id_student,day,clicks,num_of_prev_attempts,final_result,code_module_AAA,code_module_BBB,code_module_CCC,code_module_DDD,code_module_EEE,code_module_FFF,code_module_GGG,code_presentation_2013B,code_presentation_2013J,code_presentation_2014B,code_presentation_2014J
0,28400.0,-10.0,4.0,0.0,1.0,True,False,False,False,False,False,False,False,True,False,False
1,28400.0,-10.0,1.0,0.0,1.0,True,False,False,False,False,False,False,False,True,False,False
2,28400.0,-10.0,1.0,0.0,1.0,True,False,False,False,False,False,False,False,True,False,False
3,28400.0,-10.0,11.0,0.0,1.0,True,False,False,False,False,False,False,False,True,False,False
4,28400.0,-10.0,1.0,0.0,1.0,True,False,False,False,False,False,False,False,True,False,False


In [9]:
# combine pass and distinction
# combine widthdrawn and fail
X = df_edited.drop('final_result', axis=1).values
y = df_edited['final_result'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)


In [10]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(9754931, 15)

In [11]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])

# X shape / input feature / columns rule of thumbs is normally 2-3 times the ammount so 43 * 2 or 3 is 86-129 so i will try choosing 90 for the first hidden node
hidden_nodes1 = 90
hidden_nodes2 = 60
hidden_nodes3 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes1, input_shape=(number_input_features,), activation='relu'))
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes2, activation='relu'))
# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes3, activation='relu'))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train, epochs=20)

Epoch 1/20
[1m304842/304842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 465us/step - accuracy: 0.7496 - loss: 0.5201
Epoch 2/20
[1m304842/304842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 432us/step - accuracy: 0.7555 - loss: 0.5090
Epoch 3/20
[1m304842/304842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 429us/step - accuracy: 0.7573 - loss: 0.5057
Epoch 4/20
[1m304842/304842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 439us/step - accuracy: 0.7584 - loss: 0.5042
Epoch 5/20
[1m304842/304842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 431us/step - accuracy: 0.7595 - loss: 0.5023
Epoch 6/20
[1m304842/304842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 426us/step - accuracy: 0.7598 - loss: 0.5020
Epoch 7/20
[1m304842/304842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 441us/step - accuracy: 0.7606 - loss: 0.5008
Epoch 8/20
[1m304842/304842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 440us

KeyboardInterrupt: 

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Export our model to HDF5 file
nn.save("models/Student_Pass1Fail_relu90+relu60+relu30+sigmoid.h5")

In [None]:
# plotting the accuracy
df_edited = pd.DataFrame(fit_model.history, index = range(1, len(fit_model.history['loss'])+1))
df_edited.plot(y = 'accuracy')