## Early Predictor for Student Success Based on Behavioural and Demographical Indicators

Import libraries

In [1]:
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

Load data

In [2]:
zf = zipfile.ZipFile('./data.zip')
student_info = pd.read_csv(zf.open('studentInfo.csv'))
student_vle = pd.read_csv(zf.open('studentVle.csv'))
student_assessment = pd.read_csv(zf.open('studentAssessment.csv'))
student_registration = pd.read_csv(zf.open('studentRegistration.csv'))
assessments = pd.read_csv(zf.open('assessments.csv'))

Basic information about modules, first assessments and number of registered students

In [3]:
# Get information about first day assessments
basic_info = assessments.filter(items=['code_module', 'code_presentation', 'date'])
basic_info = basic_info.groupby(['code_module', 'code_presentation']).min()
basic_info = pd.merge(basic_info, assessments, on=['code_module', 'code_presentation', 'date'],
                             how='inner').filter(items=['code_module', 'code_presentation', 'id_assessment', 'date'])

# Get total number of registered students for each module
registrations = student_registration.filter(items=['code_module', 'code_presentation', 'id_student']).groupby(
    ['code_module', 'code_presentation']).count().reset_index()
basic_info.insert(4, "number of registered students", registrations['id_student'])
basic_info

Unnamed: 0,code_module,code_presentation,id_assessment,date,number of registered students
0,AAA,2013J,1752,19.0,383
1,AAA,2014J,1758,19.0,365
2,BBB,2013B,14984,19.0,1767
3,BBB,2013J,14996,19.0,2237
4,BBB,2014B,15008,12.0,1613
5,BBB,2014J,15020,19.0,2292
6,CCC,2014B,24286,18.0,1936
7,CCC,2014J,24295,18.0,2498
8,DDD,2013B,25341,23.0,1303
9,DDD,2013J,25348,25.0,1938


### Data preparation

In [4]:
clicks_before_start = student_vle[student_vle['date'] < 0]

# Merge student_info and student_registration tables to get registration_date
df1 = pd.merge(student_info, student_registration, how='left', on=['id_student', 'code_module', 'code_presentation'])

# Merge previous table with basic_info that was created earlier to get first assessment days and ids
df2 = pd.merge(df1, basic_info, on=['code_module', 'code_presentation'], how='left')

# Merge previous table with student assessment to find student's scores on their first assessments
df3 = pd.merge(df2, student_assessment, on=['id_assessment', 'id_student'], how='left')

# Merge previous table with clicks_before_start table that was created earlier
data = pd.merge(df3, clicks_before_start, on=['id_student', 'code_module', 'code_presentation'], how='left')

# Keep only columns that are needed later
data = data.filter(['code_module', 'code_presentation', 'id_student', 'score', 'highest_education', 'sum_click',
                    'date_registration', 'age_band',
                    'disability', 'gender', 'num_of_prev_attempts', 'final_result'])
data.fillna(value=0, inplace=True)

# Get number of clicks before course start
data = data.groupby(
    ['code_module', 'code_presentation', 'id_student', 'score', 'highest_education', 'date_registration',
     'age_band', 'disability', 'gender', 'num_of_prev_attempts', 'final_result']).sum().reset_index()

# Get the final dataframe that will be used
data = data[['id_student', 'gender', 'highest_education', 'age_band', 'num_of_prev_attempts', 'disability', 'score',
             'date_registration', 'sum_click', 'final_result']]
data.rename(columns={'score': 'first_assignment', 'sum_click': 'clicks_before_start',
                     'num_of_prev_attempts': 'previous_attempts', 'age_band': 'age'}, inplace=True)
data.head()

Unnamed: 0,id_student,gender,highest_education,age,previous_attempts,disability,first_assignment,date_registration,clicks_before_start,final_result
0,11391,M,HE Qualification,55<=,0,N,78.0,-159.0,98.0,Pass
1,28400,F,HE Qualification,35-55,0,N,70.0,-53.0,215.0,Pass
2,30268,F,A Level or Equivalent,35-55,0,Y,0.0,-92.0,102.0,Withdrawn
3,31604,F,A Level or Equivalent,35-55,0,N,72.0,-52.0,169.0,Pass
4,32885,F,Lower Than A Level,0-35,0,N,69.0,-176.0,295.0,Pass


Converting all categorical variables into dichotomous variables.

In [5]:
education_mapping = {
    'No Formal quals': 0,
    'Lower Than A Level': 0,
    'A Level or Equivalent': 0,
    'HE Qualification': 1,
    'Post Graduate Qualification': 1
}

age_mapping = {
    '0-35': 0,
    '35-55': 1,
    '55<=': 1
}

grade_mapping = {
    'Withdrawn': -1,
    'Fail': -1,
    'Pass': 1,
    'Distinction': 1
}

gender_mapping = {
    'F': 1,
    'M': 0
}

disability_mapping = {
    'N': 0,
    'Y': 1
}

data['highest_education'] = data['highest_education'].map(education_mapping)
data['age'] = data['age'].map(age_mapping)
data['final_result'] = data['final_result'].map(grade_mapping)
data['gender'] = data['gender'].map(gender_mapping)
data['disability'] = data['disability'].map(disability_mapping)

data.head()

Unnamed: 0,id_student,gender,highest_education,age,previous_attempts,disability,first_assignment,date_registration,clicks_before_start,final_result
0,11391,0,1,1,0,0,78.0,-159.0,98.0,1
1,28400,1,1,1,0,0,70.0,-53.0,215.0,1
2,30268,1,0,1,0,1,0.0,-92.0,102.0,-1
3,31604,1,0,1,0,0,72.0,-52.0,169.0,1
4,32885,1,0,0,0,0,69.0,-176.0,295.0,1


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['first_assignment', 'highest_education', 'age', 'gender', 'previous_attempts', 'disability', 'clicks_before_start', 'date_registration']],
                                                    data['final_result'], test_size=0.3, random_state=2)

### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(class_weight='balanced', min_samples_split=10)
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', min_samples_split=10)

### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', min_samples_split=10)
random_forest.fit(X_train, y_train)


RandomForestClassifier(class_weight='balanced', min_samples_split=10, n_jobs=-1)

### BART

In [10]:
from bartpy.sklearnmodel import SklearnModel as BART

bart = BART(n_burn=200, n_chains=2, n_samples=200, n_trees=50, alpha=0.9, beta=1.5)
bart.fit(X_train, y_train)

SklearnModel(alpha=0.9, beta=1.5, n_chains=2, n_trees=50)

### KNN Classifier

In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=-1, n_neighbors=100)

### SVC
#### Grid search for optimal parameters, do not run the grid search again (it lasts for hours, literally)

In [12]:
from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV
#
# defining parameter range
# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf']}
#
# grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
#
# # fitting the model for grid search
# grid.fit(X_train, y_train)
# print(grid.best_params_)

print("Result of grid search: {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}")

svc = SVC(C=1000, gamma=0.001)
svc.fit(X_train, y_train)

Result of grid search: {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}


SVC(C=1000, gamma=0.001)

In [22]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

results = []

for model in [decision_tree, random_forest, bart, knn, svc]:
    for score in [precision_score, recall_score, f1_score, accuracy_score]:
        if model == bart:
            results.append(score(y_pred=[1 if res >= 0 else -1 for res in model.predict(X_test)], y_true=y_test))
        else:
            results.append(score(y_pred=model.predict(X_test), y_true=y_test))
split = np.array_split(results, 5)


multi = pd.MultiIndex.from_product(
    [['Pass vs Fail'], ['Precision', 'Recall', 'F1', 'Accuracy']],
    names=['Final result', 'Metric'])

results = pd.DataFrame(index=multi, columns=['Decision tree', 'Random forest', 'BART', 'KNN', 'SVC'],
                       data={'Decision tree': split[0], 'Random forest': split[1], 'BART': split[2], 'KNN': split[3], 'SVC': split[4]})
results


Unnamed: 0_level_0,Unnamed: 1_level_0,Decision tree,Random forest,BART,KNN,SVC
Final result,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Pass vs Fail,Precision,0.663998,0.688615,0.699431,0.6807,0.673528
Pass vs Fail,Recall,0.690944,0.799658,0.840453,0.839171,0.796668
Pass vs Fail,F1,0.677203,0.739994,0.763485,0.751674,0.729941
Pass vs Fail,Accuracy,0.684598,0.730927,0.750665,0.734506,0.717734
