In [2]:
import pandas as pd
import numpy as np
import glob
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report



In [3]:
all_files = glob.glob('Data/*.csv')
data = pd.concat((pd.read_csv(f) for f in all_files))

In [4]:
missing_values = data.isnull().sum()
print(missing_values)

id_assessment                 10726852
id_student                        6592
date_submitted                10727058
is_banked                     10727058
score                         10727231
code_module                     173912
code_presentation               173912
gender                        10868377
region                        10868377
highest_education             10868377
imd_band                      10869488
age_band                      10868377
num_of_prev_attempts          10868377
studied_credits               10868377
disability                    10868377
final_result                  10868377
id_site                         239326
date                            245495
sum_click                       245690
module_presentation_length    10900948
activity_type                 10894606
week_from                     10899849
week_to                       10899849
date_registration             10868422
date_unregistration           10890898
assessment_type          

In [5]:
print(data.columns)

Index(['id_assessment', 'id_student', 'date_submitted', 'is_banked', 'score',
       'code_module', 'code_presentation', 'gender', 'region',
       'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts',
       'studied_credits', 'disability', 'final_result', 'id_site', 'date',
       'sum_click', 'module_presentation_length', 'activity_type', 'week_from',
       'week_to', 'date_registration', 'date_unregistration',
       'assessment_type', 'weight'],
      dtype='object')


In [16]:
# Label encoding for categorical variables
label_encoder = LabelEncoder()
categorical_features = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability', 'code_module', 'code_presentation', 'activity_type', 'assessment_type']
for column in categorical_features:
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Fill missing values
for column in data.columns:
    if data[column].dtype == 'object':  # if the column is categorical
        data.loc[:, column].fillna(data[column].mode()[0], inplace=True)  # fill with the most frequent value
    else:  # if the column is numeric
        data.loc[:, column].fillna(data[column].mean(), inplace=True)  # fill with the mean

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.loc[:, column].fillna(data[column].mean(), inplace=True)  # fill with the mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.loc[:, column].fillna(data[column].mode()[0], inplace=True)  # fill with the most frequent value


In [17]:
# Feature Selection: Selecting the features that might influence a student's decision to withdraw.
features = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability', 'code_module', 'code_presentation', 'sum_click', 'activity_type', 'date_submitted', 'is_banked', 'score', 'num_of_prev_attempts', 'studied_credits', 'module_presentation_length', 'assessment_type', 'weight', 'date_registration', 'date_unregistration']
X = data[features]

In [18]:
# Defining the target variable: The target variable is 'final_result'. 
# Need to convert it to a binary format where 'Withdrawn' is 1 and everything else is 0.
y = data['final_result'].apply(lambda x: 1 if x == 'Withdrawn' else 0)

In [19]:
# Split the data: Splitting data into a training set and a test set. 
# Split is 80% of the data for training and 20% for testing.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Random Forest Classifier the machine learning model that I will use to make my predictions. 
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [21]:
# Training the chosen model on the training data.
model.fit(X_train, y_train)

In [23]:
# Use the test data to evaluate the performance of the model. 
# Accuracy, precision, recall, and the F1 score are all potential metrics to use.
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2178132
           1       0.43      0.30      0.35      2062

    accuracy                           1.00   2180194
   macro avg       0.71      0.65      0.67   2180194
weighted avg       1.00      1.00      1.00   2180194



In [None]:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The classification report suggests that the model is performing extremely well on class 0 but poorly on class 1. 
# This is indicated by the high precision, recall, and F1-score for class 0 and the relatively low scores for class 1.

    # more detail #
# The model seems to be very good at predicting students who will not withdraw from the course (class 0),
# but it performs poorly at predicting students who will withdraw (class 1).