# Group 7 

# Predicting at-risk student using Open University Learning Analytics(OULA) Dataset

In [5]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
     ---------------------------------------- 81.9/81.9 kB 2.3 MB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [6]:
# General data handling and computation
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
from sklearn.model_selection import train_test_split

# Machine Learning Models
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder

# Model evaluation and hyperparameter tuning
from sklearn.model_selection import train_test_split, cross_val_score, KFold

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set up visualizations to be displayed inline in the Jupyter Notebook
%matplotlib inline


# Load and clean data

In [7]:
student_info=pd.read_csv('studentInfo.csv')
student_info

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...
32588,GGG,2014J,2640965,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,Fail
32589,GGG,2014J,2645731,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,Distinction
32590,GGG,2014J,2648187,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,Pass
32591,GGG,2014J,2679821,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,Withdrawn


In [8]:
student_info.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32593 entries, 0 to 32592
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   code_module           32593 non-null  object
 1   code_presentation     32593 non-null  object
 2   id_student            32593 non-null  int64 
 3   gender                32593 non-null  object
 4   region                32593 non-null  object
 5   highest_education     32593 non-null  object
 6   imd_band              31482 non-null  object
 7   age_band              32593 non-null  object
 8   num_of_prev_attempts  32593 non-null  int64 
 9   studied_credits       32593 non-null  int64 
 10  disability            32593 non-null  object
 11  final_result          32593 non-null  object
dtypes: int64(3), object(9)
memory usage: 3.0+ MB


In [9]:
student_info.shape

(32593, 12)

In [10]:
unique_final_results = student_info['final_result'].unique()
unique_final_results

array(['Pass', 'Withdrawn', 'Fail', 'Distinction'], dtype=object)

# Clean the Data

Identify Missing Values

In [11]:
# Check for missing values
missing_values_count = student_info.isnull().sum()

# Calculate the percentage of missing values
missing_values_percentage = (missing_values_count / len(student_info)) * 100

# Create a DataFrame to display the results
missing_values_df = pd.DataFrame({
    'Missing Values Count': missing_values_count,
    'Percentage': missing_values_percentage
})

# Display the missing values DataFrame
print("Missing Values in Each Column:")
print(missing_values_df)

Missing Values in Each Column:
                      Missing Values Count  Percentage
code_module                              0    0.000000
code_presentation                        0    0.000000
id_student                               0    0.000000
gender                                   0    0.000000
region                                   0    0.000000
highest_education                        0    0.000000
imd_band                              1111    3.408707
age_band                                 0    0.000000
num_of_prev_attempts                     0    0.000000
studied_credits                          0    0.000000
disability                               0    0.000000
final_result                             0    0.000000


# 

# imd_band has missing 1111 out of the total 32593. . 

In [12]:
# Total number of rows
total_rows = 32593

# Number of missing values in 'imd_band'
missing_imd_band = 1111

# Calculate the percentage of missing values
missing_percentage_imd_band = (missing_imd_band / total_rows) * 100
missing_percentage_imd_band


3.408707391157611

Given the low percentage of missing data (3.41%), mode imputation is likely the simplest and most effective approach, especially if 'imd_band' is categorical. Since we want to maintain the integrity of the dataset and avoid potential biases, we chose mode imputation. 

In [13]:
# Handle missing values in 'imd_band' using mode imputation
mode_value = student_info['imd_band'].mode()[0]
student_info['imd_band'].fillna(mode_value, inplace=True)

# Display the cleaned dataset
print("Cleaned Data:")
print(student_info.head())

Cleaned Data:
  code_module code_presentation  id_student gender                region  \
0         AAA             2013J       11391      M   East Anglian Region   
1         AAA             2013J       28400      F              Scotland   
2         AAA             2013J       30268      F  North Western Region   
3         AAA             2013J       31604      F     South East Region   
4         AAA             2013J       32885      F  West Midlands Region   

       highest_education imd_band age_band  num_of_prev_attempts  \
0       HE Qualification  90-100%     55<=                     0   
1       HE Qualification   20-30%    35-55                     0   
2  A Level or Equivalent   30-40%    35-55                     0   
3  A Level or Equivalent   50-60%    35-55                     0   
4     Lower Than A Level   50-60%     0-35                     0   

   studied_credits disability final_result  
0              240          N         Pass  
1               60          N 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  student_info['imd_band'].fillna(mode_value, inplace=True)


# Preprocess Data

The target variable 'final_result' is encoded using LabelEncoder.
Categorical features are identified for mixed encoding (one-hot encoding and target encoding).
Numerical features are identified and standardized.

In [14]:
# Separate features and target
X = student_info.drop(columns=['final_result'])
y = student_info['final_result']

# Encode target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Identify categorical and numerical features
categorical_features = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability']
numerical_features = ['num_of_prev_attempts', 'studied_credits']

# Target Encoding Alternative
for col in ['highest_education', 'imd_band', 'age_band', 'disability']:
    mean_encoded_col = X.groupby(col)['studied_credits'].mean()
    X[col + '_mean_encoded'] = X[col].map(mean_encoded_col)
    X.drop(col, axis=1, inplace=True)

# One-Hot Encoding for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features + [col + '_mean_encoded' for col in ['highest_education', 'imd_band', 'age_band', 'disability']]),
        ('cat_onehot', OneHotEncoder(handle_unknown='ignore'), ['gender', 'region'])
    ]
)

Split data

In [15]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define Models : Random Forest, Logistic Regression, Support Vector Machine, Gradient Boosting

In [16]:
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Support Vector Machine': SVC(kernel='linear', probability=True, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}


# Train and Evaluate Each Model

In [18]:
for name, model in models.items():
    # Create a pipeline with preprocessor and model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

In [19]:
 pipeline.fit(X_train, y_train)

In [22]:
# Predict on validation set
y_val_pred = pipeline.predict(X_val)
# Evaluate performance on validation set
print(f"{name} Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"{name} Validation Classification Report:")
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

Gradient Boosting Validation Accuracy: 0.41971773368787074
Gradient Boosting Validation Classification Report:
              precision    recall  f1-score   support

 Distinction       0.33      0.01      0.02       450
        Fail       0.37      0.10      0.16      1103
        Pass       0.42      0.71      0.53      1833
   Withdrawn       0.42      0.42      0.42      1503

    accuracy                           0.42      4889
   macro avg       0.39      0.31      0.28      4889
weighted avg       0.40      0.42      0.37      4889



Based on the results, where the model shows relatively poor performance in predicting certain classes (especially "Distinction" and "Fail") and moderate performance in others. We need to improve models by deploying strategies to handle Class Imbalance. The model struggles with minority classes (e.g., "Distinction" and "Fail"). Addressing class imbalance can help improve performance.

 Class Weights: 
Adjust the class weights in the model to give more importance to the minority classes.

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

# Define the model with class weights
gb_model = GradientBoostingClassifier(random_state=42)

# Create a pipeline with preprocessor and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', gb_model)
])

# Train the model with class weights
pipeline.fit(X_train, y_train, classifier__sample_weight=[1 if y == 2 else 2 if y == 3 else 3 for y in y_train])


In [24]:
# Predict on validation set
y_val_pred = pipeline.predict(X_val)
# Evaluate performance on validation set
print(f"{name} Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"{name} Validation Classification Report:")
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

Gradient Boosting Validation Accuracy: 0.3282879934546942
Gradient Boosting Validation Classification Report:
              precision    recall  f1-score   support

 Distinction       0.25      0.18      0.21       450
        Fail       0.27      0.55      0.36      1103
        Pass       0.49      0.07      0.12      1833
   Withdrawn       0.39      0.53      0.45      1503

    accuracy                           0.33      4889
   macro avg       0.35      0.33      0.28      4889
weighted avg       0.39      0.33      0.28      4889

