# Procrastination Prediction using OULAD Dataset
**Author:** Jeremiah Agbaje  
**Date:** February 2026

Pre-training a Bi-LSTM model on OULAD behavioral data for transfer learning.

## Setup and Imports

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn tensorflow statsmodels -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence every warning so the notebook output stays compact.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn / Keras; consider a narrower filter.
warnings.filterwarnings('ignore')

# Plot theme and colour palette shared by all figures below.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
from sklearn.decomposition import PCA

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Report the TensorFlow build and the GPU devices it can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"TensorFlow: {tf.__version__}")
print(f"GPU: {gpu_devices}")

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller

## Load Data from Google Drive

In [None]:
# Mount Google Drive at /content/drive (the google.colab module only exists
# inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder holding the OULAD CSV files. File names are appended by plain string
# concatenation, so the trailing '/' is required.
BASE_PATH = '/content/drive/MyDrive/ALU Capstone/OULAD_data/'

In [None]:
print("Loading datasets...")

# Read the five OULAD tables; the target names on the left line up one-to-one
# with the file names in the tuple.
student_vle, student_assessment, student_info, assessments, vle = (
    pd.read_csv(BASE_PATH + csv_name)
    for csv_name in (
        'studentVle.csv',
        'studentAssessment.csv',
        'studentInfo.csv',
        'assessments.csv',
        'vle.csv',
    )
)

# Row counts of the three tables used most below.
print(f"VLE interactions: {len(student_vle):,}")
print(f"Assessments: {len(student_assessment):,}")
print(f"Students: {len(student_info):,}")

## Data Exploration

In [None]:
# Preview the first rows of the click log.
student_vle.head()

In [None]:
# Preview the first rows of the assessment submissions.
student_assessment.head()

In [None]:
# Preview the first rows of the student enrolment table.
student_info.head()

In [None]:
# Per-column null counts for the click log and the submissions table.
vle_missing = student_vle.isnull().sum()
assess_missing = student_assessment.isnull().sum()

print("Missing values:")
print(vle_missing)
print("\n", assess_missing)

## Sample Data for Faster Processing

In [None]:
SAMPLE_SIZE = 5000

# Reproducible random subset of studentInfo rows, capped at the table size.
n_sampled_rows = min(SAMPLE_SIZE, len(student_info))
student_sample = student_info.sample(n=n_sampled_rows, random_state=42)
student_ids = student_sample['id_student'].unique()

# Keep only the click-log and submission rows belonging to the sampled ids.
in_vle_sample = student_vle['id_student'].isin(student_ids)
in_assess_sample = student_assessment['id_student'].isin(student_ids)
vle_sample = student_vle.loc[in_vle_sample]
assess_sample = student_assessment.loc[in_assess_sample]

# NOTE(review): this prints the number of sampled rows; len(student_ids) can
# be smaller if an id appears in more than one sampled row.
print(f"Sampled {len(student_sample)} students")

## Feature Engineering

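Creating per-enrolment behavioral indicators of procrastination: late submissions, irregular activity, activity concentrated in the week before a deadline, and gaps between active days.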

In [None]:
def engineer_features(vle_data, assess_data, assess_info, student_data):
    """Build one row of procrastination indicators per student enrolment.

    An enrolment is a unique (code_module, code_presentation, id_student)
    triple taken from ``student_data``.

    Args:
        vle_data: Click log with columns code_module, code_presentation,
            id_student, date and sum_click.
        assess_data: Submissions with columns id_student, id_assessment and
            date_submitted.
        assess_info: Assessment metadata with columns id_assessment and date
            (the deadline). When it also has code_module and
            code_presentation, each submission is attributed only to the
            enrolment of the module it belongs to (submissions whose
            id_assessment is missing from ``assess_info`` are then ignored).
            Without those columns every submission of the student is used.
        student_data: Table with columns code_module, code_presentation and
            id_student.

    Returns:
        DataFrame with one row per enrolment and the columns:
        id_student, code_module, code_presentation,
        late_rate (share of submissions handed in after the deadline),
        avg_days_late (mean of date_submitted - deadline; negative = early),
        irregularity (std / mean of clicks per active day; NaN when there is
        a single active day because the sample std is undefined),
        last_min_ratio (share of all clicks that fall within the 7 days up to
        and including any deadline; each click is counted once),
        avg_gap / max_gap (mean / largest difference between consecutive
        active dates; 0 with a single active day),
        total_clicks, active_days, num_assessments.
        Enrolments without clicks or without submissions get 0 for the
        corresponding indicators.
    """
    enrol_keys = ['code_module', 'code_presentation', 'id_student']
    unique_students = student_data[enrol_keys].drop_duplicates()

    # Group the click log once instead of re-scanning it for every enrolment.
    # Each value is that enrolment's clicks per date, ordered by date.
    daily_by_enrolment = {
        key: grp.groupby('date')['sum_click'].sum()
        for key, grp in vle_data.groupby(enrol_keys, sort=False)
    }

    # Attach deadlines (and, when available, the owning module) to every
    # submission once, then index the result per enrolment.
    module_cols = ['code_module', 'code_presentation']
    module_scoped = all(col in assess_info.columns for col in module_cols)
    info_cols = ['id_assessment', 'date'] + (module_cols if module_scoped else [])
    assess_full = assess_data.merge(assess_info[info_cols], on='id_assessment', how='left')
    assess_full['days_late'] = assess_full['date_submitted'] - assess_full['date']
    assess_keys = enrol_keys if module_scoped else 'id_student'
    assess_by_key = {
        key: grp for key, grp in assess_full.groupby(assess_keys, sort=False)
    }
    no_assessments = assess_full.iloc[0:0]

    features = []
    for idx, (module, presentation, sid) in enumerate(unique_students.values):
        if idx % 1000 == 0:
            print(f"Processing {idx}/{len(unique_students)}...")

        enrolment = (module, presentation, sid)
        s_assess_full = assess_by_key.get(
            enrolment if module_scoped else sid, no_assessments
        )
        daily = daily_by_enrolment.get(enrolment)
        has_clicks = daily is not None and len(daily) > 0

        # --- submission timing ---
        num_assessments = len(s_assess_full)
        if num_assessments > 0:
            days_late = s_assess_full['days_late']
            late_rate = (days_late > 0).sum() / num_assessments
            avg_late = days_late.mean()
        else:
            late_rate = avg_late = 0

        # --- activity volume and regularity ---
        if has_clicks:
            mean_clicks = daily.mean()
            irregularity = daily.std() / mean_clicks if mean_clicks > 0 else 0
            total_clicks = daily.sum()
            active_days = len(daily)

            active_dates = daily.index.to_numpy()
            # A single active date has no gaps; use one zero-length gap so
            # mean()/max() stay defined (a plain list here used to crash).
            gaps = np.diff(active_dates) if active_days > 1 else np.array([0])
            avg_gap = gaps.mean()
            max_gap = gaps.max()
        else:
            irregularity = total_clicks = active_days = avg_gap = max_gap = 0

        # --- pre-deadline cramming ---
        last_min_ratio = 0
        if has_clicks and num_assessments > 0 and total_clicks > 0:
            # Mark every active date lying in some [deadline - 7, deadline]
            # window; the union keeps overlapping windows from counting the
            # same clicks twice, so the result stays within [0, 1].
            in_final_week = np.zeros(active_days, dtype=bool)
            for deadline in s_assess_full['date'].dropna():
                in_final_week |= (active_dates >= deadline - 7) & (active_dates <= deadline)
            last_min_ratio = daily.to_numpy()[in_final_week].sum() / total_clicks

        features.append({
            'id_student': sid,
            'code_module': module,
            'code_presentation': presentation,
            'late_rate': late_rate,
            'avg_days_late': avg_late,
            'irregularity': irregularity,
            'last_min_ratio': last_min_ratio,
            'avg_gap': avg_gap,
            'max_gap': max_gap,
            'total_clicks': total_clicks,
            'active_days': active_days,
            'num_assessments': num_assessments
        })

    return pd.DataFrame(features)

In [None]:
print("Engineering features...")

# One feature row per sampled enrolment; `assessments` supplies the deadlines.
features_df = engineer_features(
    vle_data=vle_sample,
    assess_data=assess_sample,
    assess_info=assessments,
    student_data=student_sample,
)

print(f"Created {len(features_df)} feature vectors")

In [None]:
# Preview the engineered feature table.
features_df.head()

In [None]:
# Keep enrolments that have at least one submission and at least one click.
has_assessments = features_df['num_assessments'] > 0
has_activity = features_df['total_clicks'] > 0
features_clean = features_df.loc[has_assessments & has_activity].copy()

print(f"Valid students: {len(features_clean)}")

## Feature Distributions

In [None]:
features_to_plot = [
    'late_rate', 'avg_days_late', 'irregularity',
    'last_min_ratio', 'avg_gap', 'max_gap',
]

# 2x3 grid with one histogram per indicator; missing values are drawn as 0.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
axes = axes.flatten()

for ax, feat in zip(axes, features_to_plot):
    values = features_clean[feat].fillna(0)
    ax.hist(values, bins=40, edgecolor='black', alpha=0.7)
    ax.set_title(feat.replace('_', ' ').title())
    ax.set_xlabel('Value')

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of the plotted indicators.
features_clean[features_to_plot].describe()

## Correlation Analysis

In [None]:
# Pairwise correlations between the engineered indicators.
corr_matrix = features_clean[features_to_plot].corr()

# Annotated heatmap; the colour scale is centred on 0 so positive and negative
# correlations get opposite colours.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## K-Means Clustering

Creating procrastination labels from behavioral patterns

In [None]:
# Indicators fed to K-Means. Missing values are replaced by 0 before scaling.
cluster_features = ['late_rate', 'irregularity', 'last_min_ratio', 'avg_gap']
X_cluster = features_clean[cluster_features].fillna(0)

In [None]:
# Standardise the clustering inputs. `scaler` is pickled at the end of the
# notebook so the same transform can be re-applied later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

In [None]:
# Candidate cluster counts and the two quality curves collected for each.
K_range = range(2, 8)
inertias, silhouettes = [], []

for k in K_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    labels = kmeans.labels_
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_scaled, labels))

In [None]:
# Side-by-side diagnostics for choosing k: inertia (left) and silhouette
# score (right), both plotted against the number of clusters.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(K_range, inertias, 'bo-')
ax1.set_xlabel('Number of Clusters')
ax1.set_ylabel('Inertia')
ax1.set_title('Elbow Method')
ax1.grid(True, alpha=0.3)

ax2.plot(K_range, silhouettes, 'ro-')
ax2.set_xlabel('Number of Clusters')
ax2.set_ylabel('Silhouette Score')
ax2.set_title('Silhouette Analysis')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Final clustering. k is fixed at 3 here rather than derived from the curves
# above; the Low/Medium/High mapping further down relies on exactly 3 clusters.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
features_clean['cluster'] = kmeans.fit_predict(X_scaled)

# Cluster sizes, ordered by cluster id.
print(features_clean['cluster'].value_counts().sort_index())

In [None]:
# Mean of each clustering indicator per cluster (raw, unscaled values).
cluster_summary = features_clean.groupby('cluster')[cluster_features].mean()
cluster_summary

In [None]:
# Rank clusters by the plain average of three of their mean indicators
# (avg_gap is left out) and name them from lowest to highest score.
# NOTE(review): the three indicators are averaged on their raw scales, so the
# one with the largest numeric range dominates the ranking — confirm intended.
risk_scores = cluster_summary[['late_rate', 'irregularity', 'last_min_ratio']].mean(axis=1)
cluster_order = risk_scores.sort_values().index.tolist()

# Hard-wired to exactly three clusters (positions 0, 1, 2 of the ranking).
risk_map = {cluster_order[0]: 'Low', cluster_order[1]: 'Medium', cluster_order[2]: 'High'}
features_clean['risk'] = features_clean['cluster'].map(risk_map)

print(features_clean['risk'].value_counts())

## Cluster Visualization

In [None]:
# Project the scaled clustering features onto two principal components,
# used only for the scatter plot below.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: one scatter series per cluster, labelled with its risk name.
# The boolean mask is applied by position, which relies on X_pca rows being in
# the same order as features_clean rows.
for cluster in features_clean['cluster'].unique():
    mask = features_clean['cluster'] == cluster
    risk = risk_map[cluster]
    ax1.scatter(X_pca[mask, 0], X_pca[mask, 1], label=f'{risk} Risk', alpha=0.6, s=30)

ax1.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
ax1.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
ax1.set_title('Student Clusters (PCA)')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: number of enrolments per risk level, with a fixed colour per level.
risk_counts = features_clean['risk'].value_counts()
colors = {'Low': '#2ecc71', 'Medium': '#f39c12', 'High': '#e74c3c'}
ax2.bar(risk_counts.index, risk_counts.values, color=[colors[x] for x in risk_counts.index])
ax2.set_xlabel('Risk Level')
ax2.set_ylabel('Students')
ax2.set_title('Risk Distribution')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Time Series Analysis

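Checking whether one sample student's daily click series shows temporal structure (stationarity, autocorrelation) that a sequence model such as an LSTM could exploit.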

In [None]:
# Use the first valid enrolment as the example for the time-series checks.
sample_student = features_clean.iloc[0]

# Rows of the click log belonging to exactly that enrolment.
is_sample_enrolment = (
    (vle_sample['id_student'] == sample_student['id_student'])
    & (vle_sample['code_module'] == sample_student['code_module'])
    & (vle_sample['code_presentation'] == sample_student['code_presentation'])
)
student_clicks = vle_sample[is_sample_enrolment]

# Total clicks per date, ordered by date. Dates without any click are absent.
clicks_per_date = student_clicks.groupby('date')['sum_click'].sum()
daily_clicks = clicks_per_date.sort_index()

print(f"Analyzing student {sample_student['id_student']}")
print(f"Days of activity: {len(daily_clicks)}")

In [None]:
# Line plot of the sample student's clicks. Only the values are plotted, so
# the x-axis is the position within the series of active dates, not the
# calendar date.
plt.figure(figsize=(12, 4))
plt.plot(daily_clicks.values)
plt.title('Daily Click Pattern (Sample Student)')
plt.xlabel('Day')
plt.ylabel('Clicks')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Augmented Dickey-Fuller test on the sample series: element 0 of the result
# is the test statistic, element 1 the p-value. The series is reported as
# stationary when p < 0.05.
adf_result = adfuller(daily_clicks.values)
print(f"ADF Statistic: {adf_result[0]:.4f}")
print(f"p-value: {adf_result[1]:.4f}")
print(f"Stationary: {adf_result[1] < 0.05}")

In [None]:
# plot_pacf only accepts lag counts below half the number of observations, so
# cap the 30-lag request for a sample student with a short activity history
# instead of letting the cell fail.
max_lags = min(30, len(daily_clicks) // 2 - 1)

if max_lags < 1:
    print(f"Only {len(daily_clicks)} active days: too few for ACF/PACF plots")
else:
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

    plot_acf(daily_clicks.values, lags=max_lags, ax=ax1)
    ax1.set_title('Autocorrelation Function (ACF)')

    plot_pacf(daily_clicks.values, lags=max_lags, ax=ax2)
    ax2.set_title('Partial Autocorrelation Function (PACF)')

    plt.tight_layout()
    plt.show()

## Sequence Generation for LSTM

In [None]:
def create_sequences(vle_data, features_data, seq_len=30):
    """Cut each labelled enrolment's click series into fixed-length windows.

    For every row of ``features_data`` the enrolment's clicks are summed per
    date (dates without clicks are not present in the series), and every run
    of ``seq_len`` consecutive entries becomes one training sample carrying
    the enrolment's encoded risk label. Each window is divided by its own
    maximum when that maximum is positive.

    Args:
        vle_data: Click log with columns code_module, code_presentation,
            id_student, date and sum_click.
        features_data: One row per enrolment with columns code_module,
            code_presentation, id_student and risk. A ``risk_enc`` column
            holding the integer-encoded risk is added to (or overwritten in)
            this DataFrame in place.
        seq_len: Number of entries per window; must be at least 1.

    Returns:
        Tuple ``(X, y, label_enc)``: X has shape (n_windows, seq_len, 1),
        y has shape (n_windows,), and label_enc is the fitted LabelEncoder.
        Enrolments with fewer than ``seq_len`` active dates contribute no
        windows; with no windows at all X is empty with shape
        (0, seq_len, 1).

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    label_enc = LabelEncoder()
    features_data['risk_enc'] = label_enc.fit_transform(features_data['risk'])

    # Group the click log once instead of filtering it for every enrolment.
    enrol_keys = ['code_module', 'code_presentation', 'id_student']
    daily_by_enrolment = {
        key: grp.groupby('date')['sum_click'].sum()
        for key, grp in vle_data.groupby(enrol_keys, sort=False)
    }

    sequences = []
    labels = []
    enrolments = zip(
        features_data['code_module'],
        features_data['code_presentation'],
        features_data['id_student'],
        features_data['risk_enc'],
    )
    # Progress is reported by row position; the DataFrame's index labels are
    # not contiguous once rows have been filtered out.
    for pos, (module, presentation, sid, risk_enc) in enumerate(enrolments):
        if pos % 500 == 0:
            print(f"{pos}/{len(features_data)}")

        daily = daily_by_enrolment.get((module, presentation, sid))
        if daily is None:
            continue

        clicks = daily.to_numpy(dtype=float)
        # +1 so the final window (ending on the last active date) is kept; a
        # series of exactly seq_len entries therefore yields one window.
        for start in range(len(clicks) - seq_len + 1):
            window = clicks[start:start + seq_len]
            peak = window.max()
            window_norm = window / peak if peak > 0 else window
            sequences.append(window_norm.reshape(-1, 1))
            labels.append(risk_enc)

    # Explicit reshape/dtype keep the result 3-D and integer-labelled even
    # when no enrolment produced a window.
    X = np.array(sequences, dtype=float).reshape(-1, seq_len, 1)
    y = np.array(labels, dtype=int)
    return X, y, label_enc

In [None]:
print("Creating sequences...")
# Besides returning the arrays, this call adds a 'risk_enc' column to
# features_clean in place.
X_seq, y_labels, label_enc = create_sequences(vle_sample, features_clean, seq_len=30)

print(f"X shape: {X_seq.shape}")
print(f"y shape: {y_labels.shape}")
print(f"Classes: {label_enc.classes_}")

## Train-Test Split

In [None]:
# Stratified 80/20 split over individual windows.
# NOTE(review): create_sequences emits many overlapping windows per student,
# and this split does not keep a student's windows together, so near-identical
# windows of one student can land in both train and test. Test metrics are
# likely optimistic; a split grouped by student would avoid this.
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

print(f"Train: {X_train.shape[0]} samples")
print(f"Test: {X_test.shape[0]} samples")

In [None]:
# Class balance of the training windows. Each count is looked up by its
# encoded class id, so names stay correct even when a risk class has no
# windows in y_train (pairing classes_ with counts by position would shift
# the names in that case).
unique, counts = np.unique(y_train, return_counts=True)
for cls_id, cnt in zip(unique, counts):
    cls = label_enc.classes_[cls_id]
    print(f"{cls}: {cnt} ({cnt/len(y_train)*100:.1f}%)")

## Bi-LSTM Model

In [None]:
def build_model(seq_len, n_features, n_classes):
    """Assemble and compile the stacked bidirectional-LSTM classifier.

    Args:
        seq_len: Number of time steps in each input window.
        n_features: Number of values per time step.
        n_classes: Width of the softmax output layer.

    Returns:
        A compiled Sequential model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric) that expects integer class
        labels.
    """
    net = Sequential()

    # Recurrent encoder: three Bi-LSTM layers of shrinking width. Only the
    # last one collapses the sequence into a single vector.
    net.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(seq_len, n_features)))
    net.add(Dropout(0.3))
    net.add(Bidirectional(LSTM(64, return_sequences=True)))
    net.add(Dropout(0.3))
    net.add(Bidirectional(LSTM(32, return_sequences=False)))
    net.add(Dropout(0.2))

    # Dense classification head.
    net.add(Dense(64, activation='relu'))
    net.add(Dropout(0.2))
    net.add(Dense(32, activation='relu'))
    net.add(Dense(n_classes, activation='softmax'))

    net.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

In [None]:
# Size the output layer from the label encoder rather than from the classes
# that happen to occur in y_train: if one risk class produced no windows, the
# remaining encoded labels would otherwise exceed the softmax width.
model = build_model(
    seq_len=X_train.shape[1],
    n_features=X_train.shape[2],
    n_classes=len(label_enc.classes_)
)

model.summary()

## Training

In [None]:
# Stop after 10 epochs without val_loss improvement and restore the best
# weights; separately keep the checkpoint with the best val_accuracy on disk.
# NOTE(review): the two callbacks monitor different metrics, so
# 'best_model.h5' and the weights restored into `model` may come from
# different epochs.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True)
]

In [None]:
# Train for up to 50 epochs, holding out 20% of the training windows for
# validation; `history` feeds the learning-curve plots below.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

## Evaluation

In [None]:
# Loss and accuracy on the held-out test windows.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {acc:.4f}")

In [None]:
# Predicted class = index of the largest softmax output.
y_pred = np.argmax(model.predict(X_test), axis=1)

# Pass the full list of encoded class ids so the report rows always line up
# with target_names, even if a class is missing from y_test and y_pred.
class_ids = np.arange(len(label_enc.classes_))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=label_enc.classes_))

In [None]:
# Fix the matrix to one row/column per encoded class so it always matches the
# tick labels, even if a class is missing from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(label_enc.classes_)))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=label_enc.classes_,
           yticklabels=label_enc.classes_)
plt.title('Confusion Matrix')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.show()

## Training History

In [None]:
# Learning curves per epoch: accuracy (left) and loss (right), each for the
# training data and the validation hold-out.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(history.history['accuracy'], label='Train')
ax1.plot(history.history['val_accuracy'], label='Validation')
ax1.set_title('Accuracy')
ax1.set_xlabel('Epoch')
ax1.legend()
ax1.grid(True, alpha=0.3)

ax2.plot(history.history['loss'], label='Train')
ax2.plot(history.history['val_loss'], label='Validation')
ax2.set_title('Loss')
ax2.set_xlabel('Epoch')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Save Artifacts

In [None]:
# Write the trained model to an HDF5 file in the working directory.
model.save('procrastination_bilstm.h5')
print("Model saved")

In [None]:
# Export the feature table (including the cluster, risk and risk_enc columns
# added above) without the DataFrame index.
features_clean.to_csv('features_with_labels.csv', index=False)
print("Features saved")

In [None]:
import pickle

# Pickle the fitted preprocessors next to the model, one file per object.
for pkl_name, fitted_obj in (
    ('scaler.pkl', scaler),
    ('label_encoder.pkl', label_enc),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(fitted_obj, f)

print("Preprocessors saved")

## Summary

Pre-trained model ready for transfer learning with local institutional data.