# Identifying Struggles and Tutorial Impact in Online Education Logs
## By: [John Courtney](mailto:jrcourtney3797@eagle.fgcu.edu) and [Alana D'Angelo](adangelo9184@eagle.fgcu.edu)
#### MAS 4106 - Spring 2025
#### Instructor: [Dr. Alberto A. Condori](mailto:acondori@fgcu.edu), [FGCU](http://www.fgcu.edu)

#### Abstract:
The purpose of this material is to accurately make predictions on student performance to
tailor their education to their ability. Riiid is the source company, an AI EdTech company
based in South Korea. Enabling higher efficiency in education improves student
outcomes, which in turn opens doors for social mobility. Meeting students with support
for exactly where they're struggling is an efficient use of both the student and educators
time. This is important to educators, students, parents, and nearly everyone with a stake
in education.


K-Means Clustering as well as a regularized least squares model was implemented. The
k-means clustering revealed performance groups by capturing variation in accuracy,
standard deviation, and number of attempts. The regularized least squares yielded with
$\lambda=0.01$ achieved ~80.7\% accuracy and 100\% precision in predicting student success. Feature engineering was required on the original data to include metrics
such as mean accuracy, consistency trends, and effectiveness to train the model. These
findings suggest that predictive modeling with performance-based clustering can support
the development of personalized education strategies, since educators can reliably
determine whether a student needs help or falls within a performance group that is classified as intervention needing.

In [None]:
import Pkg; Pkg.add("LinearAlgebra"); Pkg.add("Plots"); Pkg.add("Statistics"); Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("Random"); Pkg.add("Clustering"); Pkg.add("PlotlyJS")

In [3]:
using LinearAlgebra
using Plots
using Statistics
using CSV, DataFrames
using Random

df = DataFrame()

for chunk in CSV.Chunks("train.csv"; ntasks=10)
    df_chunk = DataFrame(chunk)
    df_chunk = filter(row -> rand() < 0.01, df_chunk) # basically just randomly takes 1% of the dataset for each chunk
    filter!(row -> row.content_type_id == 0, df_chunk)
    append!(df, df_chunk)
end


In [None]:


grouped = combine(groupby(df[df.content_type_id .== 0, :], :user_id), 
  :answered_correctly => mean => :mean_acc,
  :answered_correctly => (x -> isnan(std(x)) ? 0.0 : std(x)) => :std_acc,
  :answered_correctly => length => :num_attempted,
  :prior_question_elapsed_time => (x -> mean(skipmissing(x))) => :avg_time,
  :prior_question_had_explanation => (x -> mean(skipmissing(x))) => :mean_explained 
)
grouped = dropmissing(grouped)


X = Matrix(grouped[:, [:mean_acc, :std_acc, :num_attempted, :avg_time, :mean_explained]])
replace!(X, Inf=>0); replace!(X, NaN=>0)

# Scale features
function scale_features(X)
    X_scaled = similar(X)
    for i in 1:size(X, 2)
        X_scaled[:, i] = (X[:, i] .- mean(X[:, i])) ./ std(X[:, i])
    end
    return X_scaled
end

#Regularization of the attempts as they can't be scaled with z-score.
function regularize_attempts(X, λ=0.1)
    X_reg = copy(X)
    X_reg[:, 3] = X_reg[:, 3] * λ
    return X_reg
end


function prepare_features(df)
    
    question_df = df[df.content_type_id .== 0, :]
    sort!(question_df, [:user_id, :timestamp])
    
    
    user_features = combine(groupby(question_df, :user_id)) do user_data
        
        mean_acc = mean(user_data.answered_correctly)
        std_acc = isnan(std(user_data.answered_correctly)) ? 0.0 : std(user_data.answered_correctly)
        
        # Time features
        time_diffs = diff(user_data.timestamp)
        mean_time_between = mean(time_diffs)
        std_time_between = isnan(std(time_diffs)) ? 0.0 : std(time_diffs)
        
        
        recent_acc = mean(user_data.answered_correctly[max(1, end-4):end])
        
        # Performance trend
        if length(user_data.answered_correctly) >= 10
            first_half = mean(user_data.answered_correctly[1:div(end,2)])
            second_half = mean(user_data.answered_correctly[div(end,2)+1:end])
            trend = second_half - first_half
        else
            trend = 0.0
        end
        
        
        container_acc = combine(groupby(user_data, :task_container_id), 
            :answered_correctly => mean => :container_acc).container_acc
        mean_container_acc = mean(container_acc)
        std_container_acc = isnan(std(container_acc)) ? 0.0 : std(container_acc)
        
        
        explanation_impact = 0.0
        valid_explanations = 0
        for i in 2:nrow(user_data)
            if !ismissing(user_data.prior_question_had_explanation[i]) && user_data.prior_question_had_explanation[i]
                explanation_impact += user_data.answered_correctly[i] - user_data.answered_correctly[i-1]
                valid_explanations += 1
            end
        end
        explanation_impact = valid_explanations > 0 ? explanation_impact / valid_explanations : 0.0
        
        # Time-accuracy
        time_values = collect(skipmissing(user_data.prior_question_elapsed_time))
        acc_values = float.(user_data.answered_correctly[.!ismissing.(user_data.prior_question_elapsed_time)])
        
        if length(time_values) > 1 && length(acc_values) > 1
            time_acc_correlation = cor(time_values, acc_values)
            time_acc_correlation = isnan(time_acc_correlation) ? 0.0 : time_acc_correlation
        else
            time_acc_correlation = 0.0
        end
        
        return (
            mean_acc = mean_acc,
            std_acc = std_acc,
            num_questions = nrow(user_data),
            mean_time_between = mean_time_between,
            std_time_between = std_time_between,
            recent_acc = recent_acc,
            performance_trend = trend,
            mean_container_acc = mean_container_acc,
            std_container_acc = std_container_acc,
            explanation_impact = Float64(explanation_impact),
            time_acc_correlation = time_acc_correlation
        )
    end
    
    # Feature Matrix
    feature_cols = [:mean_acc, :std_acc, :num_questions, :mean_time_between, 
                   :std_time_between, :recent_acc, :performance_trend, 
                   :mean_container_acc, :std_container_acc, :explanation_impact, 
                   :time_acc_correlation]
    
    for col in feature_cols
        user_features[!, col] = float.(user_features[!, col])
    end
    
    X = Matrix{Float64}(user_features[:, feature_cols])
    replace!(X, NaN=>0.0, Inf=>0.0, -Inf=>0.0)
    
    # Normalization
    X = (X .- mean(X, dims=1)) ./ std(X, dims=1)
    
    # Target value
    y = float.(user_features.mean_acc .>= median(user_features.mean_acc))
    
    return X, y, user_features, feature_cols
end

# Just the regularized least squares model implemented.
function regularized_ls(X, y, λ)
    n, p = size(X)
    β = (X'X + λ*I(p)) \ (X'y)
    return β
end

# Predicts the value of y given X and β.
function predict(X, β)
    return X * β
end

# Evaluates the accuracy of the model by comparing the predicted values to the true values.
function evaluate_model(y_true, y_pred)
    predictions = y_pred .>= 0.5
    accuracy = mean(predictions .== y_true)
    return accuracy
end

# Splits the data into training and testing sets.
function split_data(X, y, train_ratio=0.8)
    n = size(X, 1)
    shuffle_idx = randperm(n)
    train_size = floor(Int, train_ratio * n)
    
    X_train = X[shuffle_idx[1:train_size], :]
    y_train = y[shuffle_idx[1:train_size]]
    X_test = X[shuffle_idx[(train_size+1):end], :]
    y_test = y[shuffle_idx[(train_size+1):end]]
    
    return X_train, y_train, X_test, y_test
end

# Confusion matrix
function print_confusion_matrix_table(X, y, β)
    y_pred_continuous = predict(X, β)
    y_pred_binary = y_pred_continuous .>= 0.5
    
    # Metrics
    tp = sum((y .== 1) .& (y_pred_binary .== 1))
    fp = sum((y .== 0) .& (y_pred_binary .== 1))
    tn = sum((y .== 0) .& (y_pred_binary .== 0))
    fn = sum((y .== 1) .& (y_pred_binary .== 0))
    
    total = tp + fp + tn + fn
    p_total = sum(y .== 1)
    n_total = sum(y .== 0)
    
    tp_pct = round(tp / total * 100, digits=2)
    fp_pct = round(fp / total * 100, digits=2)
    tn_pct = round(tn / total * 100, digits=2)
    fn_pct = round(fn / total * 100, digits=2)
    
    accuracy = (tp + tn) / total
    precision = tp / (tp + fp)
    
    println("Confusion Matrix:")
    println("=================")
    println("                 Predicted Negative    Predicted Positive    Total")
    println("Actual Negative  $tn ($tn_pct%)        $fp ($fp_pct%)          $n_total")
    println("Actual Positive  $fn ($fn_pct%)        $tp ($tp_pct%)          $p_total")
    println("Total            $(tn + fn)            $(fp + tp)              $total")
    println()
    println("Metrics:")
    println("=========")
    println("Accuracy:  $(round(accuracy * 100, digits=2))%")
    println("Precision: $(round(precision * 100, digits=2))%")
end

In [None]:
using Clustering
K = 20
results = kmeans(X', K; maxiter=20)
assignments = results.assignments

gr()

X_3d = X[:, [1, 2, 3]]

p2 = Plots.scatter(X_3d[:, 1], X_3d[:, 2], X_3d[:, 3],
    zcolor = assignments,
    markersize = 3,
    xlabel = "Mean Accuracy",
    ylabel = "Std Dev Accuracy",
    zlabel = "Number of Attempts",
    title = "Student Clusters: Performance vs Engagement",
    colorbar_title = "Cluster",
    legend = false
)

display(p2)



# Elbow method
m, n = size(X)
J = zeros(K)
for k in 1:K
    result = kmeans(X', k; maxiter=100)
    assignments = result.assignments
    centers = result.centers
    J[k] = sum(norm(X[i,:] - centers[:, assignments[i]], 2)^2 for i in 1:m)
end

p3 = Plots.plot(1:K, J, marker=:circle, 
    xlabel="k", ylabel="SSE", 
    title="SSE vs. k (Elbow Method)", 
    legend=false)
display(p3)

In [None]:
X, y, user_features, feature_cols = prepare_features(df)

X_train, y_train, X_test, y_test = split_data(X, y)

β = regularized_ls(X_train, y_train, .01)

metrics = print_confusion_matrix_table(X_test, y_test, β)