In [1]:
import numpy as np
import pandas as pd
from pyBKT.models import Model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# BKT model with a fixed seed (60) and a single fit per skill (num_fits = 1).
model = Model(num_fits=1, seed=60)

In [3]:
# Fetch SS.csv from GitHub; '.' is the target location (the next cell reads
# "SS.csv" from the working directory).
model.fetch_dataset(
    "https://raw.githubusercontent.com/lishaparmar13/H5P-Enhanced-BKT/main/SS.csv",
    '.',
)

In [4]:
# Load the interaction log (latin-encoded) and show its last five rows.
df = pd.read_csv("SS.csv", encoding='latin')
df.tail()

Unnamed: 0,Row,Anon Student Id,Age,Problem Hierarchy,Problem Name,question_id,Step Start Time,Step End Time,Correct First Attempt,Step Duration (sec),Answer Type,KC(Default),skill_id,Opportunity (Default),template_id
489,22,22,16,2,Chocolates left,10,22-07-2023 17:48,22-07-2023 17:49,0,57,Single Choice set,Numbers,4,1,6
490,23,23,16,2,Chocolates left,10,22-07-2023 18:17,22-07-2023 18:17,1,3,Single Choice set,Numbers,4,1,6
491,24,24,15,2,Chocolates left,10,23-07-2023 11:57,23-07-2023 12:00,1,180,Single Choice set,Numbers,4,1,6
492,25,25,16+,2,Chocolates left,10,23-07-2023 12:37,23-07-2023 12:39,0,120,Single Choice set,Numbers,4,1,6
493,26,26,15,2,Chocolates left,10,24-07-2023 17:05,24-07-2023 17:07,1,60,Single Choice set,Numbers,4,1,6


In [5]:
# Hold out 20% of the rows (seeded split) for the prediction cells below.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

In [6]:
# Fit the BKT model on the training split.
model.fit(data=train_df)

In [7]:
# Refit straight from the CSV file (all rows, not only the training split)
# and list the skills that received fitted parameters.
model.fit(data_path='SS.csv')
print("Fitted Skills:\n%s" % '\n'.join(model.coef_))

In [None]:
# Fit on the training split, then score that same split twice: first with the
# default metric (RMSE), then with AUC. Both numbers are reported as training
# metrics, so both must be computed on train_df (the RMSE was previously
# computed on the full df, which also contains the held-out rows).
model.fit(data = train_df)
training_rmse = model.evaluate(data = train_df)
training_auc = model.evaluate(data = train_df, metric = 'auc')
print("Training RMSE: %f" % training_rmse)
print("Training AUC: %f" % training_auc)

In [None]:
# Custom metric that can be handed to model.evaluate(metric = ...).
def mae(true_vals, pred_vals):
  """Return the mean absolute error between two equally long sequences.

  Both inputs are converted to float arrays first, so plain lists work and
  pandas Series are compared position by position rather than being aligned
  on their index labels.

  Args:
    true_vals: observed values (array-like of numbers).
    pred_vals: predicted values (array-like of numbers, same length).

  Returns:
    The mean of |true - pred| as a numpy float.
  """
  true_arr = np.asarray(true_vals, dtype=float)
  pred_arr = np.asarray(pred_vals, dtype=float)
  return np.mean(np.abs(true_arr - pred_arr))

# Score the training split with the user-defined mae metric.
training_mae = model.evaluate(metric=mae, data=train_df)
print("Training MAE: %f" % training_mae)

In [None]:
# Start over with a fresh model (same seed, single fit).
model = Model(num_fits=1, seed=60)

In [None]:
# Fit on the training split, predict the held-out split, and show the first
# 20 rows of identifiers, observed outcome and the two prediction columns.
model.fit(data=train_df)
preds = model.predict(data=test_df)
preds.loc[:, ['Anon Student Id', 'KC(Default)', 'Correct First Attempt',
              'correct_predictions', 'state_predictions']].head(20)

In [None]:
# Sanity check: list the skills whose predicted correctness is not exactly 0.5.
# NOTE(review): presumably pyBKT emits 0.5 for skills it holds no fitted
# parameters for - confirm against the pyBKT docs. A fitted skill can also
# predict exactly 0.5, but that is unlikely.
preds.loc[preds['correct_predictions'] != 0.5, 'KC(Default)'].unique()

In [None]:
# model.evaluate scores the fitted model directly; with no metric given it
# uses the default one (RMSE). Note that df holds every row, i.e. both the
# training and the held-out split.
model.evaluate(data=df)

In [None]:
# Several metrics can be requested in one call by passing a list.
model.evaluate(
    data=df,
    metric=['rmse', 'accuracy', 'auc', 'mean_absolute_error'],
)

In [None]:
# Five-fold cross-validation on the full data, scored by accuracy.
# No skills argument is passed, so no skill filter is applied here.
model = Model(num_fits=1, seed=60)
model.crossvalidate(data=df, folds=5, metric='accuracy')

In [None]:
# Same five-fold cross-validation, this time with the default metric
# (no metric argument is passed).
model = Model(num_fits=1, seed=60)
model.crossvalidate(folds=5, data=df)

In [None]:
# Cross-validate one skill with one metric on the held-out split.
# Change `skill` or `metric` to try other combinations; later cells reuse both.
model = Model(num_fits=1, seed=60)
skill, metric = 'Algebra', 'rmse'

simple_cv = model.crossvalidate(metric=metric, skills=skill, data=test_df)
simple_cv

In [None]:
# Cross-validate the multigs variant (multigs = True) for Algebra, scored by AUC.
model = Model(num_fits=1, seed=60)
skill, metric = 'Algebra', 'auc'
multigs_cv = model.crossvalidate(
    data=test_df,
    skills=skill,
    metric=metric,
    multigs=True,
)
multigs_cv

In [None]:
# Same multigs cross-validation, scored by RMSE.
# `skill` is whatever an earlier cell assigned; it is not set in this cell.
model = Model(num_fits=1, seed=60)
metric = 'rmse'
multigs_cv = model.crossvalidate(
    data=test_df,
    skills=skill,
    metric=metric,
    multigs=True,
)
multigs_cv


In [None]:
# Cross-validate the multilearn variant on the full data, passing the
# "question_id" column name as the multilearn argument.
# `metric` is whatever an earlier cell assigned; it is not set in this cell.
model = Model(num_fits=1, seed=60)
skill = 'Algebra'

multilearn_cv = model.crossvalidate(
    data=df,
    skills=skill,
    metric=metric,
    multilearn="question_id",
)
multilearn_cv

In [None]:
# Three-fold cross-validation of two variants on the held-out split, each
# with a fresh model, stacked into one table for comparison.
# Per the original note, the multiprior model generates different priors
# based on the first response of each student.
model = Model(num_fits=1, seed=60)
multiprior_cv = model.crossvalidate(
    data=test_df, skills=skill, metric=metric, folds=3, multiprior=True,
)

model = Model(num_fits=1, seed=60)
multipair_cv = model.crossvalidate(
    data=test_df, skills=skill, metric=metric, folds=3, multipair=True,
)
pd.concat([multiprior_cv, multipair_cv], axis=0)

In [None]:
# Combination model: forgets, multilearn and multigs are all switched on
# for the same cross-validation run.
model = Model(num_fits=1, seed=60)
combo_cv = model.crossvalidate(
    data=test_df,
    skills=skill,
    metric=metric,
    forgets=True,
    multilearn=True,
    multigs=True,
)
combo_cv

In [None]:
# Fresh model for the parameter-initialization experiment.
model = Model(num_fits=1, seed=60)

In [None]:
# Pre-set the prior of the chosen skill to a tiny value (1e-40) before fitting.
skill = 'Fractions'

model.coef_ = {skill: dict(prior=1e-40)}
model.coef_

In [None]:
# Train starting from the pre-initialized parameters, then score the very rows
# the model was fitted on, so that the number printed below really is a
# training AUC. (It was previously scored on 'Sepskills.csv', a file this
# notebook neither downloads nor creates.)
model.fit(data= train_df, multigs = True)
low_prior_auc = model.evaluate(data = train_df, metric = 'auc')

# The fitted prior is read from model.coef_ by skill name, then parameter name.
print("Fitted Prior Value: %f" % model.coef_[skill]['prior'])
print("Training AUC: %f" % low_prior_auc)

In [None]:
# Re-initialize the prior to a more reasonable value and refit.
# The training split is used for both fitting and scoring so the result is
# comparable with the low-prior run, which is also fitted on train_df; the
# previous 'Sepskills.csv' path pointed at a file this notebook never obtains.
model.coef_ = {skill: {'prior': 0.5}}
model.fit(data = train_df, multigs = True)
normal_prior_auc = model.evaluate(data = train_df, metric = 'auc')

# Print the fitted prior value and the training AUC.
print("Fitted Prior Value: %f" % model.coef_[skill]['prior'])
print("Training AUC: %f" % normal_prior_auc)

In [None]:
import pandas as pd

# Show every row and column in full. None means "no limit" for each of these
# options; max_colwidth was previously 1, which truncates every cell to an
# ellipsis instead of showing its content.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


In [None]:
# Fresh model for inspecting the fitted parameters.
model = Model(num_fits=1, seed=60)

In [None]:
# Fit the combination model (forgets + multilearn + multigs) on all rows for
# the skill currently held in `skill`, then display the fitted parameters.
model.fit(
    data=df,
    skills=skill,
    forgets=True,
    multilearn=True,
    multigs=True,
)
model.params()

In [None]:
# The original note expects warnings about indexing past lexsort, so every
# warning is silenced from this point on.
import warnings
warnings.simplefilter(action='ignore')

# Model initialization: combination model fitted on all rows and all skills.
model = Model(num_fits=1, seed=60)
model.fit(data=df, forgets=True, multilearn=True, multigs=True)

# One line per parameter type (guesses, learns, forgets, slips) for the skill
# currently held in `skill`.
params = model.params()
plt.figure(figsize=(12, 6))
for param_name in ('guesses', 'learns', 'forgets', 'slips'):
    plt.plot(params.loc[(skill, param_name)], label=param_name.capitalize())
plt.xlabel('Template ID')
plt.ylabel('Rate')
plt.title('BKT Parameters per Template ID Class')
plt.legend();

In [None]:
import matplotlib.pyplot as plt

# Hard-coded values per skill and age group (replace with your actual data).
skills = ['Geometry', 'Numbers', 'Algebra', 'Percentages', 'Fractions', 'Ratio and Proportion']
under16_correct = [0.71905, 0.65945, 0.41178, 0.58205, 0.76517, 0.57274]
above16_correct = [0.51663, 0.56880, 0.41095, 0.60470, 0.68247, 0.63329]
under16_state = [0.72016, 0.20428, 0.74020, 0.44804, 0.83052, 0.27746]
above16_state = [0.47483, 0.83708, 0.47082, 0.45746, 0.76397, 0.01140]


def _plot_age_groups(under16_values, above16_values, ylabel, title):
    """Draw one marked line per age group across `skills` and show the figure."""
    plt.figure(figsize=(10, 6))
    for values, group in ((under16_values, 'Under16'), (above16_values, 'Above16')):
        plt.plot(skills, values, marker='o', label=group)
    plt.xlabel('Skills')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()


# Line plot of the correct-prediction values.
_plot_age_groups(under16_correct, above16_correct,
                 'Correct Predictions',
                 'Correct Predictions by Skill and Age Group')

# Line plot of the state-prediction values.
_plot_age_groups(under16_state, above16_state,
                 'State Predictions',
                 'State Predictions by Skill and Age Group')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pyBKT.models import Model


In [None]:
# Fresh BKT model (seed 60, single fit) for the second half of the notebook.
model = Model(num_fits=1, seed=60)

In [None]:
# Reload the interaction log for the IRT cells and show its last five rows.
df = pd.read_csv("SS.csv", encoding='latin')
df.tail()

In [None]:
class IRT:
    """One-parameter logistic item response model trained by gradient ascent.

    The probability that user ``i`` answers question ``j`` correctly is
    modelled as ``sigmoid(theta[i] - beta[j])``, where ``theta`` holds one
    value per user and ``beta`` one value per question. Both start at zero.

    Every method that takes ``data`` expects a mapping (a dict of sequences or
    a DataFrame) with the keys ``"user_id"``, ``"question_id"`` and
    ``"is_correct"``. The ids must be integer positions into ``theta`` and
    ``beta``; the columns are read positionally, so a DataFrame does not need
    a 0..n-1 index.
    """

    def __init__(self, lr, iterations, user_size, question_size):
        """Store the hyper-parameters and zero-initialise all state.

        Args:
            lr: step size of each gradient-ascent update.
            iterations: number of update rounds performed by ``irt``.
            user_size: number of distinct users (length of ``theta``).
            question_size: number of distinct questions (length of ``beta``).
        """
        self.lr = lr
        self.iterations = iterations
        self.user_size = user_size
        self.question_size = question_size
        self.theta = np.zeros(user_size)
        self.beta = np.zeros(question_size)
        # Per-iteration history filled in by irt().
        self.val_acc_lst = []
        self.neg_lld_lst = []
        self.val_lld_lst = []

    @staticmethod
    def _columns(data):
        """Return the user ids, question ids and outcomes of ``data`` as arrays."""
        users = np.asarray(data["user_id"], dtype=int)
        questions = np.asarray(data["question_id"], dtype=int)
        correct = np.asarray(data["is_correct"], dtype=float)
        return users, questions, correct

    def sigmoid(self, x):
        """Return the logistic function of ``x`` (scalar or array).

        Written as ``0.5 * (1 + tanh(x / 2))``, which is the same function as
        ``exp(x) / (1 + exp(x))`` but does not overflow to nan for large |x|.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(x, dtype=float)))

    def neg_log_lklihood(self, data):
        """Return the negative log-likelihood of ``data`` under theta/beta."""
        users, questions, correct = self._columns(data)
        diff = self.theta[users] - self.beta[questions]
        # log(1 + exp(diff)) computed without overflow.
        log_lklihood = np.sum(correct * diff - np.logaddexp(0.0, diff))
        return float(-log_lklihood)

    def update_theta_beta(self, data):
        """Take one gradient-ascent step on the log-likelihood of ``data``.

        For each observation the gradient w.r.t. theta[i] is ``c - sigmoid``
        and w.r.t. beta[j] is ``sigmoid - c``. Contributions are accumulated,
        so repeated (user, question) observations all count.
        """
        users, questions, correct = self._columns(data)
        residual = correct - self.sigmoid(self.theta[users] - self.beta[questions])

        grad_theta = np.zeros_like(self.theta)
        grad_beta = np.zeros_like(self.beta)
        # np.add.at accumulates over repeated indices (plain fancy-index
        # assignment would keep only the last contribution).
        np.add.at(grad_theta, users, residual)
        np.add.at(grad_beta, questions, -residual)

        self.theta = self.theta + self.lr * grad_theta
        self.beta = self.beta + self.lr * grad_beta

    def evaluate(self, data):
        """Return the fraction of rows whose thresholded prediction is right.

        A row is predicted correct when ``sigmoid(theta - beta) >= 0.5``.
        """
        users, questions, correct = self._columns(data)
        pred = self.sigmoid(self.theta[users] - self.beta[questions]) >= 0.5
        return np.sum(correct == pred) / len(correct)

    def irt(self, train_data, val_data):
        """Run the training loop and return the parameters plus history.

        Each iteration records the training negative log-likelihood, the
        validation accuracy and the validation negative log-likelihood as
        measured *before* that iteration's update, prints the first two, and
        then updates theta/beta on ``train_data``.

        Returns:
            ``(theta, beta, val_acc_lst, neg_lld_lst, val_lld_lst)``.
        """
        for _ in range(self.iterations):
            neg_lld = self.neg_log_lklihood(train_data)
            score = self.evaluate(val_data)
            self.val_acc_lst.append(score)
            self.neg_lld_lst.append(neg_lld)
            self.val_lld_lst.append(self.neg_log_lklihood(val_data))
            print("NLLK: {} \t Score: {}".format(neg_lld, score))
            self.update_theta_beta(train_data)

        return self.theta, self.beta, self.val_acc_lst, self.neg_lld_lst, self.val_lld_lst

In [None]:
def main(skill= 'Algebra', random_state=42):
    """Train and test an IRT model on the rows of ``df`` for one skill.

    Rows whose 'KC(Default)' equals ``skill`` are reduced to student, problem
    name and first-attempt correctness; students and problems are re-coded as
    consecutive integers. The rows are split 70% train / 30% hold-out, and the
    hold-out is split again 70% validation / 30% test.

    Args:
        skill: value of the 'KC(Default)' column to model.
        random_state: seed passed to both splits so a run can be repeated.

    Returns:
        ``(test_acc, user_dic, question_dic, theta, beta)`` where the two
        dicts map original student / problem labels to their integer codes and
        ``theta`` / ``beta`` are the fitted IRT parameters.
    """
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the global df.
    skill_rows = df.loc[
        df['KC(Default)'] == skill,
        ['Anon Student Id', 'Problem Name', 'Correct First Attempt'],
    ].copy()
    skill_rows.columns = ['user_id', "question_id", "is_correct"]

    user_codes, user_levels = pd.factorize(skill_rows['user_id'])
    skill_rows['user_id'] = user_codes
    user_dic = dict(zip(user_levels, range(len(user_levels))))

    question_codes, question_levels = pd.factorize(skill_rows['question_id'])
    skill_rows['question_id'] = question_codes
    question_dic = dict(zip(question_levels, range(len(question_levels))))

    train_data, holdout = train_test_split(
        skill_rows, test_size=0.3, random_state=random_state)
    val_data, test_data = train_test_split(
        holdout, test_size=0.3, random_state=random_state)

    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
    val_data = val_data.reset_index(drop=True)

    lr = 1e-2
    num_iterations = 50
    irt_model = IRT(lr, num_iterations, len(user_dic), len(question_dic))
    theta, beta, val_acc_lst, neg_lld_lst, val_lld_lst = irt_model.irt(train_data, val_data)

    print("Validation accuracy: {}".format(val_acc_lst[-1]))
    test_acc = irt_model.evaluate(test_data)
    print("Test accuracy: {}". format(test_acc))
    # Return the fitted question parameters (the builtin `bytearray` type was
    # returned here by mistake).
    return test_acc, user_dic, question_dic, theta, beta

# Run the IRT pipeline once per distinct skill and collect the test accuracy.
skill_list = df['KC(Default)'].unique().tolist()
skill_acc = []

for s in skill_list:
    test_acc, user_dic, question_dic, theta, beta = main(s)
    skill_acc += [(s, test_acc)]

# Report the accuracy of every skill.
for skill, acc in skill_acc:
    print(f"Skill: {skill}, Accuracy: {acc}")