In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC, AdaBoostClassifier as ABC
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.linear_model import Perceptron 
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import optuna
from sklearn.metrics import confusion_matrix
import OpenLA as la
from sklearn.metrics import f1_score

### At-risk prediction: load the student vectors (generated with fastText trained on ALL-2020) and the student grades

In [88]:
def make_data_set(dim,cl_train,cl_test,vec_suffix=""):
    """Build train/test features and at-risk labels for one pair of courses.

    For each course the pre-computed student vectors are read from CSV and
    inner-joined on ``userid`` with the grades returned by OpenLA's
    ``CourseInformation.grade_point_df()``.

    Parameters
    ----------
    dim : int
        Vector dimensionality; only used to build the CSV file name.
    cl_train, cl_test : str
        Course ids of the training and the test course.
    vec_suffix : str, optional
        Text inserted between ``dim`` and ``.csv`` in the vector file name
        (e.g. ``"_A20"``). The default ``""`` reproduces the original paths.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``: the merged frames without the
        ``grade`` column, and the ``grade`` column itself.
    """
    Edudata = r'.\data\EduData_20221028'
    vec_path_template = r'.\data\vectors\norm_Student_Vctors_course{}_{}dim{}.csv'
    # C/D/F become the positive (1) class, everything better becomes 0.
    # 'S' is included so this loader agrees with the other make_data_set_* loaders;
    # without it an 'S' grade would be left as a string among the 0/1 labels.
    grade_dict = {'S':0, 'A':0, 'B':0, 'C':1, 'D':1, 'F':1}

    def _load_course(course_id):
        # Features and labels of a single course, aligned on userid.
        vec = pd.read_csv(vec_path_template.format(course_id,dim,vec_suffix)).set_index('userid')
        grade = la.CourseInformation(files_dir=Edudata, course_id=course_id).grade_point_df().set_index("userid")
        grade = grade.replace(grade_dict)
        # The inner join keeps only students present in both the vectors and the grades.
        data = pd.merge(vec,grade,left_index=True,right_index=True,how="inner")
        return data.drop(columns="grade"), data["grade"]

    x_train,y_train = _load_course(cl_train)
    x_test,y_test = _load_course(cl_test)
    return x_train,y_train,x_test,y_test

In [89]:
def at_risk_prediction(train_data, train_label, test_data, test_label, model_k="rfc"):
    """Tune one classifier family on the training course and score it on the test course.

    A grid search (3-fold CV, F1 scoring) selects hyperparameters on the
    training data; the refit best estimator and an untuned model with default
    parameters are then both evaluated on the test data.

    Parameters
    ----------
    train_data, train_label
        Features and labels used for the grid search and for fitting.
    test_data, test_label
        Features and labels used only to compute the reported F1 scores.
    model_k : {"rfc", "svc", "ada", "knn"}
        Selects RandomForest, SVC, AdaBoost or k-nearest-neighbours.

    Returns
    -------
    tuple
        ``(best_params, f1_best, f1_nt)``: the grid-search winner's parameters,
        its test F1, and the test F1 of the default-parameter model.

    Raises
    ------
    ValueError
        If ``model_k`` is not one of the supported keys.
    """
    if model_k == "rfc":
        model = RFC(random_state=42)
        model_params ={
            'n_estimators':[10,20,30,50,100],
            'criterion': ['gini','entropy','log_loss'],
            'max_depth' : [10,20,None]
        }
        model_nt = RFC(random_state=42)
    elif model_k == "svc":
        model = SVC(random_state=42)
        model_params = {
            "C": [10** i for i in range(-3,3)],
            "kernel": ["linear","poly","rbf","sigmoid"]
        }
        model_nt = SVC(random_state=42)
    elif model_k == "ada":
        model = ABC(random_state=42)
        model_params = {
            'n_estimators':[1,10,20,30,50,100],
            'learning_rate':[0.1,0.5,1.0,2.0,5.0],
            'algorithm': ['SAMME','SAMME.R']
        }
        model_nt = ABC(random_state=42)
    elif model_k == 'knn':
        model = KNN()
        model_params = {
            'n_neighbors' : [3,5,7,10],
            'weights': ["uniform","distance"],
            'algorithm':['auto','ball_tree','kd_tree','brute']
        }
        model_nt = KNN()
    else:
        # Fail loudly instead of hitting an unbound `model` further down.
        raise ValueError(
            "unsupported model_k {!r}; expected one of 'rfc', 'svc', 'ada', 'knn'".format(model_k)
        )

    # Select the best model by grid search; each parameter set is scored with 3-fold CV.
    gscv = GridSearchCV(model, model_params,cv=3,scoring='f1',verbose=0)
    gscv.fit(train_data,train_label)
    # Estimator refit with the best-scoring parameters.
    best = gscv.best_estimator_

    # Predict with the tuned model. f1_score expects (y_true, y_pred); binary F1
    # is symmetric in the two, so the reported value is unchanged by this order.
    pred = best.predict(test_data)
    f1_best = f1_score(test_label,pred)

    # Untuned baseline: same model family with default hyperparameters.
    model_nt.fit(train_data,train_label)
    pred_nt = model_nt.predict(test_data)
    f1_nt = f1_score(test_label,pred_nt)
    return gscv.best_params_, f1_best, f1_nt

### Results for E2Vec with k = 100

In [91]:
# Dimensionality of the student vectors loaded in this section.
dim = 100
# Courses that are paired up as train/test in the experiment loop.
cl_courses = [
    "D-2022",
    "D-2021",
    "A-2022",
    "A-2021",
]

In [92]:
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]

# Collect one record per (train course, test course, model) and build the frame
# once at the end: calling pd.concat inside the loop re-copies the whole table
# on every iteration, and routing a row through np.array forces every value
# (including the F1 scores) to object dtype.
records = []
for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue  # only cross-course evaluation
        x_train,y_train,x_test,y_test = make_data_set(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # Record the `dim` actually used to load the vectors instead of a literal.
            # 30 is the recorded epoch value (presumably the fastText training epochs; confirm).
            records.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_df = pd.DataFrame(records,columns=columns)
# Best of the tuned and the default model for each row.
results_df["result"] = results_df[["f1_bestparam","f1_default"]].max(axis=1)

  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)


### Table 9

In [94]:
# All models trained on A-2021 and tested on A-2022.
results_df.loc[results_df["c_train"].eq("A-2021") & results_df["c_test"].eq("A-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.72,0.722222,0.722222
33,A-2021,A-2022,100,30,svc,"{'C': 1, 'kernel': 'rbf'}",0.685714,0.685714,0.685714
34,A-2021,A-2022,100,30,ada,"{'algorithm': 'SAMME.R', 'learning_rate': 0.1,...",0.60274,0.566038,0.60274
35,A-2021,A-2022,100,30,knn,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.489796,0.367347,0.489796


### Table 10

In [95]:
# All models trained on D-2021 and tested on D-2022.
results_df.loc[results_df["c_train"].eq("D-2021") & results_df["c_test"].eq("D-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
0,D-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.64,0.553191,0.64
1,D-2021,D-2022,100,30,svc,"{'C': 0.1, 'kernel': 'poly'}",0.618182,0.54902,0.618182
2,D-2021,D-2022,100,30,ada,"{'algorithm': 'SAMME.R', 'learning_rate': 1.0,...",0.677966,0.666667,0.677966
3,D-2021,D-2022,100,30,knn,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",0.35,0.35,0.35


### Table 11

In [96]:
# Random-forest rows for every course pair, ordered by train then test course.
results_df.loc[results_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.72,0.722222,0.722222
20,A-2021,D-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.597222,0.597222,0.597222
8,A-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.528926,0.521739,0.528926
44,A-2022,A-2021,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.707317,0.691358,0.707317
16,A-2022,D-2021,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.671875,0.645669,0.671875
4,A-2022,D-2022,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.490566,0.514286,0.514286
40,D-2021,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.528302,0.478261,0.528302
28,D-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.242424,0.1875,0.242424
0,D-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.64,0.553191,0.64
36,D-2022,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.590164,0.581818,0.590164


### Results for E2Vec with k = 10

In [97]:
# Dimensionality of the student vectors loaded in this section.
dim = 10
# Courses that are paired up as train/test in the experiment loop.
cl_courses = [
    "D-2022",
    "D-2021",
    "A-2022",
    "A-2021",
]

In [98]:
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]

# Accumulate plain records and build the frame once; pd.concat inside the loop
# re-copies the whole table on every iteration and np.array coerces each row
# to object dtype.
records = []
for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue  # only cross-course evaluation
        # Use the section's `dim` (set to 10 in the cell above) for both the loader
        # and the recorded value, so the two cannot drift apart.
        x_train,y_train,x_test,y_test = make_data_set(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # 30 is the recorded epoch value (presumably the fastText training epochs; confirm).
            records.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_k10_df = pd.DataFrame(records,columns=columns)
# Best of the tuned and the default model for each row.
results_k10_df["result"] = results_k10_df[["f1_bestparam","f1_default"]].max(axis=1)

  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)


### Table 9

In [100]:
# All models trained on A-2021 and tested on A-2022 (10-dimensional vectors).
results_k10_df.loc[results_k10_df["c_train"].eq("A-2021") & results_k10_df["c_test"].eq("A-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,10,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.684932,0.675676,0.684932
33,A-2021,A-2022,10,30,svc,"{'C': 10, 'kernel': 'rbf'}",0.591549,0.701299,0.701299
34,A-2021,A-2022,10,30,ada,"{'algorithm': 'SAMME', 'learning_rate': 1.0, '...",0.634921,0.626866,0.634921
35,A-2021,A-2022,10,30,knn,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",0.628571,0.619718,0.628571


### Table 10

In [101]:
# All models trained on D-2021 and tested on D-2022 (10-dimensional vectors).
results_k10_df.loc[results_k10_df["c_train"].eq("D-2021") & results_k10_df["c_test"].eq("D-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
0,D-2021,D-2022,10,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.634921,0.645161,0.645161
1,D-2021,D-2022,10,30,svc,"{'C': 10, 'kernel': 'poly'}",0.588235,0.54902,0.588235
2,D-2021,D-2022,10,30,ada,"{'algorithm': 'SAMME.R', 'learning_rate': 0.1,...",0.678571,0.637681,0.678571
3,D-2021,D-2022,10,30,knn,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.454545,0.454545,0.454545


### Table 11

In [102]:
# Random-forest rows for every course pair, ordered by train then test course.
results_k10_df.loc[results_k10_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,10,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.684932,0.675676,0.684932
20,A-2021,D-2021,10,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.526316,0.548148,0.548148
8,A-2021,D-2022,10,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.477876,0.486957,0.486957
44,A-2022,A-2021,10,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.657534,0.694444,0.694444
16,A-2022,D-2021,10,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.676471,0.685714,0.685714
4,A-2022,D-2022,10,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.484848,0.430769,0.484848
40,D-2021,A-2021,10,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.634921,0.617647,0.634921
28,D-2021,A-2022,10,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.615385,0.62963,0.62963
0,D-2021,D-2022,10,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.634921,0.645161,0.645161
36,D-2022,A-2021,10,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.52459,0.537313,0.537313


### E2VecA

In [112]:
# Vector dimensionality used by the loaders called in the following sections.
dim = 100

In [113]:
def make_data_set_A(dim,cl_train,cl_test):
    """Build train/test features and at-risk labels from the ``_A20`` vector files.

    For the training and the test course in turn, reads the student vectors
    from ``norm_Student_Vctors_course<course>_<dim>dim_A20.csv``, maps letter
    grades to 0/1 (C/D/F -> 1) and inner-joins both on ``userid``.

    Returns ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` frames are
    the joined data without ``grade`` and the ``y_*`` series are ``grade``.
    """
    edu_dir = r'.\data\EduData_20221028'
    label_map = {'S':0, 'A':0, 'B':0, 'C':1, 'D':1, 'F':1}

    splits = []
    for course_id in (cl_train,cl_test):
        vec_path = r'.\data\vectors\norm_Student_Vctors_course{}_{}dim_A20.csv'.format(course_id,dim)
        vectors = pd.read_csv(vec_path).set_index('userid')
        course = la.CourseInformation(files_dir=edu_dir, course_id=course_id)
        grades = course.grade_point_df().set_index("userid").replace(label_map)
        # Keep only students that have both a vector and a grade.
        joined = pd.merge(vectors,grades,left_index=True,right_index=True,how="inner")
        splits.append((joined.drop(columns="grade"),joined["grade"]))

    (x_train,y_train),(x_test,y_test) = splits
    return x_train,y_train,x_test,y_test

In [114]:
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]

# Accumulate plain records and build the frame once; pd.concat inside the loop
# re-copies the whole table on every iteration and np.array coerces each row
# to object dtype.
records = []
for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue  # only cross-course evaluation
        x_train,y_train,x_test,y_test = make_data_set_A(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # Record the `dim` passed to the loader rather than a separate literal.
            # 30 is the recorded epoch value (presumably the fastText training epochs; confirm).
            records.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_A_df = pd.DataFrame(records,columns=columns)
# Best of the tuned and the default model for each row.
results_A_df["result"] = results_A_df[["f1_bestparam","f1_default"]].max(axis=1)

  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)


### Table 9

In [115]:
# All models trained on A-2021 and tested on A-2022 (E2VecA vectors).
results_A_df.loc[results_A_df["c_train"].eq("A-2021") & results_A_df["c_test"].eq("A-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.676471,0.648649,0.676471
33,A-2021,A-2022,100,30,svc,"{'C': 0.001, 'kernel': 'linear'}",0.716049,0.702703,0.716049
34,A-2021,A-2022,100,30,ada,"{'algorithm': 'SAMME.R', 'learning_rate': 2.0,...",0.619718,0.617647,0.619718
35,A-2021,A-2022,100,30,knn,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.44,0.538462,0.538462


### Table 10

In [116]:
# All models trained on D-2021 and tested on D-2022 (E2VecA vectors).
results_A_df.loc[results_A_df["c_train"].eq("D-2021") & results_A_df["c_test"].eq("D-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
0,D-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.625,0.583333,0.625
1,D-2021,D-2022,100,30,svc,"{'C': 1, 'kernel': 'poly'}",0.618182,0.615385,0.618182
2,D-2021,D-2022,100,30,ada,"{'algorithm': 'SAMME.R', 'learning_rate': 1.0,...",0.62963,0.631579,0.631579
3,D-2021,D-2022,100,30,knn,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",0.3,0.263158,0.3


### Table 11

In [117]:
# Random-forest rows for every course pair, ordered by train then test course.
results_A_df.loc[results_A_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.676471,0.648649,0.676471
20,A-2021,D-2021,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.571429,0.575342,0.575342
8,A-2021,D-2022,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.534483,0.521008,0.534483
44,A-2022,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.691358,0.691358,0.691358
16,A-2022,D-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.759259,0.759259,0.759259
4,A-2022,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.592593,0.592593,0.592593
40,D-2021,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.325581,0.409091,0.409091
28,D-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.235294,0.235294,0.235294
0,D-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.625,0.583333,0.625
36,D-2022,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.561404,0.45283,0.561404


### E2VecD

In [118]:
def make_data_set_D(dim,cl_train,cl_test):
    """Build train/test features and at-risk labels from the ``_D20`` vector files.

    Each course's student vectors
    (``norm_Student_Vctors_course<course>_<dim>dim_D20.csv``) are inner-joined
    on ``userid`` with that course's grades, after letter grades are mapped
    to 0/1 (C/D/F -> 1).

    Returns ``(x_train, y_train, x_test, y_test)``.
    """
    data_root = r'.\data\EduData_20221028'
    to_label = {'S':0, 'A':0, 'B':0, 'C':1, 'D':1, 'F':1}

    def _vectors(course_id):
        # Student vectors of one course, indexed by userid.
        csv_path = r'.\data\vectors\norm_Student_Vctors_course{}_{}dim_D20.csv'.format(course_id,dim)
        return pd.read_csv(csv_path).set_index('userid')

    def _labels(course_id):
        # Grades of one course with the letter grades mapped to 0/1.
        info = la.CourseInformation(files_dir=data_root, course_id=course_id)
        return info.grade_point_df().set_index("userid").replace(to_label)

    vec_train,vec_test = _vectors(cl_train),_vectors(cl_test)
    lab_train,lab_test = _labels(cl_train),_labels(cl_test)

    # Inner joins drop students missing from either the vectors or the grades.
    joined_train = pd.merge(vec_train,lab_train,left_index=True,right_index=True,how="inner")
    joined_test = pd.merge(vec_test,lab_test,left_index=True,right_index=True,how="inner")

    return (
        joined_train.drop(columns="grade"),
        joined_train["grade"],
        joined_test.drop(columns="grade"),
        joined_test["grade"],
    )

In [119]:
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]

# Accumulate plain records and build the frame once; pd.concat inside the loop
# re-copies the whole table on every iteration and np.array coerces each row
# to object dtype.
records = []
for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue  # only cross-course evaluation
        x_train,y_train,x_test,y_test = make_data_set_D(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # Record the `dim` passed to the loader rather than a separate literal.
            # 30 is the recorded epoch value (presumably the fastText training epochs; confirm).
            records.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_D_df = pd.DataFrame(records,columns=columns)
# Best of the tuned and the default model for each row.
results_D_df["result"] = results_D_df[["f1_bestparam","f1_default"]].max(axis=1)

  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)


### Table 9

In [120]:
# All models trained on A-2021 and tested on A-2022 (E2VecD vectors).
results_D_df.loc[results_D_df["c_train"].eq("A-2021") & results_D_df["c_test"].eq("A-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.591549,0.591549,0.591549
33,A-2021,A-2022,100,30,svc,"{'C': 1, 'kernel': 'linear'}",0.716049,0.708861,0.716049
34,A-2021,A-2022,100,30,ada,"{'algorithm': 'SAMME.R', 'learning_rate': 5.0,...",0.410256,0.59375,0.59375
35,A-2021,A-2022,100,30,knn,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.655738,0.539683,0.655738


### Table 10

In [121]:
# All models trained on D-2021 and tested on D-2022 (E2VecD vectors).
results_D_df.loc[results_D_df["c_train"].eq("D-2021") & results_D_df["c_test"].eq("D-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
0,D-2021,D-2022,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.64,0.627451,0.64
1,D-2021,D-2022,100,30,svc,"{'C': 10, 'kernel': 'poly'}",0.586207,0.62069,0.62069
2,D-2021,D-2022,100,30,ada,"{'algorithm': 'SAMME', 'learning_rate': 1.0, '...",0.644068,0.607143,0.644068
3,D-2021,D-2022,100,30,knn,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",0.488889,0.444444,0.488889


### Table 11

In [122]:
# Random-forest rows for every course pair, ordered by train then test course.
results_D_df.loc[results_D_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.591549,0.591549,0.591549
20,A-2021,D-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.602941,0.602941,0.602941
8,A-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.547009,0.547009,0.547009
44,A-2022,A-2021,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.708861,0.682927,0.708861
16,A-2022,D-2021,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.836735,0.756757,0.836735
4,A-2022,D-2022,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.650602,0.613636,0.650602
40,D-2021,A-2021,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.468085,0.454545,0.468085
28,D-2021,A-2022,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.1875,0.242424,0.242424
0,D-2021,D-2022,100,30,rfc,"{'criterion': 'entropy', 'max_depth': 10, 'n_e...",0.64,0.627451,0.64
36,D-2022,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.545455,0.545455,0.545455


### operation_count (OC)

In [123]:
from sklearn.preprocessing import normalize

In [124]:
def make_oc(course_id):
    """Return L2-normalised per-student operation counts for one course.

    The course's event stream is converted to operation counts with OpenLA,
    summed per ``userid`` over all contents, restricted to seven operation
    columns, and each student's row is scaled to unit L2 norm.

    Parameters
    ----------
    course_id : str
        Course id passed to ``la.CourseInformation``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``userid`` with one column per operation. The columns keep
        their operation names instead of the anonymous 0..6 integer labels
        that wrapping the bare ``normalize`` output would produce.
    """
    Edudata = r'.\data\EduData_20221028'
    operations = ['NEXT', 'PREV','ADD MARKER','OPEN','CLOSE','PAGE_JUMP','GETIT']
    course_info = la.CourseInformation(files_dir=Edudata , course_id=course_id)
    event_stream = course_info.load_eventstream()
    operation_count = la.convert_into_operation_count(event_stream=event_stream,separate_marker_type=False)
    # Collapse the per-content counts into one row per student.
    feature_df = operation_count.df.drop(columns='contentsid').groupby('userid').sum()
    feature_df = feature_df[operations]
    # normalize() returns a bare array, so re-attach both the index and the column labels.
    norm_oc_df = pd.DataFrame(
        normalize(feature_df,norm="l2",axis=1),
        index=feature_df.index,
        columns=operations,
    )
    return norm_oc_df

In [125]:
def make_data_set_oc(dim,cl_train,cl_test):
    """Build train/test features and at-risk labels from operation counts.

    Features come from ``make_oc`` for each course and are inner-joined on
    ``userid`` with that course's grades, after letter grades are mapped to
    0/1 (C/D/F -> 1). ``dim`` is accepted but not used by this loader.

    Returns ``(x_train, y_train, x_test, y_test)``.
    """
    data_root = r'.\data\EduData_20221028'
    label_map = {'S':0, 'A':0, 'B':0, 'C':1, 'D':1, 'F':1}
    course_pair = (cl_train,cl_test)

    oc_frames = [make_oc(course_id) for course_id in course_pair]
    grade_frames = [
        la.CourseInformation(files_dir=data_root, course_id=course_id)
        .grade_point_df()
        .set_index("userid")
        .replace(label_map)
        for course_id in course_pair
    ]
    # Keep only students that have both operation counts and a grade.
    train_data,test_data = [
        pd.merge(oc,grades,left_index=True,right_index=True,how="inner")
        for oc,grades in zip(oc_frames,grade_frames)
    ]

    return (
        train_data.drop(columns="grade"),
        train_data["grade"],
        test_data.drop(columns="grade"),
        test_data["grade"],
    )

In [126]:
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]

# Accumulate plain records and build the frame once; pd.concat inside the loop
# re-copies the whole table on every iteration and np.array coerces each row
# to object dtype.
records = []
for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue  # only cross-course evaluation
        x_train,y_train,x_test,y_test = make_data_set_oc(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # NOTE(review): "dim" and "epoch" are kept only for schema parity with the
            # E2Vec result tables; make_data_set_oc ignores `dim`, and the operation-count
            # features built by make_oc have seven columns.
            records.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_oc_df = pd.DataFrame(records,columns=columns)
# Best of the tuned and the default model for each row.
results_oc_df["result"] = results_oc_df[["f1_bestparam","f1_default"]].max(axis=1)

  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)
  self._df = pd.read_csv(file_path)


### Table 9

In [127]:
# All models trained on A-2021 and tested on A-2022 (operation-count features).
results_oc_df.loc[results_oc_df["c_train"].eq("A-2021") & results_oc_df["c_test"].eq("A-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.590164,0.603175,0.603175
33,A-2021,A-2022,100,30,svc,"{'C': 0.001, 'kernel': 'linear'}",0.716049,0.716049,0.716049
34,A-2021,A-2022,100,30,ada,"{'algorithm': 'SAMME', 'learning_rate': 1.0, '...",0.571429,0.571429,0.571429
35,A-2021,A-2022,100,30,knn,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.628571,0.5625,0.628571


### Table 10

In [128]:
# All models trained on D-2021 and tested on D-2022 (operation-count features).
results_oc_df.loc[results_oc_df["c_train"].eq("D-2021") & results_oc_df["c_test"].eq("D-2022")]

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
0,D-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.634921,0.615385,0.634921
1,D-2021,D-2022,100,30,svc,"{'C': 100, 'kernel': 'poly'}",0.606061,0.478261,0.606061
2,D-2021,D-2022,100,30,ada,"{'algorithm': 'SAMME.R', 'learning_rate': 1.0,...",0.5625,0.557377,0.5625
3,D-2021,D-2022,100,30,knn,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",0.558824,0.545455,0.558824


### Table 11

In [129]:
# Random-forest rows for every course pair, ordered by train then test course.
results_oc_df.loc[results_oc_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])

Unnamed: 0,c_train,c_test,dim,epoch,model,best_param,f1_bestparam,f1_default,result
32,A-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.590164,0.603175,0.603175
20,A-2021,D-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.521127,0.521127,0.521127
8,A-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.495868,0.491803,0.495868
44,A-2022,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.626866,0.626866,0.626866
16,A-2022,D-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.684211,0.684211,0.684211
4,A-2022,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.363636,0.363636,0.363636
40,D-2021,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.744186,0.744186,0.744186
28,D-2021,A-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.734177,0.734177,0.734177
0,D-2021,D-2022,100,30,rfc,"{'criterion': 'gini', 'max_depth': 10, 'n_esti...",0.634921,0.615385,0.634921
36,D-2022,A-2021,100,30,rfc,"{'criterion': 'gini', 'max_depth': 20, 'n_esti...",0.744186,0.744186,0.744186
