In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC, AdaBoostClassifier as ABC
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.linear_model import Perceptron 
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import optuna
from sklearn.metrics import confusion_matrix
import OpenLA as la
from sklearn.metrics import f1_score

In [None]:
def make_data_set(dim,cl_train,cl_test,suffix=""):
    """Build train/test feature frames and binary grade labels for two courses.

    For each course the student-vector CSV is read and indexed by ``userid``,
    the course grades are fetched through ``la.CourseInformation``, letter
    grades are recoded (S/A/B -> 0, C/D/F -> 1) and both tables are
    inner-joined on ``userid``.

    Parameters
    ----------
    dim : int or str
        Dimensionality tag that is formatted into the vector file name.
    cl_train, cl_test : str
        Course ids used for the training and the test split respectively.
    suffix : str, default ""
        Extra text placed right before ``.csv`` in the vector file name
        (for example ``"_A20"``). The default reproduces the original name.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` frames hold
        every merged column except ``grade`` and the ``y_*`` series hold the
        ``grade`` column.
    """
    # NOTE: both paths are relative to the working directory and use
    # Windows-style separators.
    Edudata = r'.\data\EduData_20221028'
    vec_path_template = r'.\data\vectors\norm_Student_Vctors_course{}_{}dim{}.csv'
    grade_dict = {'S':0, 'A':0, 'B':0, 'C':1, 'D':1, 'F':1}

    def _load_course(course_id):
        # Features: one row of vector components per student.
        vectors = pd.read_csv(vec_path_template.format(course_id, dim, suffix)).set_index('userid')
        grades = la.CourseInformation(files_dir=Edudata, course_id=course_id).grade_point_df().set_index("userid")
        # Recent pandas releases deprecate the silent object -> numeric
        # downcast inside replace(); infer_objects() keeps the recoded labels
        # numeric regardless of the pandas version.
        grades = grades.replace(grade_dict).infer_objects()
        merged = pd.merge(vectors, grades, left_index=True, right_index=True, how="inner")
        return merged.drop(columns="grade"), merged["grade"]

    x_train, y_train = _load_course(cl_train)
    x_test, y_test = _load_course(cl_test)
    return x_train,y_train,x_test,y_test

In [None]:
def at_risk_prediction(train_data, train_label, test_data, test_label, model_k="rfc"):
    """Grid-search one classifier and compare it with its default configuration.

    Parameters
    ----------
    train_data, train_label
        Features and labels used to fit both the grid search and the
        untuned model.
    test_data, test_label
        Features and labels used to score both models.
    model_k : {"rfc", "svc", "ada", "knn", "mlp"}, default "rfc"
        Key selecting the estimator and its parameter grid.

    Returns
    -------
    tuple
        ``(best_params, f1_best, f1_nt)``: the parameters chosen by the grid
        search, the test F1 of the tuned model and the test F1 of the model
        built with its default hyper-parameters.

    Raises
    ------
    ValueError
        If ``model_k`` is not one of the supported keys.
    """
    # Each entry pairs a zero-argument factory with its search grid. The
    # factories are lambdas so that the tuned and the untuned estimator are
    # two independent, freshly constructed instances.
    model_specs = {
        "rfc": (
            lambda: RFC(random_state=42),
            {
                'n_estimators':[10,20,30,50,100],
                'criterion': ['gini','entropy','log_loss'],
                'max_depth' : [10,20,None]
            },
        ),
        "svc": (
            lambda: SVC(random_state=42),
            {
                "C": [10** i for i in range(-3,3)],
                "kernel": ["linear","poly","rbf","sigmoid"]
            },
        ),
        "ada": (
            lambda: ABC(random_state=42),
            {
                'n_estimators':[1,10,20,30,50,100],
                'learning_rate':[0.1,0.5,1.0,2.0,5.0],
                'algorithm': ['SAMME','SAMME.R']
            },
        ),
        "knn": (
            lambda: KNN(),
            {
                'n_neighbors' : [3,5,7,10],
                'weights': ["uniform","distance"],
                'algorithm':['auto','ball_tree','kd_tree','brute']
            },
        ),
        "mlp": (
            lambda: MLP(random_state=42,early_stopping=True),
            {
                'hidden_layer_sizes': [10,50,100,200,500,1000],
                'activation': ['identity','logistic','tanh','relu'],
                'solver': ['lbfgs','sgd','adam'],
                'alpha': [10** -3, 10** -4, 10** -5],
                # The original list ended with a second 100, which only
                # re-fitted identical candidates.
                # TODO(review): confirm whether 1000 was intended instead.
                'max_iter': [50,100,200,300,500]
            },
        ),
    }
    if model_k not in model_specs:
        # Previously an unknown key surfaced later as an UnboundLocalError.
        raise ValueError(
            "Unsupported model_k {!r}; expected one of {}".format(model_k, sorted(model_specs))
        )
    make_model, model_params = model_specs[model_k]
    model = make_model()
    model_nt = make_model()

    # Pick the best configuration by grid search; every candidate is
    # evaluated with 3-fold cross-validation on the training data.
    gscv = GridSearchCV(model, model_params,cv=3,scoring='f1',verbose=3)
    gscv.fit(train_data,train_label)
    # Estimator refitted with the best parameters found.
    best = gscv.best_estimator_

    # Score the tuned model on the test course. f1_score expects
    # (y_true, y_pred); the binary F1 value itself does not depend on the
    # order, so results are unchanged.
    pred = best.predict(test_data)
    f1_best = f1_score(test_label, pred)

    # Baseline: same estimator family with its default hyper-parameters.
    model_nt.fit(train_data,train_label)
    pred_nt = model_nt.predict(test_data)
    f1_nt = f1_score(test_label, pred_nt)
    return gscv.best_params_, f1_best, f1_nt

### E2Vec k100

In [None]:
# Course ids that are paired up as train/test courses in the loops below.
cl_courses = ["D-2022","D-2021","A-2022","A-2021"]
# Vector dimensionality for this section (E2Vec k100).
dim=100

In [None]:
# Cross-course evaluation: every ordered (train, test) pair of distinct
# courses is scored with each classifier. Rows are collected in a list and the
# frame is built once, instead of concatenating onto an empty DataFrame on
# every iteration.
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]
result_rows = []

for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue
        x_train,y_train,x_test,y_test = make_data_set(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # Record the dim that was actually used to load the vectors.
            # 30 is stored as the "epoch" column (presumably the embedding
            # training epochs - TODO confirm).
            result_rows.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_df = pd.DataFrame(result_rows, columns=columns)



In [None]:
# Display the full results table built by the loop above.
results_df

### Table 10 

In [None]:
# Table 10: rows trained on A-2021 and tested on A-2022.
results_df.loc[results_df["c_train"].eq("A-2021") & results_df["c_test"].eq("A-2022")]

### Table 11

In [None]:
# Table 11: rows trained on D-2021 and tested on D-2022.
results_df.loc[results_df["c_train"].eq("D-2021") & results_df["c_test"].eq("D-2022")]

### Table 12

In [None]:
# Table 12: random-forest rows only, ordered by train course then test course.
results_df.loc[results_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])

### E2Vec k10

In [None]:
# Course ids that are paired up as train/test courses in the loop below.
cl_courses = ["D-2022","D-2021","A-2022","A-2021"]
# Vector dimensionality for this section (E2Vec k10).
dim=10

In [None]:
# Same cross-course evaluation as above, using the `dim` set for this section.
# Rows are collected in a list and the frame is built once, instead of
# concatenating onto an empty DataFrame on every iteration.
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]
result_rows_k10 = []

for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue
        x_train,y_train,x_test,y_test = make_data_set(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # Record the dim that was actually used to load the vectors.
            # 30 is stored as the "epoch" column (presumably the embedding
            # training epochs - TODO confirm).
            result_rows_k10.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_k10_df = pd.DataFrame(result_rows_k10, columns=columns)

In [None]:
# Display the full k10 results table built by the loop above.
results_k10_df

### Table 10

In [None]:
# Table 10 (k10): rows trained on A-2021 and tested on A-2022.
results_k10_df.loc[results_k10_df["c_train"].eq("A-2021") & results_k10_df["c_test"].eq("A-2022")]

### Table 11

In [None]:
# Table 11 (k10): rows trained on D-2021 and tested on D-2022.
results_k10_df.loc[results_k10_df["c_train"].eq("D-2021") & results_k10_df["c_test"].eq("D-2022")]

### Table 12

In [None]:
# Table 12 (k10): random-forest rows only, ordered by train course then test course.
results_k10_df.loc[results_k10_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])

### E2VecA

In [None]:
# Vector dimensionality read by the E2VecA and E2VecD loops below.
dim =100

In [None]:
def make_data_set_A(dim,cl_train,cl_test):
    """Build train/test features and binary grade labels from the ``_A20`` vectors.

    For each course the ``_A20`` student-vector CSV is read and indexed by
    ``userid``, the course grades are fetched through
    ``la.CourseInformation``, letter grades are recoded (S/A/B -> 0,
    C/D/F -> 1) and both tables are inner-joined on ``userid``.

    Parameters
    ----------
    dim : int or str
        Dimensionality tag that is formatted into the vector file name.
    cl_train, cl_test : str
        Course ids used for the training and the test split respectively.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` frames hold
        every merged column except ``grade`` and the ``y_*`` series hold the
        ``grade`` column.
    """
    # NOTE: both paths are relative to the working directory and use
    # Windows-style separators.
    Edudata = r'.\data\EduData_20221028'
    vec_path_template = r'.\data\vectors\norm_Student_Vctors_course{}_{}dim_A20.csv'
    grade_dict = {'S':0, 'A':0, 'B':0, 'C':1, 'D':1, 'F':1}

    def _load_course(course_id):
        # Features: one row of vector components per student.
        vectors = pd.read_csv(vec_path_template.format(course_id, dim)).set_index('userid')
        grades = la.CourseInformation(files_dir=Edudata, course_id=course_id).grade_point_df().set_index("userid")
        # Recent pandas releases deprecate the silent object -> numeric
        # downcast inside replace(); infer_objects() keeps the recoded labels
        # numeric regardless of the pandas version.
        grades = grades.replace(grade_dict).infer_objects()
        merged = pd.merge(vectors, grades, left_index=True, right_index=True, how="inner")
        return merged.drop(columns="grade"), merged["grade"]

    x_train, y_train = _load_course(cl_train)
    x_test, y_test = _load_course(cl_test)
    return x_train,y_train,x_test,y_test

In [None]:
# Cross-course evaluation on the _A20 vectors: every ordered (train, test)
# pair of distinct courses is scored with each classifier. Rows are collected
# in a list and the frame is built once, instead of concatenating onto an
# empty DataFrame on every iteration.
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]
result_rows_A = []

for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue
        x_train,y_train,x_test,y_test = make_data_set_A(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # Record the dim that was actually used to load the vectors.
            # 30 is stored as the "epoch" column (presumably the embedding
            # training epochs - TODO confirm).
            result_rows_A.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_A_df = pd.DataFrame(result_rows_A, columns=columns)

In [None]:
# E2VecA rows trained on A-2021 and tested on A-2022.
results_A_df.loc[results_A_df["c_train"].eq("A-2021") & results_A_df["c_test"].eq("A-2022")]

In [None]:
# E2VecA rows trained on D-2021 and tested on D-2022.
results_A_df.loc[results_A_df["c_train"].eq("D-2021") & results_A_df["c_test"].eq("D-2022")]

In [None]:
# E2VecA random-forest rows only, ordered by train course then test course.
results_A_df.loc[results_A_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])

### E2VecD

In [None]:
def make_data_set_D(dim,cl_train,cl_test):
    """Build train/test features and binary grade labels from the ``_D20`` vectors.

    For each course the ``_D20`` student-vector CSV is read and indexed by
    ``userid``, the course grades are fetched through
    ``la.CourseInformation``, letter grades are recoded (S/A/B -> 0,
    C/D/F -> 1) and both tables are inner-joined on ``userid``.

    Parameters
    ----------
    dim : int or str
        Dimensionality tag that is formatted into the vector file name.
    cl_train, cl_test : str
        Course ids used for the training and the test split respectively.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` frames hold
        every merged column except ``grade`` and the ``y_*`` series hold the
        ``grade`` column.
    """
    # NOTE: both paths are relative to the working directory and use
    # Windows-style separators.
    Edudata = r'.\data\EduData_20221028'
    vec_path_template = r'.\data\vectors\norm_Student_Vctors_course{}_{}dim_D20.csv'
    grade_dict = {'S':0, 'A':0, 'B':0, 'C':1, 'D':1, 'F':1}

    def _load_course(course_id):
        # Features: one row of vector components per student.
        vectors = pd.read_csv(vec_path_template.format(course_id, dim)).set_index('userid')
        grades = la.CourseInformation(files_dir=Edudata, course_id=course_id).grade_point_df().set_index("userid")
        # Recent pandas releases deprecate the silent object -> numeric
        # downcast inside replace(); infer_objects() keeps the recoded labels
        # numeric regardless of the pandas version.
        grades = grades.replace(grade_dict).infer_objects()
        merged = pd.merge(vectors, grades, left_index=True, right_index=True, how="inner")
        return merged.drop(columns="grade"), merged["grade"]

    x_train, y_train = _load_course(cl_train)
    x_test, y_test = _load_course(cl_test)
    return x_train,y_train,x_test,y_test

In [None]:
# Cross-course evaluation on the _D20 vectors: every ordered (train, test)
# pair of distinct courses is scored with each classifier. Rows are collected
# in a list and the frame is built once, instead of concatenating onto an
# empty DataFrame on every iteration.
columns = ["c_train","c_test","dim","epoch","model","best_param","f1_bestparam","f1_default"]
result_rows_D = []

for cl_test in cl_courses:
    for cl_train in cl_courses:
        if cl_test == cl_train:
            continue
        x_train,y_train,x_test,y_test = make_data_set_D(dim,cl_train,cl_test)
        for model_select in ["rfc","svc","ada",'knn']:
            best_param, best_param_f1, nt_f1 = at_risk_prediction(x_train,y_train,x_test,y_test,model_k=model_select)
            # Record the dim that was actually used to load the vectors.
            # 30 is stored as the "epoch" column (presumably the embedding
            # training epochs - TODO confirm).
            result_rows_D.append([cl_train,cl_test,dim,30,model_select,best_param,best_param_f1,nt_f1])

results_D_df = pd.DataFrame(result_rows_D, columns=columns)

In [None]:
# E2VecD rows trained on A-2021 and tested on A-2022.
results_D_df.loc[results_D_df["c_train"].eq("A-2021") & results_D_df["c_test"].eq("A-2022")]

In [None]:
# E2VecD rows trained on D-2021 and tested on D-2022.
results_D_df.loc[results_D_df["c_train"].eq("D-2021") & results_D_df["c_test"].eq("D-2022")]

In [None]:
# E2VecD random-forest rows only, ordered by train course then test course.
results_D_df.loc[results_D_df["model"].eq("rfc")].sort_values(by=["c_train","c_test"])