In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import scale

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

## Useful Functions

In [15]:
def check_state(val, theta):
    """Binarize a score against a threshold.

    Returns 1 when ``val`` is greater than or equal to ``theta``,
    otherwise 0.
    """
    return 1 if val >= theta else 0


def compare_result(result, truth):
    """Classify one prediction against its ground-truth label.

    Returns an integer outcome code:
        1 -> true positive   (truth == 1, result == 1)
        2 -> false negative  (truth == 1, result == 0)
        3 -> false positive  (truth == 0, result == 1)
        4 -> true negative   (truth == 0, result == 0)
    Any other combination of values yields None.
    """
    # The ground truth decides which pair of codes is possible.
    if truth == 1:
        code_if_predicted_one, code_if_predicted_zero = 1, 2
    elif truth == 0:
        code_if_predicted_one, code_if_predicted_zero = 3, 4
    else:
        return None

    # The prediction then selects within that pair.
    if result == 1:
        return code_if_predicted_one
    if result == 0:
        return code_if_predicted_zero
    return None


def estimation_measures(TP, FN, FP, TN):
    """Compute the F1 score from confusion-matrix counts.

    Parameters
    ----------
    TP, FN, FP : counts of true positives, false negatives, false positives.
    TN : count of true negatives; accepted but not used in the computation.

    Returns
    -------
    The F1 score ``2*P*R/(P + R)``. Precision and recall each fall back to
    0 when their defining counts are both zero, and the F1 score is 0 when
    precision and recall are both 0.
    """
    # Precision is undefined when nothing was predicted positive.
    any_predicted_positive = TP != 0 or FP != 0
    precision = TP/(TP + FP) if any_predicted_positive else 0

    # Recall is undefined when there are no actual positives.
    any_actual_positive = TP != 0 or FN != 0
    recall = TP/(TP + FN) if any_actual_positive else 0

    if precision == 0 and recall == 0:
        return 0
    return 2*precision*recall/(precision + recall)



def check_estimation(y_estimate, y_truth, theta):
    """Score thresholded estimates against ground truth and return the F1.

    Each element of ``y_estimate`` is binarized with ``check_state`` at
    threshold ``theta``, compared to the element of ``y_truth`` at the same
    position with ``compare_result``, and the outcome counts are passed to
    ``estimation_measures``.
    """
    # Keyed by the outcome codes of compare_result: 1=TP, 2=FN, 3=FP, 4=TN.
    tally = {1: 0, 2: 0, 3: 0, 4: 0}
    for idx in range(len(y_estimate)):
        score, truth = y_estimate[idx], y_truth[idx]
        outcome = compare_result(check_state(score, theta), truth)
        # compare_result returns None for unexpected values; those are skipped.
        if outcome in tally:
            tally[outcome] += 1
    return estimation_measures(tally[1], tally[2], tally[3], tally[4])


In [16]:
def get_optimal_theta(est_val, ground_val, theta_values=None, verbose=True):
    """Grid-search the decision threshold that maximizes the F1 score.

    Parameters
    ----------
    est_val : indexable sequence of scores.
    ground_val : indexable sequence of 0/1 ground-truth labels, aligned
        with ``est_val`` by position.
    theta_values : optional iterable of candidate thresholds. When omitted,
        the grid 0.00, 0.02, ..., 0.98 is used. That default never tries a
        negative threshold, so pass an explicit grid for scores that can be
        negative.
    verbose : when True (the default), print the selected threshold.

    Returns
    -------
    The candidate threshold with the highest F1 score as computed by
    ``check_estimation``; on ties the earliest candidate wins.

    Raises
    ------
    ValueError
        If an explicitly supplied ``theta_values`` is empty.
    """
    if theta_values is None:
        # Round each step to two decimals to avoid values like 0.060000000000000005.
        theta_values = [float("{0:.2f}".format((0.0 + 0.02*i))) for i in range(50)]
    else:
        theta_values = list(theta_values)
        if not theta_values:
            raise ValueError("theta_values must contain at least one threshold")

    f1_by_theta = [check_estimation(est_val, ground_val, theta)
                   for theta in theta_values]

    # max() keeps the first maximal element, so ties resolve to the
    # smallest index, as a strict '<' scan would.
    best_index = max(range(len(theta_values)), key=f1_by_theta.__getitem__)
    best_theta = theta_values[best_index]

    if verbose:
        print(best_theta)
    return best_theta


def get_predicted_value(X, theta):
    """Turn scores into 0/1 predictions.

    Returns a list with one entry per element of ``X``: 0 where the score
    is strictly below ``theta``, 1 otherwise.
    """
    return [0 if X[pos] < theta else 1 for pos in range(len(X))]

In [17]:
def print_results(result):
    """Print four evaluation metrics, one per line, to one decimal place.

    ``result`` is read positionally as [precision, recall, f1, auc], which
    is the order returned by ``evaluation_results``.
    """
    line_templates = (
        "Precision: %.1f",
        "Recall: %.1f",
        "F1 Score: %.1f",
        "Area Under Curve: %.1f",
    )
    for position, template in enumerate(line_templates):
        print(template % result[position])
    

def evaluation_results(y_test, y_predict, y_score=None):
    """Compute precision, recall, F1 and ROC AUC, each as a percentage.

    Parameters
    ----------
    y_test : ground-truth binary labels.
    y_predict : predicted binary labels (already thresholded).
    y_score : optional continuous scores for the same samples. When given,
        ROC AUC is computed from these scores. When omitted, ROC AUC is
        computed from ``y_predict``; with hard 0/1 labels the ROC curve has
        a single operating point, so that value equals the balanced
        accuracy rather than a ranking-based AUC.

    Returns
    -------
    list
        ``[precision, recall, f1, auc]``, each multiplied by 100.
    """
    recall = recall_score(y_test, y_predict)*100
    precision = precision_score(y_test, y_predict)*100
    f1 = f1_score(y_test, y_predict)*100
    # Use the continuous scores for AUC when the caller supplies them.
    auc_input = y_predict if y_score is None else y_score
    auc = roc_auc_score(y_test, auc_input)*100
    return [precision, recall, f1, auc]


def k_fold_testing(X, Y, n_splits=5, shuffle=False, random_state=None):
    """Cross-validate a single-feature threshold classifier.

    For every fold, the threshold is chosen on the training part with
    ``get_optimal_theta``, applied to the test part with
    ``get_predicted_value``, and scored with ``evaluation_results``.

    Parameters
    ----------
    X : 1-D array-like of scores.
    Y : 1-D array-like of binary labels, aligned with ``X`` by position.
    n_splits : number of folds (default 5).
    shuffle : whether to shuffle samples before splitting (default False,
        i.e. contiguous folds in the given order).
    random_state : seed forwarded to ``KFold``; only meaningful together
        with ``shuffle=True``.

    Returns
    -------
    numpy.ndarray
        Mean over folds of ``[precision, recall, f1, auc]``.
    """
    # Positional fold indices require array-style indexing; asarray lets
    # lists and pandas Series work too and leaves ndarrays unchanged.
    X = np.asarray(X)
    Y = np.asarray(Y)

    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    fold_results = []
    for train_index, test_index in kf.split(X):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        # The threshold is tuned on the training fold only.
        opt_theta = get_optimal_theta(x_train, y_train)
        y_predict = get_predicted_value(x_test, opt_theta)
        fold_results.append(evaluation_results(y_test, y_predict))
    return np.mean(np.array(fold_results), axis=0)

In [18]:
# Load both score tables and keep only the topic pairs (with matching
# relation label) that appear in both of them.
df_propose = pd.read_csv("predicted_prereq.csv")
df_refd = pd.read_csv("refd_estimated_results.csv")
df_final = df_propose.merge(df_refd, on=['topic_a', 'topic_b', 'relation'], how='inner')
df_final

Unnamed: 0,Unnamed: 0_x,topic_a,topic_b,relation,tfidf_score,wiki_tfidf_score,Unnamed: 0_y,Unnamed: 0.1,refd_equal,refd_tfidf
0,0,Differential equation,Number,1,0.006426,0.006426,0,0,0.054299,0.082802
1,1,Partial fraction decomposition,Arithmetic,1,0.047547,0.000000,3,3,0.076923,-0.001364
2,2,Equation,Quadratic equation,0,0.186873,0.186873,8,8,-0.030000,-0.120911
3,3,Mathematics,Linear inequality,0,0.000000,0.000000,9,9,-0.666667,-0.032310
4,4,Binomial theorem,Addition,1,0.035118,0.035118,10,10,0.171053,0.064591
...,...,...,...,...,...,...,...,...,...,...
913,913,Limit (mathematics),Mathematics,1,1.000000,1.000000,2048,2048,0.627841,0.909152
914,914,Sine,Inverse trigonometric functions,0,0.000000,0.000000,2050,2050,-0.218045,-0.638182
915,915,Quadratic equation,Rational number,0,0.022569,0.022569,2051,2051,0.308333,0.393114
916,916,Summation,Arithmetic,1,0.047547,0.047547,2053,2053,0.054377,-0.217233


In [19]:
# Persist the proposed-method scores for the merged pairs.
# index=False keeps the row index out of the file, so re-reading it does
# not produce a stray "Unnamed: 0" column.
df_propose = df_final[["topic_a", "topic_b", "relation", "tfidf_score", "wiki_tfidf_score"]]
df_propose.to_csv("final_propose.csv", index=False)

In [20]:
# Persist the RefD scores for the merged pairs.
# index=False keeps the row index out of the file, so re-reading it does
# not produce a stray "Unnamed: 0" column.
df_refd = df_final[["topic_a", "topic_b", "relation", "refd_equal", "refd_tfidf"]]
df_refd.to_csv("final_refd.csv", index=False)

## Results - Proposed Method

In [21]:
# Reload the exported proposed-method table and pull each score column and
# the label column out as 1-D NumPy arrays.
df_propose = pd.read_csv("final_propose.csv")
X_propose1 = df_propose["tfidf_score"].to_numpy()
X_propose2 = df_propose["wiki_tfidf_score"].to_numpy()
y_propose = df_propose["relation"].to_numpy()

In [22]:
# Proposed method, book TF-IDF score (`tfidf_score` column):
# k-fold evaluation; the chosen threshold of each fold is printed first.
result = k_fold_testing(X_propose1, y_propose)
print_results(result)

0.04
0.04
0.04
0.04
0.04
Precision: 68.6
Recall: 54.4
F1 Score: 60.4
Area Under Curve: 69.9


In [23]:
# Proposed method, book + Wikipedia TF-IDF score (`wiki_tfidf_score` column):
# k-fold evaluation; the chosen threshold of each fold is printed first.
result = k_fold_testing(X_propose2, y_propose)
print_results(result)

0.02
0.0
0.0
0.0
0.0
Precision: 41.1
Recall: 88.3
F1 Score: 52.1
Area Under Curve: 52.4


## Results - RefD Method

In [24]:
# Reload the exported RefD table. NOTE: this rebinds df_propose, X_propose1,
# X_propose2 and y_propose, which previously held the proposed-method data;
# the cells below therefore evaluate RefD scores.
df_propose = pd.read_csv("final_refd.csv")
X_propose1 = df_propose["refd_equal"].to_numpy()
X_propose2 = df_propose["refd_tfidf"].to_numpy()
y_propose = df_propose["relation"].to_numpy()

In [25]:
# RefD with the Equal method (`refd_equal` column):
# k-fold evaluation; the chosen threshold of each fold is printed first.
result = k_fold_testing(X_propose1, y_propose)
print_results(result)

0.04
0.02
0.02
0.0
0.0
Precision: 62.1
Recall: 82.4
F1 Score: 70.7
Area Under Curve: 76.6


In [26]:
# RefD with the TF-IDF method (`refd_tfidf` column):
# k-fold evaluation; the chosen threshold of each fold is printed first.
result = k_fold_testing(X_propose2, y_propose)
print_results(result)

0.02
0.0
0.0
0.0
0.02
Precision: 54.0
Recall: 73.4
F1 Score: 61.8
Area Under Curve: 68.2


## Results - RefD Given Values

In [5]:
# Load the provided RefD values. Only the `refd_val` score column is
# evaluated here, against the `output` label column. NOTE: this rebinds
# df_propose, X_propose1 and y_propose once more.
df_propose = pd.read_csv("refd_given_vals.csv")
X_propose1 = df_propose["refd_val"].to_numpy()
y_propose = df_propose["output"].to_numpy()

In [6]:
# RefD using the provided `refd_val` scores from refd_given_vals.csv.
# NOTE(review): the earlier comment here read "RefD with Equal Method";
# confirm which RefD variant `refd_val` actually holds.
result = k_fold_testing(X_propose1, y_propose)
print_results(result)

Precision: 61.5
Recall: 82.9
F1 Score: 70.4
Area Under Curve: 78.0
