In [124]:
import pandas as pd
import numpy as np

import sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import LinearSVC

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import plot_precision_recall_curve, precision_recall_curve
import matplotlib.pyplot as plt

In [138]:
all_data = pd.read_csv("../output_files/precalculus_pairs_data.csv")
pred_tfidf_val = pd.read_csv("data/predicted_prereq.csv")
rank_val = pd.read_csv("data/concept_rank.csv")

In [139]:
all_data["tfidf"] = pred_tfidf_val["tfidf_score"].to_numpy().ravel()

In [140]:
concept_rank = {}

for i in range(rank_val.shape[0]):
    concept = rank_val[["concept"]].iloc[i].values[0]
    rank = rank_val[["rank"]].iloc[i].values[0]
    concept_rank[concept] = rank

In [141]:
def rank_score(c1, c2):
    r1 = int(concept_rank[c1])
    r2 = int(concept_rank[c2])
    if r1 <= r2:
        return (r2-r1)
    else:
        return (r2-r1)

In [142]:
pred_rank = []
for i in range(all_data.shape[0]):
    topic_a = all_data[["topic_a"]].iloc[i].values[0]
    topic_b = all_data[["topic_b"]].iloc[i].values[0]
    rank = rank_score(topic_a, topic_b)
    pred_rank.append(rank)

In [143]:
all_data["rank"] = pred_rank

In [144]:
all_data

Unnamed: 0.1,Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f33,f34,f35,f36,f37,relation,topic_a,topic_b,tfidf,rank
0,0,257.0,349.0,797.0,583.0,1.0,3.0,11.0,0.0,0.0,...,831.0,50.0,-0.000040,0.000014,-7.542200e-07,1.0,Differential equation,Number,0.014629,-4
1,1,55.0,197.0,149.0,286.0,2.0,3.0,10.0,1.0,1.0,...,374.0,61.0,0.000009,-0.000007,-1.910268e-06,1.0,Asymptote,Line (geometry),0.000000,-16
2,2,141.0,197.0,621.0,286.0,5.0,3.0,18.0,1.0,1.0,...,374.0,79.0,0.000018,0.000021,6.730535e-06,1.0,Cartesian coordinate system,Line (geometry),0.000000,2
3,3,57.0,198.0,28.0,660.0,3.0,2.0,6.0,0.0,0.0,...,524.0,36.0,-0.000045,-0.000055,-6.749430e-06,1.0,Partial fraction decomposition,Arithmetic,0.000000,-38
4,4,370.0,102.0,614.0,295.0,3.0,3.0,44.0,2.0,14.0,...,269.0,119.0,0.000051,0.000017,2.138242e-06,0.0,Logarithm,Natural logarithm,0.497389,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055,2055,202.0,185.0,838.0,1088.0,2.0,5.0,17.0,0.0,2.0,...,319.0,55.0,-0.000038,-0.000011,-2.839315e-07,0.0,Polynomial,Natural number,0.000000,-18
2056,2056,442.0,20.0,15414.0,55.0,4.0,3.0,4.0,0.0,0.0,...,97.0,7.0,0.000066,0.001274,1.016967e-04,0.0,Mathematics,Polynomial long division,0.000000,11
2057,2057,146.0,64.0,399.0,86.0,4.0,2.0,8.0,0.0,0.0,...,141.0,23.0,0.000270,0.000015,3.585556e-06,0.0,Multiplication,Minor (linear algebra),0.000000,60
2058,2058,101.0,363.0,145.0,3440.0,2.0,1.0,4.0,0.0,0.0,...,665.0,3.0,-0.000051,-0.000227,-2.401727e-05,1.0,Trigonometric substitution,Geometry,0.000000,-32


# Data Normalization

In [145]:
df = all_data[["relation", "tfidf", "rank", "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9",
            "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19",
            "f20", "f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29",
            "f30", "f31", "f32", "f33", "f34", "f35", "f36", "f37"]]

In [147]:
df.corr()

Unnamed: 0,relation,tfidf,rank,f0,f1,f2,f3,f4,f5,f6,...,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37
relation,1.0,0.224802,-0.372015,-0.376054,0.459822,-0.253471,0.436415,-0.078998,0.030278,0.033224,...,0.294589,-0.170442,0.380447,-0.228504,-0.379698,0.520073,0.000621,-0.410497,-0.442773,-0.481355
tfidf,0.224802,1.0,-0.118445,-0.125724,0.090248,-0.097498,-0.065114,-0.007688,-0.036898,0.14588,...,0.346333,0.08875,0.408409,0.023619,-0.134226,0.165505,0.13352,-0.140717,-0.013708,-0.029715
rank,-0.372015,-0.118445,1.0,0.252735,-0.260312,0.213087,-0.220683,-0.04007,0.002563,-0.006008,...,-0.098299,0.097776,-0.084924,0.086631,0.194366,-0.232901,-0.020991,0.154064,0.281128,0.299132
f0,-0.376054,-0.125724,0.252735,1.0,-0.222288,0.484482,-0.160726,0.280878,-0.02263,0.187785,...,-0.041035,0.080554,-0.186646,0.104538,0.625644,-0.231169,0.212601,0.311457,0.404356,0.454839
f1,0.459822,0.090248,-0.260312,-0.222288,1.0,-0.160441,0.510082,-0.036986,0.282289,0.179637,...,0.091879,-0.041979,0.108143,-0.181678,-0.220095,0.636092,0.17677,-0.321986,-0.41911,-0.473869
f2,-0.253471,-0.097498,0.213087,0.484482,-0.160441,1.0,-0.119084,0.208484,-0.035689,0.002436,...,-0.102067,-0.036003,-0.129758,-0.022595,0.519662,-0.171736,-0.025022,0.219291,0.740313,0.743528
f3,0.436415,-0.065114,-0.220683,-0.160726,0.510082,-0.119084,1.0,-0.042769,0.21028,0.000667,...,-0.037642,-0.102632,-0.022478,-0.133234,-0.1704,0.568009,-0.018669,-0.23802,-0.744925,-0.747519
f4,-0.078998,-0.007688,-0.04007,0.280878,-0.036986,0.208484,-0.042769,1.0,-0.008181,0.016676,...,0.049238,-0.05803,-0.062049,-0.052224,0.161064,-0.043256,0.080278,0.14818,0.162489,0.171783
f5,0.030278,-0.036898,0.002563,-0.02263,0.282289,-0.035689,0.21028,-0.008181,1.0,0.025123,...,-0.057388,0.057199,-0.051613,-0.052526,-0.026455,0.199244,0.110592,-0.130407,-0.157747,-0.171417
f6,0.033224,0.14588,-0.006008,0.187785,0.179637,0.002436,0.000667,0.016676,0.025123,1.0,...,0.27086,0.283912,0.161699,0.177907,0.222202,0.230892,0.597099,-0.006697,0.003892,0.000706


In [164]:
columns = ["f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9",
            "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19",
            "f20", "f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29",
            "f30", "f31", "f32", "f33", "f34", "f35", "f36", "f37", "rank"]

def normalize_array(x):
    x = np.array(x)
    min_x = np.min(x)
    max_x = np.max(x)
    x = (x - min_x)/(max_x - min_x)
    return x


def normalized_df(df):
    X = [normalize_array([df[[col]].values[i][0] for i in range(len(df[[col]]))]) for col in columns]
    X = np.array(X)
    return X.T

In [165]:
X = all_data[["f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9",
            "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19",
            "f20", "f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29",
            "f30", "f31", "f32", "f33", "f34", "f35", "f36", "f37", "rank"]]
X = normalized_df(X)

In [166]:
y = all_data[["relation"]].to_numpy().ravel()

# Data Modeling along with our features

In [167]:
def print_results(result):
    print("Precision: %.1f" % result[0])
    print("Recall: %.1f" % result[1])
    print("F1 Score: %.1f" % result[2])
    print("Area Under Curve: %.1f" % result[3])

def evaluation_results(y_test, y_predict):
    recall = recall_score(y_test, y_predict)*100
    precision = precision_score(y_test, y_predict)*100
    f1 = f1_score(y_test, y_predict)*100
    auc = roc_auc_score(y_test, y_predict)*100
    return [precision, recall, f1, auc]

rand_params = {
    "n_estimators" : 300,
    "max_features" : 'auto',
#     "max_depth" : 300,
#     "min_samples_split" : 2,
#     "min_samples_leaf" : 1
}


def random_forest(x_train, x_test, y_train, y_test):
    rand_forest = RandomForestClassifier(**rand_params)
    rand_forest.fit(x_train, y_train)
    y_predict = rand_forest.predict(x_test)
    return evaluation_results(y_test, y_predict)

def k_fold_training(X, Y):
    results = []
    kf = KFold(n_splits = 5)
    kf.get_n_splits(X)
    for train_index, test_index in kf.split(X):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        result = random_forest(x_train, x_test, y_train, y_test)
        results.append(result)
    results = np.array(results)
    final_result = np.mean(results, axis = 0)
    return final_result

In [169]:
result = k_fold_training(X, y)
print_results(result)

Precision: 89.8
Recall: 88.6
F1 Score: 89.1
Area Under Curve: 91.6


In [163]:
result = k_fold_training(X, y)
print_results(result)

Precision: 89.5
Recall: 88.3
F1 Score: 88.8
Area Under Curve: 91.4


In [156]:
result = k_fold_training(X, y)
print_results(result)

Precision: 89.0
Recall: 88.7
F1 Score: 88.8
Area Under Curve: 91.5


In [18]:
result = k_fold_training(X, y)
print_results(result)

Precision: 89.8
Recall: 88.2
F1 Score: 89.0
Area Under Curve: 91.5


In [16]:
result = k_fold_training(X, y)
print_results(result)

Precision: 94.4
Recall: 88.2
F1 Score: 91.2
Area Under Curve: 92.9
