In [2]:
import pandas as pd
import numpy as np

import sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import LinearSVC

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import plot_precision_recall_curve, precision_recall_curve
import matplotlib.pyplot as plt

In [18]:
# Inputs, in order: the pairwise feature table, the per-pair tf-idf predictions
# (provides "tfidf_score"), and the per-concept rank table ("concept", "rank").
all_data, pred_tfidf_val, rank_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../output_files/geometry_pairs_data.csv",
        "data/predicted_prereq.csv",
        "data/concept_rank.csv",
    )
)

In [19]:
# Attach the tf-idf score as a plain 1-D array, i.e. by row position rather than
# by index label. NOTE(review): assumes both CSVs list the pairs in the same
# order — confirm.
all_data["tfidf"] = np.ravel(pred_tfidf_val["tfidf_score"].to_numpy())

In [20]:
# Map each concept name to its rank in one pass over the two columns.
# The previous loop built two one-column DataFrames per row and left `i`,
# `concept` and `rank` behind as notebook globals. If a concept appears more
# than once, the last row wins (same as before).
concept_rank = dict(
    zip(rank_val["concept"].to_numpy(), rank_val["rank"].to_numpy())
)

In [21]:
def rank_score(c1, c2):
    """Return the integer rank of ``c1`` minus the integer rank of ``c2``.

    Both ranks are looked up in the notebook-level ``concept_rank`` mapping;
    a concept missing from that mapping raises ``KeyError``.
    """
    return int(concept_rank[c1]) - int(concept_rank[c2])

In [22]:
# Rank difference (topic_a minus topic_b) for every pair, in row order.
# Iterating the two columns together avoids creating a fresh one-column
# DataFrame per row and no longer leaks loop variables (`i`, `rank`, ...)
# into the notebook namespace.
pred_rank = [
    rank_score(first_topic, second_topic)
    for first_topic, second_topic in zip(all_data["topic_a"], all_data["topic_b"])
]

In [23]:
# Store the per-pair rank differences computed above as a new "rank" column
# (assigned positionally from the list, in row order).
all_data["rank"] = pred_rank

In [24]:
# Display the combined frame, now including the "tfidf" and "rank" columns.
all_data

Unnamed: 0.1,Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f33,f34,f35,f36,f37,relation,topic_a,topic_b,tfidf,rank
0,0,53.0,86.0,247.0,176.0,2.0,3.0,4.0,0.0,2.0,...,161.0,7.0,-5.425241e-05,1.966443e-05,4.727588e-07,0.0,Parallelogram,Rotational symmetry,0.000000,-31
1,1,184.0,61.0,642.0,147.0,1.0,3.0,11.0,0.0,0.0,...,186.0,25.0,3.362640e-04,2.500931e-05,8.087279e-06,0.0,Angle,Isosceles triangle,0.000000,-5
2,2,184.0,262.0,642.0,331.0,1.0,9.0,35.0,1.0,0.0,...,713.0,75.0,2.450131e-04,1.771560e-05,4.914820e-06,0.0,Angle,Pythagorean theorem,0.000000,-31
3,3,42.0,40.0,333.0,10.0,3.0,2.0,6.0,0.0,0.0,...,141.0,7.0,7.609828e-06,5.902672e-06,1.827928e-06,0.0,Edge (geometry),Acute and obtuse triangles,0.056831,39
4,4,141.0,197.0,621.0,286.0,5.0,3.0,18.0,1.0,1.0,...,374.0,79.0,1.794967e-05,2.058517e-05,6.730535e-06,1.0,Cartesian coordinate system,Line (geometry),0.131487,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1676,1676,238.0,42.0,766.0,333.0,2.0,3.0,13.0,0.0,0.0,...,67.0,12.0,2.200583e-05,5.148986e-05,3.152873e-06,1.0,Polygon,Edge (geometry),0.000000,-38
1677,1677,61.0,120.0,347.0,596.0,2.0,2.0,5.0,0.0,0.0,...,149.0,11.0,-4.459765e-07,3.928922e-06,1.387987e-06,0.0,Parallel (geometry),Prism (geometry),0.000000,-33
1678,1678,76.0,184.0,126.0,642.0,1.0,1.0,21.0,0.0,2.0,...,341.0,34.0,-3.961945e-04,-2.415513e-05,-8.008553e-06,1.0,Bisection,Angle,0.241991,18
1679,1679,85.0,363.0,149.0,3440.0,2.0,1.0,16.0,0.0,1.0,...,665.0,49.0,-5.148803e-05,-2.249849e-04,-2.364102e-05,1.0,Similarity (geometry),Geometry,1.000000,26


# Data Normalization

In [25]:
# Label first, then the two derived scores, then the 38 base features f0..f37.
df = all_data[["relation", "tfidf", "rank"] + ["f%d" % idx for idx in range(38)]]

In [26]:
# Pairwise correlation between the label ("relation") and every feature column.
df.corr()

Unnamed: 0,relation,tfidf,rank,f0,f1,f2,f3,f4,f5,f6,...,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37
relation,1.0,0.4046,0.30395,-0.212151,0.344983,-0.191297,0.331014,0.119956,-0.240325,0.046224,...,0.31522,-0.183382,0.322315,-0.187073,-0.134902,0.20461,-0.019251,-0.187546,-0.144948,-0.427982
tfidf,0.4046,1.0,0.320661,-0.130417,0.582568,-0.103451,0.810579,0.072794,-0.271924,0.066017,...,0.165119,-0.085094,0.197183,-0.093609,-0.087462,0.525024,0.076497,-0.128115,-0.384373,-0.624753
rank,0.30395,0.320661,1.0,-0.273984,0.267514,-0.167276,0.172825,0.231705,-0.270942,-0.005689,...,0.133317,-0.13499,0.129624,-0.116477,-0.163624,0.182504,-0.010272,-0.165861,-0.122672,-0.326458
f0,-0.212151,-0.130417,-0.273984,1.0,-0.107031,0.610464,-0.086231,-0.023618,0.027401,0.300276,...,-0.022537,0.201054,-0.196034,0.220188,0.818585,-0.083203,0.289327,0.229283,0.271708,0.517102
f1,0.344983,0.582568,0.267514,-0.107031,1.0,-0.077446,0.635319,0.041033,-0.012316,0.293253,...,0.191181,-0.016204,0.211623,-0.176437,-0.082194,0.80268,0.265054,-0.221998,-0.287999,-0.521822
f2,-0.191297,-0.103451,-0.167276,0.610464,-0.077446,1.0,-0.056251,-0.183748,0.033938,0.080672,...,-0.082877,0.105875,-0.115089,0.113678,0.526547,-0.063634,0.084985,0.169216,0.577237,0.657294
f3,0.331014,0.810579,0.172825,-0.086231,0.635319,-0.056251,1.0,0.042366,-0.133842,0.089028,...,0.106885,-0.082335,0.110527,-0.116219,-0.062263,0.54306,0.088501,-0.163092,-0.547606,-0.666871
f4,0.119956,0.072794,0.231705,-0.023618,0.041033,-0.183748,0.042366,1.0,-0.003077,0.073686,...,0.154011,-0.057333,0.018252,-0.06547,0.142554,0.062298,0.233152,-0.07915,-0.073668,-0.183175
f5,-0.240325,-0.271924,-0.270942,0.027401,-0.012316,0.033938,-0.133842,-0.003077,1.0,0.041331,...,-0.062304,0.138098,-0.071045,0.00665,0.044296,0.112701,0.202093,0.077997,0.059569,0.096684
f6,0.046224,0.066017,-0.005689,0.300276,0.293253,0.080672,0.089028,0.073686,0.041331,1.0,...,0.206221,0.223754,0.090301,0.114012,0.221682,0.190635,0.44206,0.005864,-0.023346,0.002799


In [27]:
# Model input columns: the 38 base features f0..f37 followed by the two derived
# scores, so "tfidf" and "rank" sit at positions 38 and 39.
columns = ["f%d" % idx for idx in range(38)] + ["tfidf", "rank"]

def normalize_array(x):
    """Min-max scale a sequence of numbers into the range [0, 1].

    Parameters
    ----------
    x : array-like of numbers

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``x`` where the minimum maps to 0 and
        the maximum maps to 1. A constant input maps to all zeros and an empty
        input is returned as an empty float array.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return x
    min_x = np.min(x)
    span = np.max(x) - min_x
    if span == 0:
        # Constant column: dividing by a zero range would yield NaN and break
        # any estimator fitted on the result.
        return np.zeros_like(x)
    return (x - min_x) / span


def normalized_df(df, selected_columns=None):
    """Min-max normalise the chosen columns of ``df`` independently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the selected columns.
    selected_columns : sequence of str, optional
        Columns to normalise, in output order. Defaults to the notebook-level
        ``columns`` list, which is what the original implementation always used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(selected_columns))``; column ``j`` is
        ``normalize_array`` applied to ``df[selected_columns[j]]``.
    """
    if selected_columns is None:
        selected_columns = columns
    # One array per column; the old element-by-element `df[[col]].values[i][0]`
    # rebuilt a one-column frame for every single row.
    scaled = [normalize_array(df[col].to_numpy()) for col in selected_columns]
    return np.array(scaled).T

In [28]:
# Feature matrix: the 40 model columns (f0..f37, tfidf, rank), each min-max
# scaled; `columns` holds exactly the names that were spelled out here before.
X = normalized_df(all_data[columns])

In [29]:
# Target labels as a flat 1-D array taken from the "relation" column.
y = all_data["relation"].to_numpy()

# Data Modeling along with our features

In [30]:
def print_results(result):
    """Print the four scores in ``result``, one per line, to one decimal place.

    ``result`` is indexed 0..3 and read as precision, recall, F1 score and
    area under the curve, in that order.
    """
    line_formats = (
        "Precision: %.1f",
        "Recall: %.1f",
        "F1 Score: %.1f",
        "Area Under Curve: %.1f",
    )
    for position, line_format in enumerate(line_formats):
        print(line_format % result[position])

def evaluation_results(y_test, y_predict):
    """Score predictions against the true labels, as percentages.

    Returns a list ``[precision, recall, f1, auc]`` where each entry is the
    corresponding scikit-learn metric multiplied by 100.

    NOTE(review): the AUC is computed from ``y_predict`` exactly as passed in.
    ``model_train`` passes the output of ``model.predict`` — confirm that a
    label-based AUC (rather than one from scores/probabilities) is intended.
    """
    scorers = {
        "recall": recall_score,
        "precision": precision_score,
        "f1": f1_score,
        "auc": roc_auc_score,
    }
    percent = {
        metric_name: scorer(y_test, y_predict) * 100
        for metric_name, scorer in scorers.items()
    }
    return [percent["precision"], percent["recall"], percent["f1"], percent["auc"]]


def model_train(x_train, x_test, y_train, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    The model is fitted in place, its predictions for ``x_test`` are compared
    with ``y_test`` and the list produced by ``evaluation_results`` is returned.
    """
    model.fit(x_train, y_train)
    predicted_labels = model.predict(x_test)
    scores = evaluation_results(y_test, predicted_labels)
    return scores


def k_fold_training(X, Y, model, n_splits=5):
    """Cross-validate ``model`` and return the mean score of each metric.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix; converted with ``np.asarray`` so lists and DataFrames
        can be indexed by the fold index arrays.
    Y : array-like, shape (n_samples,)
        Labels aligned with the rows of ``X``.
    model : estimator with ``fit`` and ``predict``
        The same instance is refitted on every fold.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    numpy.ndarray
        Per-metric mean over the folds, in the order returned by
        ``model_train``.

    Notes
    -----
    ``KFold`` is created without shuffling arguments, so the library defaults
    apply. NOTE(review): if the rows are ordered by label or topic, consider a
    shuffled or stratified splitter.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    kf = KFold(n_splits=n_splits)
    # The previous `kf.get_n_splits(X)` call discarded its result and is dropped.
    fold_scores = [
        model_train(X[train_index], X[test_index], Y[train_index], Y[test_index], model)
        for train_index, test_index in kf.split(X)
    ]
    return np.mean(np.array(fold_scores), axis=0)

## Random Forest

In [31]:
# Shared Random Forest settings for both runs below.
# "sqrt" is what 'auto' meant for classifiers; 'auto' is deprecated/removed in
# newer scikit-learn releases. A fixed random_state makes the scores repeatable.
rand_params = {
    "n_estimators": 200,
    "max_features": "sqrt",
    "random_state": 0,
}

# Run 1: all 40 normalised features.
model = RandomForestClassifier(**rand_params)
result = k_fold_training(X, y, model)
print_results(result)

print("\n")

# Run 2: only columns 38 and 39 of X, i.e. the "tfidf" and "rank" features
# (the last two entries of `columns`).
model = RandomForestClassifier(**rand_params)
result = k_fold_training(X[:, 38:40], y, model)
print_results(result)

Precision: 93.8
Recall: 88.0
F1 Score: 90.8
Area Under Curve: 92.7


Precision: 70.1
Recall: 59.5
F1 Score: 64.1
Area Under Curve: 74.0


## Naive Bayes

In [32]:
# Gaussian Naive Bayes, first on every feature and then on columns 38-39 of X
# only; a blank separator is printed between the two reports.
for run_index, feature_matrix in enumerate((X, X[:,38:40])):
    if run_index > 0:
        print("\n")
    model = gaussian_nb = GaussianNB()
    result = k_fold_training(feature_matrix, y, model)
    print_results(result)

Precision: 84.8
Recall: 44.5
F1 Score: 58.3
Area Under Curve: 70.5


Precision: 80.7
Recall: 29.2
F1 Score: 42.8
Area Under Curve: 63.0


## Logistic Regression

In [36]:
# Logistic regression (lbfgs solver), first on every feature and then on
# columns 38-39 of X only; a blank separator is printed between the reports.
for run_index, feature_matrix in enumerate((X, X[:,38:40])):
    if run_index > 0:
        print("\n")
    model = LogisticRegression(solver='lbfgs')
    result = k_fold_training(feature_matrix, y, model)
    print_results(result)

Precision: 84.8
Recall: 64.7
F1 Score: 73.3
Area Under Curve: 79.7


Precision: 84.4
Recall: 25.5
F1 Score: 39.1
Area Under Curve: 61.6


## SVM

In [31]:
# Linear SVM on all features, cross-validated like the models above.
svc_options = {"random_state": 0, "tol": 1e-5}
model = LinearSVC(**svc_options)
result = k_fold_training(X, y, model)
print_results(result)

Precision: 83.6
Recall: 69.0
F1 Score: 75.5
Area Under Curve: 81.4


In [31]:
# k_fold_training requires a model argument; the earlier two-argument call
# raises TypeError on a fresh kernel.
# NOTE(review): the saved output below resembles the Random Forest run, so a
# 200-tree Random Forest is assumed here — confirm which model was intended.
model = RandomForestClassifier(n_estimators=200, max_features="sqrt")
result = k_fold_training(X, y, model)
print_results(result)

Precision: 93.5
Recall: 88.4
F1 Score: 90.8
Area Under Curve: 92.8


In [16]:
# k_fold_training requires a model argument; the earlier two-argument call
# raises TypeError on a fresh kernel.
# NOTE(review): the saved output below resembles the Random Forest run, so a
# 200-tree Random Forest is assumed here — confirm which model was intended.
model = RandomForestClassifier(n_estimators=200, max_features="sqrt")
result = k_fold_training(X, y, model)
print_results(result)

Precision: 94.4
Recall: 88.2
F1 Score: 91.2
Area Under Curve: 92.9
