In [1]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.preprocessing import scale

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

In [2]:
# Column layout of the parsed feature table: the 38 numeric features
# f0..f37 followed by the "relation" label.
columns = ["f%d" % i for i in range(38)] + ["relation"]


def extract_data(line):
    """Parse one whitespace-separated feature line into a flat record.

    The first token is read as the integer ``relation`` label.  Each of the
    next 38 tokens is split on ``":"`` and its second piece is read as a
    float, stored under ``f0`` .. ``f37`` in token order.  Tokens beyond the
    39th are ignored.

    Raises ``IndexError`` when the line has fewer than 39 tokens or a feature
    token contains no ``":"``, and ``ValueError`` when a value is not numeric.
    """
    tokens = line.split()
    record = {"relation": int(tokens[0])}
    for idx in range(38):
        # Token idx + 1 carries feature idx; keep only the piece after the colon.
        record["f%d" % idx] = float(tokens[idx + 1].split(":")[1])
    return record

def normalize_array(x):
    """Min-max rescale numeric values into the range [0, 1].

    Parameters
    ----------
    x : array-like
        Values to rescale; converted with ``np.array``.  The minimum and
        maximum are taken over all elements.

    Returns
    -------
    numpy.ndarray
        ``(x - min) / (max - min)``.  An empty input yields an empty float
        array, and a constant input (``max == min``) yields all zeros instead
        of the NaNs a 0/0 division would produce.
    """
    x = np.array(x)
    if x.size == 0:
        # np.min/np.max raise on empty input; there is nothing to rescale.
        return x.astype(float)
    min_x = np.min(x)
    span = np.max(x) - min_x
    if span == 0:
        # Every value is identical: dividing by the zero span would give NaN.
        return np.zeros(x.shape, dtype=float)
    return (x - min_x) / span


def normalized_df(df):
    """Min-max normalize every column named in the global ``columns`` list.

    Each column is rescaled independently with ``normalize_array``.

    Returns a 2-D array with one row per entry of ``columns`` and one column
    per row of ``df`` (features run along axis 0; the result is not
    transposed).
    """
    # Take each column in a single slice rather than indexing the frame once
    # per row, which made the extraction quadratic in len(df).
    X = [normalize_array(list(df[col].values)) for col in columns]
    return np.array(X)


def file_read(file_name):
    """Load a feature file into a DataFrame, one row per non-blank line.

    Each line is parsed with ``extract_data``.  Columns are ordered as in the
    global ``columns`` list, followed by any parsed keys not listed there.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        Float-valued table; empty (but carrying the ``columns`` headers) when
        the file contains no data lines.
    """
    # The context manager closes the handle even if a line fails to parse.
    with open(file_name, "r") as file:
        # Whitespace-only lines (e.g. a trailing newline) carry no record.
        records = [extract_data(line) for line in file if line.strip()]

    # Build the frame once from all records: growing it row by row with
    # DataFrame.append is quadratic, and append no longer exists in pandas 2.x.
    df = pd.DataFrame(records)
    ordered = list(columns) + [key for key in df.columns if key not in columns]
    # Cast to float so the integer label is stored as 1.0/0.0 like the features.
    return df.reindex(columns=ordered).astype(float)

In [3]:
# Relative path of the feature file that is parsed with file_read.
features_file = "../../dataset/precalculus.features"

In [4]:
# Parse the feature file into a DataFrame (f0..f37 plus the "relation" label).
features = file_read(features_file)

In [5]:
# Preview the parsed feature table.
features

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f29,f30,f31,f32,f33,f34,f35,f36,f37,relation
0,257.0,349.0,797.0,583.0,1.0,3.0,11.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,352.0,831.0,50.0,-0.000040,0.000014,-7.542200e-07,1.0
1,55.0,197.0,149.0,286.0,2.0,3.0,10.0,1.0,1.0,1.0,...,0.0,0.006558,0.000000,289.0,374.0,61.0,0.000009,-0.000007,-1.910268e-06,1.0
2,141.0,197.0,621.0,286.0,5.0,3.0,18.0,1.0,1.0,2.0,...,0.0,0.004079,0.000000,555.0,374.0,79.0,0.000018,0.000021,6.730535e-06,1.0
3,57.0,198.0,28.0,660.0,3.0,2.0,6.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,278.0,524.0,36.0,-0.000045,-0.000055,-6.749430e-06,1.0
4,370.0,102.0,614.0,295.0,3.0,3.0,44.0,2.0,14.0,1.0,...,62.0,0.001687,0.023195,918.0,269.0,119.0,0.000051,0.000017,2.138242e-06,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055,202.0,185.0,838.0,1088.0,2.0,5.0,17.0,0.0,2.0,0.0,...,0.0,0.000177,0.000000,580.0,319.0,55.0,-0.000038,-0.000011,-2.839315e-07,0.0
2056,442.0,20.0,15414.0,55.0,4.0,3.0,4.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,862.0,97.0,7.0,0.000066,0.001274,1.016967e-04,0.0
2057,146.0,64.0,399.0,86.0,4.0,2.0,8.0,0.0,0.0,0.0,...,1.0,0.000000,0.000853,326.0,141.0,23.0,0.000270,0.000015,3.585556e-06,0.0
2058,101.0,363.0,145.0,3440.0,2.0,1.0,4.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,45.0,665.0,3.0,-0.000051,-0.000227,-2.401727e-05,1.0


In [7]:
# Load the labeled topic pairs from CSV.
labeled_pairs = pd.read_csv("../../output_files/precalculus_labeled_pairs.csv")

In [8]:
# Keep only the two topic-name columns, then preview them.
labeled_pairs = labeled_pairs[["topic_a", "topic_b"]]
labeled_pairs

Unnamed: 0,topic_a,topic_b
0,Differential equation,Number
1,Asymptote,Line (geometry)
2,Cartesian coordinate system,Line (geometry)
3,Partial fraction decomposition,Arithmetic
4,Logarithm,Natural logarithm
...,...,...
2055,Polynomial,Natural number
2056,Mathematics,Polynomial long division
2057,Multiplication,Minor (linear algebra)
2058,Trigonometric substitution,Geometry


In [9]:
# Attach the topic names to the feature rows.  pd.concat(axis=1) aligns on the
# index, so a row-count mismatch would silently pad one side with NaN; fail
# loudly instead.
# NOTE(review): this also assumes both files list the pairs in the same
# order -- confirm against whatever generated them.
assert len(features) == len(labeled_pairs), "features and labeled pairs differ in length"
all_data = pd.concat([features, labeled_pairs], axis=1)

In [10]:
# Preview the combined feature + topic-name table.
all_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f31,f32,f33,f34,f35,f36,f37,relation,topic_a,topic_b
0,257.0,349.0,797.0,583.0,1.0,3.0,11.0,0.0,0.0,0.0,...,0.000000,352.0,831.0,50.0,-0.000040,0.000014,-7.542200e-07,1.0,Differential equation,Number
1,55.0,197.0,149.0,286.0,2.0,3.0,10.0,1.0,1.0,1.0,...,0.000000,289.0,374.0,61.0,0.000009,-0.000007,-1.910268e-06,1.0,Asymptote,Line (geometry)
2,141.0,197.0,621.0,286.0,5.0,3.0,18.0,1.0,1.0,2.0,...,0.000000,555.0,374.0,79.0,0.000018,0.000021,6.730535e-06,1.0,Cartesian coordinate system,Line (geometry)
3,57.0,198.0,28.0,660.0,3.0,2.0,6.0,0.0,0.0,0.0,...,0.000000,278.0,524.0,36.0,-0.000045,-0.000055,-6.749430e-06,1.0,Partial fraction decomposition,Arithmetic
4,370.0,102.0,614.0,295.0,3.0,3.0,44.0,2.0,14.0,1.0,...,0.023195,918.0,269.0,119.0,0.000051,0.000017,2.138242e-06,0.0,Logarithm,Natural logarithm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055,202.0,185.0,838.0,1088.0,2.0,5.0,17.0,0.0,2.0,0.0,...,0.000000,580.0,319.0,55.0,-0.000038,-0.000011,-2.839315e-07,0.0,Polynomial,Natural number
2056,442.0,20.0,15414.0,55.0,4.0,3.0,4.0,0.0,0.0,0.0,...,0.000000,862.0,97.0,7.0,0.000066,0.001274,1.016967e-04,0.0,Mathematics,Polynomial long division
2057,146.0,64.0,399.0,86.0,4.0,2.0,8.0,0.0,0.0,0.0,...,0.000853,326.0,141.0,23.0,0.000270,0.000015,3.585556e-06,0.0,Multiplication,Minor (linear algebra)
2058,101.0,363.0,145.0,3440.0,2.0,1.0,4.0,0.0,0.0,0.0,...,0.000000,45.0,665.0,3.0,-0.000051,-0.000227,-2.401727e-05,1.0,Trigonometric substitution,Geometry


In [11]:
# Persist the combined table.  The inputs loaded above are the precalculus
# feature and labeled-pair files, so write to a precalculus-named file instead
# of "physics_pairs_data.csv" (presumably another subject's output, which this
# cell would otherwise overwrite).
all_data.to_csv("../../output_files/precalculus_pairs_data.csv")

In [12]:
# Split the combined table into the 38 feature columns (f0..f37) and a flat
# 1-D array of "relation" labels.
X = all_data[["f%d" % i for i in range(38)]]
y = all_data[["relation"]].to_numpy().ravel()

In [13]:
# Feature-only column list f0..f37 (no "relation" label), read by
# normalized_df.  Note: this rebinds the global ``columns`` name.
columns = ["f%d" % i for i in range(38)]

def normalize_array(x):
    """Min-max rescale numeric values into the range [0, 1].

    Parameters
    ----------
    x : array-like
        Values to rescale; converted with ``np.array``.  The minimum and
        maximum are taken over all elements.

    Returns
    -------
    numpy.ndarray
        ``(x - min) / (max - min)``.  An empty input yields an empty float
        array, and a constant input (``max == min``) yields all zeros instead
        of the NaNs a 0/0 division would produce.
    """
    x = np.array(x)
    if x.size == 0:
        # np.min/np.max raise on empty input; there is nothing to rescale.
        return x.astype(float)
    min_x = np.min(x)
    span = np.max(x) - min_x
    if span == 0:
        # Every value is identical: dividing by the zero span would give NaN.
        return np.zeros(x.shape, dtype=float)
    return (x - min_x) / span


def normalized_df(df):
    """Min-max normalize every column named in the global ``columns`` list.

    Each column is rescaled independently with ``normalize_array``.

    Returns a 2-D array of shape ``(len(df), len(columns))``: one row per row
    of ``df`` and one column per entry of ``columns``.
    """
    # Take each column in a single slice rather than indexing the frame once
    # per row, which made the extraction quadratic in len(df).
    X = [normalize_array(list(df[col].values)) for col in columns]
    # Rows of X are features here; transpose so samples run along axis 0.
    return np.array(X).T

In [14]:
# Normalize straight from all_data instead of from X itself: normalized_df
# indexes its argument by column name, so feeding it the ndarray left in X by a
# previous run of this cell would fail.  This keeps the cell re-runnable.
X = normalized_df(all_data[columns])

## Training Model

In [15]:
def print_results(result):
    """Print precision, recall, F1 and AUC, one per line, to one decimal place.

    ``result`` is indexed positionally in the order
    ``[precision, recall, f1, auc]``.
    """
    labels = ("Precision", "Recall", "F1 Score", "Area Under Curve")
    for position, label in enumerate(labels):
        print(label + ": %.1f" % result[position])

def evaluation_results(y_test, y_predict):
    """Score predictions against the true labels, as percentages.

    Returns ``[precision, recall, f1, auc]`` with every metric multiplied by
    100.  The AUC is computed by ``roc_auc_score`` directly from ``y_predict``.
    """
    metric_fns = (
        ("recall", recall_score),
        ("precision", precision_score),
        ("f1", f1_score),
        ("auc", roc_auc_score),
    )
    scores = {}
    # Metrics are evaluated in the order listed above.
    for name, metric in metric_fns:
        scores[name] = metric(y_test, y_predict) * 100
    return [scores["precision"], scores["recall"], scores["f1"], scores["auc"]]

# Hyperparameters passed to RandomForestClassifier.
rand_params = {
    "n_estimators": 200,
    # "sqrt" is what "auto" meant for forest classifiers; "auto" was deprecated
    # in scikit-learn 1.1 and removed in 1.3.
    "max_features": "sqrt",
    # Fixed seed so the reported metrics are reproducible across runs.
    "random_state": 42,
    # max_depth, min_samples_split and min_samples_leaf are left at their
    # library defaults.
}


def random_forest(x_train, x_test, y_train, y_test):
    """Fit a random forest on the training split and score it on the test split.

    A new ``RandomForestClassifier`` is built from the global ``rand_params``
    on every call.  Returns whatever ``evaluation_results`` produces for the
    test labels and the model's predictions.
    """
    model = RandomForestClassifier(**rand_params).fit(x_train, y_train)
    return evaluation_results(y_test, model.predict(x_test))

def k_fold_training(X, Y, n_splits=5):
    """Cross-validate the random forest and average the per-fold metrics.

    Parameters
    ----------
    X : array-like
        Feature matrix, one sample per row.
    Y : array-like
        Labels aligned with the rows of ``X``.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean, over the folds, of the metric lists returned by
        ``random_forest``.
    """
    # Coerce to ndarrays so the positional fold indices also work when a
    # DataFrame or Series is passed in.
    X = np.asarray(X)
    Y = np.asarray(Y)
    # KFold is used with its defaults apart from n_splits.
    fold_scores = [
        random_forest(X[train_index], X[test_index], Y[train_index], Y[test_index])
        for train_index, test_index in KFold(n_splits=n_splits).split(X)
    ]
    return np.mean(np.array(fold_scores), axis=0)

In [16]:
# Run the cross-validated random forest and print the fold-averaged metrics.
result = k_fold_training(X, y)
print_results(result)

Precision: 89.9
Recall: 89.2
F1 Score: 89.5
Area Under Curve: 92.0
