In [1]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.preprocessing import scale

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

In [2]:
# Column layout of the parsed frame: the 38 feature columns f0..f37 followed
# by the "relation" label column.
columns = ["f%d" % feature_idx for feature_idx in range(38)] + ["relation"]


def extract_data(line):
    """Turn one whitespace-separated line into a flat record dict.

    The first token is parsed as the integer ``"relation"`` value. Each of
    the next 38 tokens is split on ``":"`` and its second piece is parsed as
    a float, stored under ``"f0"`` .. ``"f37"`` in token order.
    (The tokens look like ``index:value`` pairs -- TODO confirm the format.)

    Raises ``IndexError`` when the line has fewer than 39 tokens or a feature
    token has no ``":"``; raises ``ValueError`` when a number cannot be parsed.
    Tokens beyond the 39th are ignored.
    """
    tokens = line.split()
    record = {"relation": int(tokens[0])}
    for feature_idx in range(38):
        # Token 0 is the label, so feature k lives at token k + 1.
        raw_value = tokens[feature_idx + 1].split(":")[1]
        record["f%d" % feature_idx] = float(raw_value)
    return record

def normalize_array(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    x : array-like of numbers
        Values to rescale; converted with ``np.array``.

    Returns
    -------
    numpy.ndarray
        ``(x - min) / (max - min)`` computed element-wise. An empty input
        yields an empty float array, and a constant input (``max == min``)
        yields all zeros rather than the NaNs a 0/0 division would produce.
    """
    x = np.array(x)
    if x.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return x.astype(float)
    min_x = np.min(x)
    span = np.max(x) - min_x
    if span == 0:
        # Constant column: avoid 0/0, which would fill the result with NaN.
        return np.zeros(x.shape, dtype=float)
    return (x - min_x) / span


def normalized_df(df):
    """Normalize every column named in the module-level ``columns`` list.

    Each listed column of ``df`` is passed through ``normalize_array``
    independently.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(columns), len(df))``: one row per column name,
        in the order of ``columns``.
    """
    # Series.tolist() extracts a column in one pass; indexing
    # df[[col]].values[i][0] per row rebuilt a one-column frame every time.
    X = [normalize_array(df[col].tolist()) for col in columns]
    X = np.array(X)
    return X


def file_read(file_name):
    """Parse a feature file into a DataFrame, one row per non-blank line.

    Parameters
    ----------
    file_name : str
        Path of the text file; every non-blank line is handed to
        ``extract_data``.

    Returns
    -------
    pandas.DataFrame
        Frame whose columns follow the module-level ``columns`` list. An
        empty file gives an empty frame with those columns.

    Raises
    ------
    OSError
        If the file cannot be opened.
    IndexError, ValueError
        Propagated from ``extract_data`` for malformed lines.
    """
    # The context manager closes the handle even if a line fails to parse.
    with open(file_name, "r") as file:
        # Blank lines carry no record and would make extract_data fail.
        records = [extract_data(line) for line in file if line.strip()]
    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every appended row.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Path of the feature file handed to file_read, relative to the working directory.
features_file = "geometry.features"

In [7]:
# Parse the feature file into a DataFrame (one row per parsed line).
features = file_read(features_file)

In [8]:
# Show the parsed feature frame as the cell output.
features

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f29,f30,f31,f32,f33,f34,f35,f36,f37,relation
0,53.0,86.0,247.0,176.0,2.0,3.0,4.0,0.0,2.0,3.0,...,0.0,0.000000,0.000000,120.0,161.0,7.0,-5.425241e-05,1.966443e-05,4.727588e-07,0.0
1,184.0,61.0,642.0,147.0,1.0,3.0,11.0,0.0,0.0,1.0,...,20.0,0.000000,0.011461,341.0,186.0,25.0,3.362640e-04,2.500931e-05,8.087279e-06,0.0
2,184.0,262.0,642.0,331.0,1.0,9.0,35.0,1.0,0.0,0.0,...,35.0,0.000000,0.004860,341.0,713.0,75.0,2.450131e-04,1.771560e-05,4.914820e-06,0.0
3,42.0,40.0,333.0,10.0,3.0,2.0,6.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,67.0,141.0,7.0,7.609828e-06,5.902672e-06,1.827928e-06,0.0
4,141.0,197.0,621.0,286.0,5.0,3.0,18.0,1.0,1.0,2.0,...,0.0,0.004079,0.000000,555.0,374.0,79.0,1.794967e-05,2.058517e-05,6.730535e-06,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1676,238.0,42.0,766.0,333.0,2.0,3.0,13.0,0.0,0.0,1.0,...,5.0,0.000587,0.011086,254.0,67.0,12.0,2.200583e-05,5.148986e-05,3.152873e-06,1.0
1677,61.0,120.0,347.0,596.0,2.0,2.0,5.0,0.0,0.0,1.0,...,2.0,0.000000,0.001953,232.0,149.0,11.0,-4.459765e-07,3.928922e-06,1.387987e-06,0.0
1678,76.0,184.0,126.0,642.0,1.0,1.0,21.0,0.0,2.0,1.0,...,0.0,0.018243,0.000000,227.0,341.0,34.0,-3.961945e-04,-2.415513e-05,-8.008553e-06,1.0
1679,85.0,363.0,149.0,3440.0,2.0,1.0,16.0,0.0,1.0,1.0,...,1.0,0.002621,0.000274,251.0,665.0,49.0,-5.148803e-05,-2.249849e-04,-2.364102e-05,1.0


In [10]:
# Load the labeled topic-pair table from CSV.
labeled_pairs = pd.read_csv("../output_files/geometry_labeled_pairs.csv")

In [11]:
# Keep only the two topic-name columns, then display the result.
labeled_pairs = labeled_pairs[["topic_a", "topic_b"]]
labeled_pairs

Unnamed: 0,topic_a,topic_b
0,Parallelogram,Rotational symmetry
1,Angle,Isosceles triangle
2,Angle,Pythagorean theorem
3,Edge (geometry),Acute and obtuse triangles
4,Cartesian coordinate system,Line (geometry)
...,...,...
1676,Polygon,Edge (geometry)
1677,Parallel (geometry),Prism (geometry)
1678,Bisection,Angle
1679,Similarity (geometry),Geometry


In [12]:
# pd.concat(axis=1) aligns rows on the index, so a row-count mismatch between
# the two tables would silently introduce NaN rows instead of failing.
assert len(features) == len(labeled_pairs), "features and labeled_pairs differ in length"
# Place the topic names next to the feature/label columns of the same row.
all_data = pd.concat([features, labeled_pairs], axis = 1)

In [13]:
# Show the combined frame as the cell output.
all_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f31,f32,f33,f34,f35,f36,f37,relation,topic_a,topic_b
0,53.0,86.0,247.0,176.0,2.0,3.0,4.0,0.0,2.0,3.0,...,0.000000,120.0,161.0,7.0,-5.425241e-05,1.966443e-05,4.727588e-07,0.0,Parallelogram,Rotational symmetry
1,184.0,61.0,642.0,147.0,1.0,3.0,11.0,0.0,0.0,1.0,...,0.011461,341.0,186.0,25.0,3.362640e-04,2.500931e-05,8.087279e-06,0.0,Angle,Isosceles triangle
2,184.0,262.0,642.0,331.0,1.0,9.0,35.0,1.0,0.0,0.0,...,0.004860,341.0,713.0,75.0,2.450131e-04,1.771560e-05,4.914820e-06,0.0,Angle,Pythagorean theorem
3,42.0,40.0,333.0,10.0,3.0,2.0,6.0,0.0,0.0,0.0,...,0.000000,67.0,141.0,7.0,7.609828e-06,5.902672e-06,1.827928e-06,0.0,Edge (geometry),Acute and obtuse triangles
4,141.0,197.0,621.0,286.0,5.0,3.0,18.0,1.0,1.0,2.0,...,0.000000,555.0,374.0,79.0,1.794967e-05,2.058517e-05,6.730535e-06,1.0,Cartesian coordinate system,Line (geometry)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1676,238.0,42.0,766.0,333.0,2.0,3.0,13.0,0.0,0.0,1.0,...,0.011086,254.0,67.0,12.0,2.200583e-05,5.148986e-05,3.152873e-06,1.0,Polygon,Edge (geometry)
1677,61.0,120.0,347.0,596.0,2.0,2.0,5.0,0.0,0.0,1.0,...,0.001953,232.0,149.0,11.0,-4.459765e-07,3.928922e-06,1.387987e-06,0.0,Parallel (geometry),Prism (geometry)
1678,76.0,184.0,126.0,642.0,1.0,1.0,21.0,0.0,2.0,1.0,...,0.000000,227.0,341.0,34.0,-3.961945e-04,-2.415513e-05,-8.008553e-06,1.0,Bisection,Angle
1679,85.0,363.0,149.0,3440.0,2.0,1.0,16.0,0.0,1.0,1.0,...,0.000274,251.0,665.0,49.0,-5.148803e-05,-2.249849e-04,-2.364102e-05,1.0,Similarity (geometry),Geometry


In [15]:
# Both inputs of this notebook are the geometry files ("geometry.features",
# "geometry_labeled_pairs.csv"), so write to a geometry-named output rather
# than overwriting the physics file.
all_data.to_csv("../output_files/geometry_pairs_data.csv")

In [16]:
# Feature matrix: the 38 columns f0..f37 of all_data, in numeric order.
X = all_data[["f%d" % feature_idx for feature_idx in range(38)]]
# Target vector: the "relation" column as a flat NumPy array.
y = all_data["relation"].to_numpy().ravel()

In [17]:
# From here on `columns` lists only the 38 feature columns f0..f37
# (no "relation" entry).
columns = ["f%d" % feature_idx for feature_idx in range(38)]

def normalize_array(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    x : array-like of numbers
        Values to rescale; converted with ``np.array``.

    Returns
    -------
    numpy.ndarray
        ``(x - min) / (max - min)`` computed element-wise. An empty input
        yields an empty float array, and a constant input (``max == min``)
        yields all zeros rather than the NaNs a 0/0 division would produce.
    """
    x = np.array(x)
    if x.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return x.astype(float)
    min_x = np.min(x)
    span = np.max(x) - min_x
    if span == 0:
        # Constant column: avoid 0/0, which would fill the result with NaN.
        return np.zeros(x.shape, dtype=float)
    return (x - min_x) / span


def normalized_df(df):
    """Normalize every column named in the module-level ``columns`` list.

    Each listed column of ``df`` is passed through ``normalize_array``
    independently, then the per-column results are stacked and transposed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(columns))``: one row per row of ``df``
        and one column per name in ``columns``, in that order.
    """
    # Series.tolist() extracts a column in one pass; indexing
    # df[[col]].values[i][0] per row rebuilt a one-column frame every time.
    X = [normalize_array(df[col].tolist()) for col in columns]
    X = np.array(X)
    return X.T

In [18]:
# Replace the feature DataFrame with the array returned by normalized_df.
# NOTE(review): this rebinds X, so running the cell a second time would pass
# the already-converted array back into normalized_df -- rebuild X from
# all_data before re-running.
X = normalized_df(X)

## Training Model

In [26]:
def print_results(result):
    """Print the four scores in ``result`` with one decimal place each.

    ``result`` is indexed at positions 0..3, printed in that order as
    precision, recall, F1 score and area under curve.
    """
    templates = (
        "Precision: %.1f",
        "Recall: %.1f",
        "F1 Score: %.1f",
        "Area Under Curve: %.1f",
    )
    for position, template in enumerate(templates):
        print(template % result[position])

def evaluation_results(y_test, y_predict):
    """Score predictions against the true labels, as percentages.

    Returns ``[precision, recall, f1, auc]``, each metric multiplied by 100.
    The AUC is computed by ``roc_auc_score`` from the same ``y_predict``
    values that are passed to the other three metrics.
    """
    def as_percent(metric):
        # Every metric is called as metric(y_true, y_pred) and scaled to 0-100.
        return metric(y_test, y_predict)*100

    recall_pct = as_percent(recall_score)
    precision_pct = as_percent(precision_score)
    f1_pct = as_percent(f1_score)
    auc_pct = as_percent(roc_auc_score)
    return [precision_pct, recall_pct, f1_pct, auc_pct]


def model_train(x_train, x_test, y_train, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    ``model`` must provide ``fit(X, y)`` and ``predict(X)``. The predictions
    for ``x_test`` are scored by ``evaluation_results`` against ``y_test``,
    and that function's result list is returned unchanged.
    """
    model.fit(x_train, y_train)
    return evaluation_results(y_test, model.predict(x_test))


def k_fold_training(X, Y, model, n_splits = 5):
    """Cross-validate ``model`` and return its metrics averaged over the folds.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    Y : array-like
        Labels, one entry per row of ``X``.
    model : estimator
        Object with ``fit`` / ``predict``; the same instance is refit on
        every fold via ``model_train``.
    n_splits : int, default 5
        Number of folds handed to ``KFold`` (constructed without shuffling).

    Returns
    -------
    numpy.ndarray
        Column-wise mean of the per-fold result lists returned by
        ``model_train``.
    """
    # Positional fancy-indexing below needs arrays; a DataFrame or list
    # passed directly would fail or be indexed by label instead.
    X = np.asarray(X)
    Y = np.asarray(Y)
    kf = KFold(n_splits = n_splits)
    results = [
        model_train(X[train_index], X[test_index], Y[train_index], Y[test_index], model)
        for train_index, test_index in kf.split(X)
    ]
    return np.mean(np.array(results), axis = 0)

## Random Forest

In [31]:
# "sqrt" is what the removed "auto" alias selected for classifiers, so the
# model is unchanged but also runs on current scikit-learn releases.
# random_state pins the forest so repeated runs report the same scores.
rand_params = {
    "n_estimators" : 200,
    "max_features" : 'sqrt',
    "random_state" : 0,
}
model = RandomForestClassifier(**rand_params)
result = k_fold_training(X, y, model)
print_results(result)

Precision: 94.3
Recall: 86.1
F1 Score: 90.0
Area Under Curve: 91.8


## Naive Bayes

In [32]:
# Gaussian Naive Bayes with default settings, cross-validated and printed.
# The estimator is bound to both `model` and `gaussian_nb`.
model = gaussian_nb = GaussianNB()
result = k_fold_training(X, y, model)
print_results(result)

Precision: 84.6
Recall: 44.7
F1 Score: 58.4
Area Under Curve: 70.5


## Logistic Regression

In [33]:
# Logistic regression with the lbfgs solver, cross-validated and printed.
model = LogisticRegression(solver='lbfgs')
result = k_fold_training(X, y, model)
print_results(result)

Precision: 84.2
Recall: 62.0
F1 Score: 71.4
Area Under Curve: 78.3


## Support Vector Machine

In [34]:
# Linear SVM with a fixed random_state and a tolerance of 1e-5,
# cross-validated and printed.
model = LinearSVC(random_state=0, tol=1e-5)
result = k_fold_training(X, y, model)
print_results(result)

Precision: 82.2
Recall: 66.3
F1 Score: 73.4
Area Under Curve: 79.9


Precision: 94.5
Recall: 85.8
F1 Score: 89.9
Area Under Curve: 91.7


In [None]:
Precision: 94.7
Recall: 85.7
F1 Score: 89.9
Area Under Curve: 91.7