In [18]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.preprocessing import scale

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

In [3]:
# Column layout of the parsed feature table: the 38 numeric feature columns
# "f0" .. "f37" followed by the "relation" label column.
columns = ["f%d" % idx for idx in range(38)] + ["relation"]


def extract_data(line, n_features=38):
    """Parse one line of the features file into a flat record.

    The line is split on whitespace. The first token is read as the integer
    ``relation`` label; each of the next ``n_features`` tokens must look like
    ``<index>:<value>`` and its value part is stored as a float under
    ``"f0"``, ``"f1"``, ... in token order. The index part of each token is
    ignored, so features are matched by position, not by the written index.

    Args:
        line: One text line of the features file.
        n_features: Number of feature tokens to read after the label.
            Defaults to 38, the width used throughout this notebook.

    Returns:
        dict whose first key is ``"relation"`` (int) followed by
        ``"f0"`` .. ``"f<n_features - 1>"`` (floats).

    Raises:
        IndexError: if the line has fewer than ``n_features + 1`` tokens, or
            a feature token contains no ``:`` separator.
        ValueError: if the label or a feature value is not numeric.
    """
    tokens = line.split()
    data = {"relation": int(tokens[0])}
    for position in range(n_features):
        # tokens[0] is the label, so feature k lives at tokens[k + 1].
        data["f%d" % position] = float(tokens[position + 1].split(":")[1])
    return data

def normalize_array(x):
    """Min-max scale an array-like of numbers into the range [0, 1].

    Args:
        x: Array-like of numbers.

    Returns:
        np.ndarray of floats computed as ``(x - min) / (max - min)``.
        When every value is equal the range is zero, so an array of zeros is
        returned instead of dividing by zero (which would produce NaN).
        An empty input yields an empty array.
    """
    x = np.array(x, dtype=float)
    if x.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return x
    min_x = np.min(x)
    span = np.max(x) - min_x
    if span == 0:
        # Constant input: avoid 0/0 -> NaN, which downstream estimators reject.
        return np.zeros_like(x)
    return (x - min_x) / span


def normalized_df(df):
    """Min-max normalize every column named in the module-level ``columns``.

    Args:
        df: DataFrame containing every name listed in ``columns``.

    Returns:
        2-D np.ndarray with one ROW per entry of ``columns`` (shape
        ``(len(columns), len(df))``); each row holds that column's values
        after ``normalize_array``.
    """
    # Extract each column in a single step rather than indexing the frame
    # once per row, which is quadratic in the number of rows.
    X = [normalize_array(df[col].tolist()) for col in columns]
    return np.array(X)


def file_read(file_name):
    """Read a features file into a DataFrame, one row per non-blank line.

    Each line is parsed with ``extract_data``. The records are collected in a
    list and turned into a DataFrame in one step: growing a frame row by row
    is quadratic, and ``DataFrame.append`` no longer exists in pandas >= 2.0.

    Args:
        file_name: Path of the features file.

    Returns:
        pd.DataFrame whose columns follow the module-level ``columns`` list
        (any extra keys produced by ``extract_data`` are kept after them).
        All columns are cast to float, so the label column has the same dtype
        as the feature columns.
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(file_name, "r") as file:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # would otherwise fail inside extract_data.
        records = [extract_data(line) for line in file if line.strip()]

    df = pd.DataFrame(records)
    # Declared columns first, in their declared order; then anything else.
    extras = [name for name in df.columns if name not in columns]
    df = df.reindex(columns=list(columns) + extras)
    return df.astype(float)

In [4]:
# Relative path of the features file that is parsed in the next cell.
features_file = "../../dataset/physics.features"

In [5]:
# Parse the features file into a DataFrame (one row per line of the file).
features = file_read(features_file)

In [6]:
# Display the parsed table; the rendered output elides the middle rows/columns.
features

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f29,f30,f31,f32,f33,f34,f35,f36,f37,relation
0,321.0,190.0,1613.0,197.0,2.0,4.0,19.0,0.0,0.0,0.0,...,1.0,0.000000,0.000217,908.0,504.0,87.0,0.000040,0.000086,1.118241e-05,0.0
1,36.0,77.0,109.0,101.0,3.0,2.0,4.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,230.0,161.0,21.0,0.000002,0.000002,9.914547e-08,1.0
2,808.0,495.0,3104.0,2394.0,2.0,3.0,25.0,0.0,0.0,0.0,...,6.0,0.000138,0.000717,961.0,1060.0,103.0,-0.000003,-0.000017,1.509753e-05,0.0
3,99.0,48.0,96.0,212.0,2.0,4.0,5.0,0.0,0.0,0.0,...,0.0,0.000832,0.000000,140.0,128.0,19.0,0.000007,-0.000004,-1.144610e-05,0.0
4,643.0,162.0,9935.0,129.0,1.0,5.0,21.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,893.0,109.0,12.0,0.000095,0.000940,6.710326e-05,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,350.0,203.0,4432.0,92.0,3.0,4.0,22.0,0.0,0.0,0.0,...,16.0,0.000000,0.001753,727.0,1078.0,139.0,0.000010,0.000172,2.182559e-05,0.0
1958,321.0,59.0,1613.0,420.0,2.0,3.0,7.0,1.0,0.0,0.0,...,0.0,0.000895,0.000000,908.0,104.0,30.0,0.000113,0.000062,7.647945e-06,0.0
1959,127.0,48.0,1368.0,269.0,4.0,1.0,11.0,0.0,0.0,4.0,...,19.0,0.000000,0.008109,202.0,269.0,28.0,0.000062,0.000055,8.195897e-06,0.0
1960,643.0,147.0,9935.0,851.0,1.0,4.0,24.0,0.0,0.0,0.0,...,2.0,0.000000,0.001042,893.0,224.0,37.0,0.000150,0.000904,5.851008e-05,0.0


In [7]:
# Load the labeled topic pairs from CSV.
labeled_pairs = pd.read_csv("../../output_files/physics_labeled_pairs.csv")

In [8]:
# Keep only the two topic-name columns, then display the result.
labeled_pairs = labeled_pairs[["topic_a", "topic_b"]]
labeled_pairs

Unnamed: 0,topic_a,topic_b
0,Magnetic field,Inertial frame of reference
1,Gravitational acceleration,Position (vector)
2,Energy,Planet
3,Mechanical energy,Gravitational field
4,Physics,Electric potential energy
...,...,...
1957,Mass,Le Sage's theory of gravitation
1958,Magnetic field,Length
1959,Voltage,Series and parallel circuits
1960,Physics,Velocity


In [9]:
# Place the feature columns and the topic-pair columns side by side (axis=1).
# NOTE(review): concat matches rows by index label, so this presumes row i of
# the features file describes pair i of the labeled-pairs CSV — confirm.
all_data = pd.concat([features, labeled_pairs], axis = 1)

In [10]:
# Display the combined table (features, relation label, and both topic names).
all_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f31,f32,f33,f34,f35,f36,f37,relation,topic_a,topic_b
0,321.0,190.0,1613.0,197.0,2.0,4.0,19.0,0.0,0.0,0.0,...,0.000217,908.0,504.0,87.0,0.000040,0.000086,1.118241e-05,0.0,Magnetic field,Inertial frame of reference
1,36.0,77.0,109.0,101.0,3.0,2.0,4.0,0.0,0.0,0.0,...,0.000000,230.0,161.0,21.0,0.000002,0.000002,9.914547e-08,1.0,Gravitational acceleration,Position (vector)
2,808.0,495.0,3104.0,2394.0,2.0,3.0,25.0,0.0,0.0,0.0,...,0.000717,961.0,1060.0,103.0,-0.000003,-0.000017,1.509753e-05,0.0,Energy,Planet
3,99.0,48.0,96.0,212.0,2.0,4.0,5.0,0.0,0.0,0.0,...,0.000000,140.0,128.0,19.0,0.000007,-0.000004,-1.144610e-05,0.0,Mechanical energy,Gravitational field
4,643.0,162.0,9935.0,129.0,1.0,5.0,21.0,0.0,0.0,0.0,...,0.000000,893.0,109.0,12.0,0.000095,0.000940,6.710326e-05,0.0,Physics,Electric potential energy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,350.0,203.0,4432.0,92.0,3.0,4.0,22.0,0.0,0.0,0.0,...,0.001753,727.0,1078.0,139.0,0.000010,0.000172,2.182559e-05,0.0,Mass,Le Sage's theory of gravitation
1958,321.0,59.0,1613.0,420.0,2.0,3.0,7.0,1.0,0.0,0.0,...,0.000000,908.0,104.0,30.0,0.000113,0.000062,7.647945e-06,0.0,Magnetic field,Length
1959,127.0,48.0,1368.0,269.0,4.0,1.0,11.0,0.0,0.0,4.0,...,0.008109,202.0,269.0,28.0,0.000062,0.000055,8.195897e-06,0.0,Voltage,Series and parallel circuits
1960,643.0,147.0,9935.0,851.0,1.0,4.0,24.0,0.0,0.0,0.0,...,0.001042,893.0,224.0,37.0,0.000150,0.000904,5.851008e-05,0.0,Physics,Velocity


In [86]:
# Persist the combined table as CSV. With to_csv's defaults the DataFrame
# index is written too, as the first (unnamed) column.
all_data.to_csv("../../output_files/physics_pairs_data.csv")

In [13]:
# Split the combined table into model inputs and target:
#   X - the 38 feature columns "f0" .. "f37" (still a DataFrame here)
#   y - the "relation" column flattened to a 1-D NumPy array
X = all_data[["f%d" % idx for idx in range(38)]]
y = all_data[["relation"]].to_numpy().reshape(-1)

In [14]:
# Rebind the module-level `columns` to the 38 feature names only
# ("f0" .. "f37", without "relation"); normalized_df below iterates over it.
columns = ["f%d" % idx for idx in range(38)]

def normalize_array(x):
    """Min-max scale an array-like of numbers into the range [0, 1].

    Args:
        x: Array-like of numbers.

    Returns:
        np.ndarray of floats computed as ``(x - min) / (max - min)``.
        When every value is equal the range is zero, so an array of zeros is
        returned instead of dividing by zero (which would produce NaN).
        An empty input yields an empty array.
    """
    x = np.array(x, dtype=float)
    if x.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return x
    min_x = np.min(x)
    span = np.max(x) - min_x
    if span == 0:
        # Constant input: avoid 0/0 -> NaN, which downstream estimators reject.
        return np.zeros_like(x)
    return (x - min_x) / span


def normalized_df(df):
    """Min-max normalize every column named in the module-level ``columns``.

    Args:
        df: DataFrame containing every name listed in ``columns``.

    Returns:
        2-D np.ndarray of shape ``(len(df), len(columns))``: one row per row
        of ``df`` and one column per entry of ``columns``, each column scaled
        independently by ``normalize_array``.
    """
    # Extract each column in a single step rather than indexing the frame
    # once per row, which is quadratic in the number of rows.
    X = [normalize_array(df[col].tolist()) for col in columns]
    # The list holds one array per feature; transpose to samples-by-features.
    return np.array(X).T

In [15]:
# Replace the feature DataFrame with its min-max normalized NumPy array
# (rows = samples, columns = features, as returned by normalized_df).
# NOTE(review): this rebinds X, so running the cell a second time passes the
# ndarray back into normalized_df, which indexes its argument by column name.
# Re-run the cell that builds X from all_data before re-running this one.
X = normalized_df(X)

## Training Model

In [54]:
def print_results(result):
    """Print the four evaluation metrics, one per line, with one decimal.

    Args:
        result: Indexable holding precision, recall, F1 score and area under
            the curve at positions 0 to 3, in that order.
    """
    templates = (
        "Precision: %.1f",
        "Recall: %.1f",
        "F1 Score: %.1f",
        "Area Under Curve: %.1f",
    )
    for position, template in enumerate(templates):
        print(template % result[position])

def evaluation_results(y_test, y_predict):
    """Score predictions against the true labels, as percentages.

    Args:
        y_test: True labels.
        y_predict: Predicted values passed to each scikit-learn scorer.

    Returns:
        list ``[precision, recall, f1, auc]``, each scorer's value times 100.

    NOTE(review): ``random_forest`` in this notebook passes the output of
    ``predict()`` here, so ``roc_auc_score`` is computed from predicted
    labels rather than from scores/probabilities — confirm this is intended.
    """
    # Insertion order fixes the order in which the scorers are invoked.
    scorers = {
        "recall": recall_score,
        "precision": precision_score,
        "f1": f1_score,
        "auc": roc_auc_score,
    }
    percent = {name: scorer(y_test, y_predict) * 100 for name, scorer in scorers.items()}
    return [percent["precision"], percent["recall"], percent["f1"], percent["auc"]]

# Keyword arguments forwarded to RandomForestClassifier by random_forest().
rand_params = {
    "n_estimators" : 200,
    # "sqrt" is what 'auto' meant for classifiers; the 'auto' alias was
    # deprecated in scikit-learn 1.1 and removed in 1.3.
    "max_features" : "sqrt",
    # Fixed seed so repeated runs of the notebook produce the same forest.
    "random_state" : 42,
#     "max_depth" : 50,
#     "min_samples_split" : 2,
#     "min_samples_leaf" : 1
}


def random_forest(x_train, x_test, y_train, y_test):
    """Train a random forest on one split and score it on the held-out part.

    The classifier is configured from the module-level ``rand_params``.

    Args:
        x_train: Training features.
        x_test: Held-out features.
        y_train: Training labels.
        y_test: Held-out labels.

    Returns:
        The list produced by ``evaluation_results`` for the held-out labels
        and the forest's ``predict`` output on ``x_test``.
    """
    # fit() returns the estimator itself, so construction and training chain.
    model = RandomForestClassifier(**rand_params).fit(x_train, y_train)
    return evaluation_results(y_test, model.predict(x_test))

def k_fold_training(X, Y, n_splits=5):
    """Cross-validate the random forest and average the per-fold metrics.

    The samples are divided into ``n_splits`` folds by ``KFold`` (constructed
    here without shuffling). For each fold, ``random_forest`` trains on the
    other folds and scores on the held-out one.

    Args:
        X: Feature matrix, one row per sample. Converted with ``np.asarray``
            so that the fold indices select rows even if a DataFrame is
            passed.
        Y: Labels aligned with the rows of ``X``.
        n_splits: Number of folds. Defaults to 5.

    Returns:
        1-D np.ndarray: the element-wise mean over folds of the metric lists
        returned by ``random_forest``.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    kf = KFold(n_splits=n_splits)
    results = [
        random_forest(X[train_index], X[test_index], Y[train_index], Y[test_index])
        for train_index, test_index in kf.split(X)
    ]
    # Rows are folds, columns are metrics; average down the fold axis.
    return np.mean(np.array(results), axis=0)

In [55]:
# Cross-validate on the normalized features and print the fold-averaged
# precision, recall, F1 and AUC (as percentages).
result = k_fold_training(X, y)
print_results(result)

Precision: 82.8
Recall: 61.7
F1 Score: 70.7
Area Under Curve: 78.8
