In [1]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.preprocessing import scale

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import plot_precision_recall_curve, precision_recall_curve
import matplotlib.pyplot as plt

In [78]:
columns = ["f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9",
            "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19",
            "f20", "f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29",
            "f30", "f31", "f32", "f33", "f34", "f35", "f36", "f37", "relation"]


def extract_data(line):
    features = line.split()
    data = {
        "relation" : int(features[0]),
        "f0" : float(features[1].split(":")[1]),
        "f1" : float(features[2].split(":")[1]),
        "f2" : float(features[3].split(":")[1]),
        "f3" : float(features[4].split(":")[1]),
        "f4" : float(features[5].split(":")[1]),
        "f5" : float(features[6].split(":")[1]),
        "f6" : float(features[7].split(":")[1]),
        "f7" : float(features[8].split(":")[1]),
        "f8" : float(features[9].split(":")[1]),
        "f9" : float(features[10].split(":")[1]),
        "f10" : float(features[11].split(":")[1]),
        "f11" : float(features[12].split(":")[1]),
        "f12" : float(features[13].split(":")[1]),
        "f13" : float(features[14].split(":")[1]),
        "f14" : float(features[15].split(":")[1]),
        "f15" : float(features[16].split(":")[1]),
        "f16" : float(features[17].split(":")[1]),
        "f17" : float(features[18].split(":")[1]),
        "f18" : float(features[19].split(":")[1]),
        "f19" : float(features[20].split(":")[1]),
        "f20" : float(features[21].split(":")[1]),
        "f21" : float(features[22].split(":")[1]),
        "f22" : float(features[23].split(":")[1]),
        "f23" : float(features[24].split(":")[1]),
        "f24" : float(features[25].split(":")[1]),
        "f25" : float(features[26].split(":")[1]),
        "f26" : float(features[27].split(":")[1]),
        "f27" : float(features[28].split(":")[1]),
        "f28" : float(features[29].split(":")[1]),
        "f29" : float(features[30].split(":")[1]),
        "f30" : float(features[31].split(":")[1]),
        "f31" : float(features[32].split(":")[1]),
        "f32" : float(features[33].split(":")[1]),
        "f33" : float(features[34].split(":")[1]),
        "f34" : float(features[35].split(":")[1]),
        "f35" : float(features[36].split(":")[1]),
        "f36" : float(features[37].split(":")[1]),
        "f37" : float(features[38].split(":")[1]),
    }
    return data

def normalize_array(x):
    x = np.array(x)
    min_x = np.min(x)
    max_x = np.max(x)
    x = (x - min_x)/(max_x - min_x)
    return x


def normalized_df(df):
    X = [normalize_array([df[[col]].values[i][0] for i in range(len(df[[col]]))]) for col in columns]
    X = np.array(X)
    return X


def file_read(file_name):
    file = open(file_name, "r")
    df = pd.DataFrame(columns = columns)
    for line in file:
        df = df.append(extract_data(line), ignore_index=True)
#     data = normalized_df(df)
    return df

In [79]:
features_file = "../../dataset/physics.features"

In [80]:
features = file_read(features_file)

In [81]:
features

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f29,f30,f31,f32,f33,f34,f35,f36,f37,relation
0,321.0,190.0,1613.0,197.0,2.0,4.0,19.0,0.0,0.0,0.0,...,1.0,0.000000,0.000217,908.0,504.0,87.0,0.000040,0.000086,1.118241e-05,0.0
1,36.0,77.0,109.0,101.0,3.0,2.0,4.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,230.0,161.0,21.0,0.000002,0.000002,9.914547e-08,1.0
2,808.0,495.0,3104.0,2394.0,2.0,3.0,25.0,0.0,0.0,0.0,...,6.0,0.000138,0.000717,961.0,1060.0,103.0,-0.000003,-0.000017,1.509753e-05,0.0
3,99.0,48.0,96.0,212.0,2.0,4.0,5.0,0.0,0.0,0.0,...,0.0,0.000832,0.000000,140.0,128.0,19.0,0.000007,-0.000004,-1.144610e-05,0.0
4,643.0,162.0,9935.0,129.0,1.0,5.0,21.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,893.0,109.0,12.0,0.000095,0.000940,6.710326e-05,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,350.0,203.0,4432.0,92.0,3.0,4.0,22.0,0.0,0.0,0.0,...,16.0,0.000000,0.001753,727.0,1078.0,139.0,0.000010,0.000172,2.182559e-05,0.0
1958,321.0,59.0,1613.0,420.0,2.0,3.0,7.0,1.0,0.0,0.0,...,0.0,0.000895,0.000000,908.0,104.0,30.0,0.000113,0.000062,7.647945e-06,0.0
1959,127.0,48.0,1368.0,269.0,4.0,1.0,11.0,0.0,0.0,4.0,...,19.0,0.000000,0.008109,202.0,269.0,28.0,0.000062,0.000055,8.195897e-06,0.0
1960,643.0,147.0,9935.0,851.0,1.0,4.0,24.0,0.0,0.0,0.0,...,2.0,0.000000,0.001042,893.0,224.0,37.0,0.000150,0.000904,5.851008e-05,0.0


In [82]:
labeled_pairs = pd.read_csv("../../output_files/physics_labeled_pairs.csv")

In [83]:
labeled_pairs = labeled_pairs[["topic_a", "topic_b"]]
labeled_pairs

Unnamed: 0,topic_a,topic_b
0,Magnetic field,Inertial frame of reference
1,Gravitational acceleration,Position (vector)
2,Energy,Planet
3,Mechanical energy,Gravitational field
4,Physics,Electric potential energy
...,...,...
1957,Mass,Le Sage's theory of gravitation
1958,Magnetic field,Length
1959,Voltage,Series and parallel circuits
1960,Physics,Velocity


In [84]:
all_data = pd.concat([features, labeled_pairs], axis = 1)

In [85]:
all_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f31,f32,f33,f34,f35,f36,f37,relation,topic_a,topic_b
0,321.0,190.0,1613.0,197.0,2.0,4.0,19.0,0.0,0.0,0.0,...,0.000217,908.0,504.0,87.0,0.000040,0.000086,1.118241e-05,0.0,Magnetic field,Inertial frame of reference
1,36.0,77.0,109.0,101.0,3.0,2.0,4.0,0.0,0.0,0.0,...,0.000000,230.0,161.0,21.0,0.000002,0.000002,9.914547e-08,1.0,Gravitational acceleration,Position (vector)
2,808.0,495.0,3104.0,2394.0,2.0,3.0,25.0,0.0,0.0,0.0,...,0.000717,961.0,1060.0,103.0,-0.000003,-0.000017,1.509753e-05,0.0,Energy,Planet
3,99.0,48.0,96.0,212.0,2.0,4.0,5.0,0.0,0.0,0.0,...,0.000000,140.0,128.0,19.0,0.000007,-0.000004,-1.144610e-05,0.0,Mechanical energy,Gravitational field
4,643.0,162.0,9935.0,129.0,1.0,5.0,21.0,0.0,0.0,0.0,...,0.000000,893.0,109.0,12.0,0.000095,0.000940,6.710326e-05,0.0,Physics,Electric potential energy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,350.0,203.0,4432.0,92.0,3.0,4.0,22.0,0.0,0.0,0.0,...,0.001753,727.0,1078.0,139.0,0.000010,0.000172,2.182559e-05,0.0,Mass,Le Sage's theory of gravitation
1958,321.0,59.0,1613.0,420.0,2.0,3.0,7.0,1.0,0.0,0.0,...,0.000000,908.0,104.0,30.0,0.000113,0.000062,7.647945e-06,0.0,Magnetic field,Length
1959,127.0,48.0,1368.0,269.0,4.0,1.0,11.0,0.0,0.0,4.0,...,0.008109,202.0,269.0,28.0,0.000062,0.000055,8.195897e-06,0.0,Voltage,Series and parallel circuits
1960,643.0,147.0,9935.0,851.0,1.0,4.0,24.0,0.0,0.0,0.0,...,0.001042,893.0,224.0,37.0,0.000150,0.000904,5.851008e-05,0.0,Physics,Velocity


In [86]:
all_data.to_csv("../../output_files/physics_pairs_data.csv")

In [98]:
all_data = pd.read_csv("../../output_files/physics_pairs_data.csv")
all_data = all_data.sample(frac = 1)
all_data = all_data.sample(frac = 1)
all_data.reset_index(inplace = True)
all_data = all_data.drop(columns = ["index", "Unnamed: 0"])

In [99]:
all_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f31,f32,f33,f34,f35,f36,f37,relation,topic_a,topic_b
0,161.0,347.0,1677.0,940.0,2.0,2.0,59.0,1.0,2.0,4.0,...,0.004845,495.0,525.0,57.0,0.000002,0.000041,7.133262e-06,0.0,Wavelength,Electromagnetic spectrum
1,109.0,180.0,332.0,1885.0,3.0,3.0,55.0,0.0,2.0,0.0,...,0.000000,298.0,410.0,36.0,-0.000062,-0.000293,-2.145507e-05,0.0,Ohm,Metre
2,256.0,643.0,2151.0,9935.0,3.0,1.0,30.0,0.0,0.0,0.0,...,0.000000,675.0,893.0,71.0,-0.000080,-0.000861,-5.752111e-05,1.0,Refractive index,Physics
3,240.0,99.0,683.0,130.0,3.0,4.0,91.0,0.0,1.0,0.0,...,0.007964,173.0,127.0,16.0,0.000070,0.000036,6.344666e-06,0.0,Motion,Relative velocity
4,184.0,457.0,157.0,991.0,3.0,5.0,32.0,0.0,0.0,0.0,...,0.000000,347.0,1225.0,77.0,0.000049,-0.000033,-1.149550e-05,1.0,Gravity of Earth,Force
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,242.0,163.0,537.0,301.0,3.0,2.0,21.0,0.0,0.0,0.0,...,0.001326,531.0,458.0,77.0,-0.000025,0.000016,1.387193e-05,0.0,Potential energy,Electrostatics
1958,71.0,419.0,423.0,1610.0,7.0,2.0,4.0,0.0,1.0,0.0,...,0.000000,172.0,726.0,27.0,-0.000030,-0.000084,-1.282900e-05,0.0,Fracture,Light
1959,198.0,148.0,603.0,477.0,2.0,3.0,18.0,0.0,0.0,0.0,...,0.000366,564.0,438.0,60.0,0.000026,0.000025,3.480464e-06,0.0,Wave,Dielectric
1960,112.0,149.0,442.0,275.0,3.0,6.0,11.0,1.0,0.0,0.0,...,0.004061,291.0,451.0,55.0,-0.000006,0.000017,1.987625e-07,0.0,Distance,Work (physics)


In [109]:
training_data = all_data.loc[:1362]
training_data.reset_index(inplace = True)
training_data = training_data.drop(columns = ["index"])


testing_data = all_data.loc[1362:]
testing_data.reset_index(inplace = True)
testing_data = testing_data.drop(columns = ["index"])

In [110]:
training_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f31,f32,f33,f34,f35,f36,f37,relation,topic_a,topic_b
0,161.0,347.0,1677.0,940.0,2.0,2.0,59.0,1.0,2.0,4.0,...,0.004845,495.0,525.0,57.0,0.000002,0.000041,0.000007,0.0,Wavelength,Electromagnetic spectrum
1,109.0,180.0,332.0,1885.0,3.0,3.0,55.0,0.0,2.0,0.0,...,0.000000,298.0,410.0,36.0,-0.000062,-0.000293,-0.000021,0.0,Ohm,Metre
2,256.0,643.0,2151.0,9935.0,3.0,1.0,30.0,0.0,0.0,0.0,...,0.000000,675.0,893.0,71.0,-0.000080,-0.000861,-0.000058,1.0,Refractive index,Physics
3,240.0,99.0,683.0,130.0,3.0,4.0,91.0,0.0,1.0,0.0,...,0.007964,173.0,127.0,16.0,0.000070,0.000036,0.000006,0.0,Motion,Relative velocity
4,184.0,457.0,157.0,991.0,3.0,5.0,32.0,0.0,0.0,0.0,...,0.000000,347.0,1225.0,77.0,0.000049,-0.000033,-0.000011,1.0,Gravity of Earth,Force
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1358,326.0,147.0,499.0,851.0,2.0,4.0,3.0,0.0,1.0,0.0,...,0.000000,152.0,224.0,10.0,0.000062,-0.000023,-0.000007,0.0,Dispersion (optics),Velocity
1359,198.0,76.0,603.0,161.0,2.0,4.0,7.0,0.0,0.0,0.0,...,0.003275,564.0,437.0,35.0,0.000082,0.000028,0.000007,0.0,Wave,Sonic boom
1360,426.0,162.0,1509.0,918.0,3.0,4.0,104.0,1.0,1.0,1.0,...,0.001181,405.0,221.0,46.0,0.000038,0.000071,0.000006,0.0,Gravity,Acceleration
1361,162.0,112.0,918.0,442.0,4.0,3.0,17.0,0.0,0.0,0.0,...,0.000000,221.0,291.0,30.0,0.000003,0.000006,0.000003,1.0,Acceleration,Distance


In [111]:
testing_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f31,f32,f33,f34,f35,f36,f37,relation,topic_a,topic_b
0,77.0,18.0,101.0,20.0,2.0,1.0,4.0,0.0,0.0,0.0,...,0.000000,161.0,106.0,11.0,0.000057,0.000002,1.697514e-06,0.0,Position (vector),Projectile motion
1,103.0,444.0,429.0,3746.0,1.0,8.0,16.0,0.0,3.0,0.0,...,0.000000,404.0,966.0,46.0,-0.000204,-0.000205,-2.864012e-05,0.0,Shock wave,Temperature
2,485.0,117.0,2700.0,110.0,7.0,6.0,15.0,0.0,0.0,1.0,...,0.000000,1253.0,347.0,72.0,0.000139,0.000187,1.503403e-05,0.0,Laser,Absorption spectroscopy
3,178.0,73.0,541.0,808.0,2.0,4.0,1.0,0.0,0.0,0.0,...,0.002745,182.0,209.0,14.0,-0.000031,-0.000014,9.577774e-07,0.0,Speed,Refraction
4,643.0,190.0,9935.0,197.0,1.0,4.0,40.0,0.0,0.0,1.0,...,0.002384,893.0,504.0,92.0,0.000080,0.000936,6.577707e-05,0.0,Physics,Inertial frame of reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,242.0,163.0,537.0,301.0,3.0,2.0,21.0,0.0,0.0,0.0,...,0.001326,531.0,458.0,77.0,-0.000025,0.000016,1.387193e-05,0.0,Potential energy,Electrostatics
596,71.0,419.0,423.0,1610.0,7.0,2.0,4.0,0.0,1.0,0.0,...,0.000000,172.0,726.0,27.0,-0.000030,-0.000084,-1.282900e-05,0.0,Fracture,Light
597,198.0,148.0,603.0,477.0,2.0,3.0,18.0,0.0,0.0,0.0,...,0.000366,564.0,438.0,60.0,0.000026,0.000025,3.480464e-06,0.0,Wave,Dielectric
598,112.0,149.0,442.0,275.0,3.0,6.0,11.0,1.0,0.0,0.0,...,0.004061,291.0,451.0,55.0,-0.000006,0.000017,1.987625e-07,0.0,Distance,Work (physics)


In [113]:
training_data.to_csv("../../output_files/training_data.csv")
testing_data.to_csv("../../output_files/testing_data.csv")

In [114]:
df_testing = pd.read_csv("../../output_files/testing_data.csv")

In [115]:
df_testing = df_testing[["topic_a", "topic_b", "relation"]]

In [116]:
df_testing

Unnamed: 0,topic_a,topic_b,relation
0,Position (vector),Projectile motion,0.0
1,Shock wave,Temperature,0.0
2,Laser,Absorption spectroscopy,0.0
3,Speed,Refraction,0.0
4,Physics,Inertial frame of reference,0.0
...,...,...,...
595,Potential energy,Electrostatics,0.0
596,Fracture,Light,0.0
597,Wave,Dielectric,0.0
598,Distance,Work (physics),0.0
