In [39]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import csv
import pandas as pd
import os
import numpy as np
from numpy.linalg import norm

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import average_precision_score
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import mean_squared_error
from sklearn.metrics import recall_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import statistics as st



In [40]:
# read in training file and convert to array like matrix
#load trans clos of that DS

os.chdir('../prepped_csv')
train = pd.read_csv('alt_gold_trans_clos_embeddings.csv', header=0, sep="\t")
test = pd.read_csv('alt_test_chrystal_clos_embeddings.csv', sep='\t', header=0)


In [41]:
def poin_dist(tensor1, tensor2):
    return np.arcsinh(1 + (2*(norm(tensor1 - tensor2)))/ ((1 - norm(tensor1) ** 2)*(1 - norm(tensor2) ** 2)))

In [42]:

train['id1 freq'] = train.groupby('id1')['id1'].transform('count')
train['id2 freq'] = train.groupby('id2')['id2'].transform('count')
train.head()

test['id1 freq'] = test.groupby('id1')['id1'].transform('count')
test['id2 freq'] = test.groupby('id2')['id2'].transform('count')
test.head()

Unnamed: 0,id1,id2,1_x,1_y,2_x,2_y,class label,id1 freq,id2 freq
0,186201345,189721042,-0.950052,-0.013506,-0.886165,-0.013648,0.0,69,8
1,186131996,261481841,-0.317058,-0.627595,-0.299037,-0.619111,1.0,24,8
2,186131996,261481841,-0.317058,-0.627595,-0.299037,-0.619111,1.0,24,8
3,484995553,262491683,0.322711,0.400252,0.306095,0.413837,1.0,3,7
4,186201601,253227719,-0.598286,-0.800964,-0.425721,-0.806831,0.0,871,5


In [43]:
#Change current working directory for reading processed feature files and saving intermediate computed features
os.chdir('../feature_csv')

id1_neighs = pd.read_csv('id1_neighbours.csv', sep=',',header=0)
id2_neighs = pd.read_csv('id2_neighbours.csv', sep=',',header=0)
c_train = train.merge(id1_neighs, on=['id1'],how='left')
c_train.head()

Unnamed: 0,id1,id2,1_x,1_y,2_x,2_y,class label,id1 freq,id2 freq,id1_neighs
0,249564556,209895253,-0.340766,0.715908,-0.278152,0.843449,1.0,14,1,"[189967549, 209895253, 252925068, 252925068]"
1,250233077,186201601,0.226315,0.801542,-0.160874,-0.594917,0.0,16,871,"[210630498, 210630498, 248833274, 210630498]"
2,252578211,186201601,0.069049,-0.990024,-0.160874,-0.594917,0.0,7,871,[253206593]
3,192557288,170589394,-0.669857,-0.686993,-0.668075,-0.621127,0.0,18,33,"[209895092, 248811486, 257945222, 209895092]"
4,522426882,170589324,-0.374436,0.901481,-0.266791,0.697339,0.0,5,474,[189720839]


In [44]:
#merge again on id2 to then compute n_common neighbours

c_train = c_train.merge(id2_neighs, on='id2', how='left')
c_train.head()

Unnamed: 0,id1,id2,1_x,1_y,2_x,2_y,class label,id1 freq,id2 freq,id1_neighs,id2_neighs
0,249564556,209895253,-0.340766,0.715908,-0.278152,0.843449,1.0,14,1,"[189967549, 209895253, 252925068, 252925068]",[249564556]
1,250233077,186201601,0.226315,0.801542,-0.160874,-0.594917,0.0,16,871,"[210630498, 210630498, 248833274, 210630498]","[186201345, 170589404, 210636295, 186201603]"
2,252578211,186201601,0.069049,-0.990024,-0.160874,-0.594917,0.0,7,871,[253206593],"[186201345, 170589404, 210636295, 186201603]"
3,192557288,170589394,-0.669857,-0.686993,-0.668075,-0.621127,0.0,18,33,"[209895092, 248811486, 257945222, 209895092]","[190195190, 210638310, 257947224, 257946310, 1..."
4,522426882,170589324,-0.374436,0.901481,-0.266791,0.697339,0.0,5,474,[189720839],"[170589394, 210636186, 170589400, 170589402, 1..."


In [45]:
df_out = pd.DataFrame()

for index,row in c_train.iterrows():
    left = set(row.id1_neighs)
    right = row.id2_neighs
    
    common_count = len(left.intersection(right))
    c_train['n_comm_neighs'] = df_out.append([common_count])

val = {'n_comm_neighs':0}
train = c_train.fillna(value=val)

In [46]:
c_test = test.merge(id1_neighs, on=['id1'],how='left')
c_test = c_test.merge(id2_neighs, on='id2', how='left')

df_out = pd.DataFrame()

for index,row in c_test.iterrows():
    
    if pd.isna(row.id1_neighs):
        left = set([999999999])
    else:
        left = set(row.id1_neighs)
        
    right = row.id2_neighs
    
    if pd.isna(right):
        right = [999999999]
    
    common_count = len(left.intersection(right))
    c_test['n_comm_neighs'] = df_out.append([common_count])

val = {'n_comm_neighs':0}
test = c_test.fillna(value=val)
test.head()

Unnamed: 0,id1,id2,1_x,1_y,2_x,2_y,class label,id1 freq,id2 freq,id1_neighs,id2_neighs,n_comm_neighs
0,186201345,189721042,-0.950052,-0.013506,-0.886165,-0.013648,0.0,69,8,[186201601],,0.0
1,186131996,261481841,-0.317058,-0.627595,-0.299037,-0.619111,1.0,24,8,"[189723279, 248811471]","[192559053, 192559053]",0.0
2,186131996,261481841,-0.317058,-0.627595,-0.299037,-0.619111,1.0,24,8,"[189723279, 248811471]","[192559053, 192559053]",0.0
3,484995553,262491683,0.322711,0.400252,0.306095,0.413837,1.0,3,7,[190195185],[249563834],0.0
4,186201601,253227719,-0.598286,-0.800964,-0.425721,-0.806831,0.0,871,5,[189723269],,0.0


In [47]:
test = test[['id1', 'id2', '1_x', '1_y', '2_x', '2_y', 'class label', 'id1 freq','id2 freq', 'n_comm_neighs']]
train = train[['id1', 'id2', '1_x', '1_y', '2_x', '2_y', 'class label', 'id1 freq','id2 freq', 'n_comm_neighs']]

In [48]:
# read in feature files for  merging


node_variance = pd.read_csv('test_variance_distance of all nodes.csv', sep=',',header=None)
node_variance = node_variance.rename(columns={0: 'id1', 1: 'variance_distance'})

two_hop = pd.read_csv('two_hops_node_list.csv', sep=',',header=None)
two_hop = two_hop.rename(columns={0: 'id1', 1: 'id2',2: 'two_hop_bool'})



In [49]:
#merge to test-set
test = test.merge(two_hop, on=['id1','id2'],how='left')
#0 as there is no two hop path
test = test.fillna(0)

test = test.merge(node_variance, on='id1',how='left')
#indicates very high median distance
test = test.fillna(100)

test.head()

Unnamed: 0,id1,id2,1_x,1_y,2_x,2_y,class label,id1 freq,id2 freq,n_comm_neighs,two_hop_bool,variance_distance
0,186201345,189721042,-0.950052,-0.013506,-0.886165,-0.013648,0.0,69,8,0.0,0.0,0.964979
1,186131996,261481841,-0.317058,-0.627595,-0.299037,-0.619111,1.0,24,8,0.0,0.0,0.598694
2,186131996,261481841,-0.317058,-0.627595,-0.299037,-0.619111,1.0,24,8,0.0,0.0,0.598694
3,484995553,262491683,0.322711,0.400252,0.306095,0.413837,1.0,3,7,0.0,0.0,0.387707
4,186201601,253227719,-0.598286,-0.800964,-0.425721,-0.806831,0.0,871,5,0.0,0.0,0.848654


In [50]:
train = train.merge(two_hop, on=['id1','id2'],how='left')
#0 as there is no two hop path
train = train.fillna(0)

node_variance = pd.read_csv('train_variance_distance of all nodes.csv', sep=',',header=None)
node_variance = node_variance.rename(columns={0: 'id1', 1: 'variance_distance'})

train = train.merge(node_variance, on='id1',how='left')
#indicates very high median distance
train = train.fillna(100)

train.head()

Unnamed: 0,id1,id2,1_x,1_y,2_x,2_y,class label,id1 freq,id2 freq,n_comm_neighs,two_hop_bool,variance_distance
0,249564556,209895253,-0.340766,0.715908,-0.278152,0.843449,1.0,14,1,14.0,0.0,0.750247
1,250233077,186201601,0.226315,0.801542,-0.160874,-0.594917,0.0,16,871,0.0,0.0,0.598403
2,252578211,186201601,0.069049,-0.990024,-0.160874,-0.594917,0.0,7,871,0.0,0.0,0.688657
3,192557288,170589394,-0.669857,-0.686993,-0.668075,-0.621127,0.0,18,33,0.0,0.0,0.803718
4,522426882,170589324,-0.374436,0.901481,-0.266791,0.697339,0.0,5,474,0.0,0.0,0.74937


In [51]:
# get index of very root which is science-dummy node
root =  train[(train.id1 == 189723269) & (train.id2 == 999999999)]
root

Unnamed: 0,id1,id2,1_x,1_y,2_x,2_y,class label,id1 freq,id2 freq,n_comm_neighs,two_hop_bool,variance_distance
15056,189723269,999999999,-0.048931,-0.00421,0.026315,-0.142341,1.0,1,2555,0.0,0.0,0.74669


In [52]:
#OMG NEVER AGAIN TEST ACCURACY DROPPED BY ALMOST 50%
# upsampling positive samples for test and training set

all_pos = train[train['class label']== 1.0]
all_pos = all_pos.rename(columns={'id1': 'id2', 'id2': 'id1','1_x':'2_x','1_y':'2_y','2_x':'1_x','2_y':'1_y'})

train = pd.concat([train,all_pos])

all_post = test[test['class label']== 1.0]
all_post = all_post.rename(columns={'id1': 'id2', 'id2': 'id1','1_x':'2_x','1_y':'2_y','2_x':'1_x','2_y':'1_y'})

test = pd.concat([test,all_post])

#this is simply not working...

In [53]:
#ccheck if anything is null
df=train[train.isnull().any(axis=1)]
df

Unnamed: 0,id1,id2,1_x,1_y,2_x,2_y,class label,id1 freq,id2 freq,n_comm_neighs,two_hop_bool,variance_distance


In [54]:
X_train = train[['1_x','1_y','2_x','2_y','id1 freq','id2 freq','n_comm_neighs','two_hop_bool','variance_distance']].to_numpy()
Y_train = train['class label'].to_numpy()

In [55]:
all_dists = []
radius_id1 = []
radius_id2 = []
center = np.array([0,0])
for row in X_train:

    
    x1,y1,x2,y2 = row[0:4]

    t1 = np.array(x1,y1)
    t2 = np.array(x2,y2)
    all_dists.append(poin_dist(t1,t2))
    radius_id1.append(poin_dist(center,t1))
    radius_id2.append(poin_dist(center,t2))

In [56]:
## save at least root node, this should always be in the training set, otherwise it would be too hard
X_train = pd.DataFrame(data=X_train)

X_train['edge dist'] = all_dists
X_train['radius_id1'] = radius_id1
X_train['radius_id2'] = radius_id2
# sqr_r_id1 = [i ** 2 for i in radius_id1]
# sqr_r_id2 = [i ** 2 for i in radius_id2]
# X_train['sq_r_id1'] = sqr_r_id1 
# X_train['sq_r_id2'] = sqr_r_id2
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,edge dist,radius_id1,radius_id2
0,-0.340766,0.715908,-0.278152,0.843449,14.0,1.0,14.0,0.0,0.750247,0.985901,1.483373,1.375754
1,0.226315,0.801542,-0.160874,-0.594917,16.0,871.0,0.0,0.0,0.598403,1.368684,1.287907,1.176379
2,0.069049,-0.990024,-0.160874,-0.594917,7.0,871.0,0.0,0.0,0.688657,1.180437,1.013562,1.176379
3,-0.669857,-0.686993,-0.668075,-0.621127,18.0,33.0,0.0,0.0,0.803718,0.889604,2.19553,2.190262
4,-0.374436,0.901481,-0.266791,0.697339,5.0,474.0,0.0,0.0,0.74937,1.059769,1.542725,1.356459


In [57]:
X_test = test[['1_x','1_y','2_x','2_y','id1 freq','id2 freq','n_comm_neighs','two_hop_bool','variance_distance']].to_numpy()

Y_test = test['class label'].to_numpy()

In [58]:
all_dists = []
radius_id1 = []
radius_id2 = []
center = np.array([0,0])
for row in X_test:

    
    x1,y1,x2,y2 = row[0:4]

    t1 = np.array(x1,y1)
    t2 = np.array(x2,y2)
    all_dists.append(poin_dist(t1,t2))
    radius_id1.append(poin_dist(center,t1))
    radius_id2.append(poin_dist(center,t2))

In [59]:
X_test = pd.DataFrame(data=X_test)

X_test['edge dist'] = all_dists
X_test['radius_id1'] = radius_id1
X_test['radius_id2'] = radius_id2
# sqr_r_id1 = [i ** 2 for i in radius_id1]
# sqr_r_id2 = [i ** 2 for i in radius_id2]
# X_test['sq_r_id1'] = sqr_r_id1 
# X_test['sq_r_id2'] = sqr_r_id2


X_test.head()
X_test = X_test.to_numpy()

In [60]:
# not nice but efective, check for coordinates instead of ids
#science(-0.048931 , -0.00421) dummy (0.026315 ,-0.142341)



X_tr, X_check, y_tr, y_check = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)



In [61]:
clf = RandomForestClassifier(max_depth=6,n_estimators=100)

clf.fit(X_tr,y_tr)
forest_pred3 = clf.predict(X_check)
clf.score(X_check,y_check)

print(classification_report(y_check,forest_pred3))
print(accuracy_score(y_check, forest_pred3))
print('Micro F1 Score:{}, Binary F1 score:{}'.format(f1_score(y_check, forest_pred3, average='micro'),f1_score(y_check, forest_pred3, average='binary')))
print(recall_score(y_check,forest_pred3))    

              precision    recall  f1-score   support

         0.0       0.97      0.87      0.91      4774
         1.0       0.79      0.94      0.86      2520

    accuracy                           0.89      7294
   macro avg       0.88      0.90      0.89      7294
weighted avg       0.90      0.89      0.89      7294

0.8926514943789416
Micro F1 Score:0.8926514943789416, Binary F1 score:0.8583830710797612
0.9416666666666667


## now for real life data

In [62]:
# not nice but efective, check for coordinates instead of ids
#science(-0.048931 , -0.00421) dummy (0.026315 ,-0.142341)



X_tr, X_check, y_tr, y_check = train_test_split(X_test, Y_test, test_size=0.33, random_state=42)



In [63]:
clf = RandomForestClassifier(max_depth=6,n_estimators=100)

clf.fit(X_tr,y_tr)
forest_pred4 = clf.predict(X_check)
clf.score(X_check,y_check)

print(classification_report(y_check,forest_pred4))
print(accuracy_score(y_check, forest_pred4))
print('Micro F1 Score:{}, Binary F1 score:{}'.format(f1_score(y_check, forest_pred4, average='micro'),f1_score(y_check, forest_pred4, average='binary')))
print(recall_score(y_check,forest_pred4))    

              precision    recall  f1-score   support

         0.0       0.97      0.89      0.93      3917
         1.0       0.85      0.96      0.90      2534

    accuracy                           0.92      6451
   macro avg       0.91      0.92      0.91      6451
weighted avg       0.92      0.92      0.92      6451

0.9155169741125407
Micro F1 Score:0.9155169741125407, Binary F1 score:0.8987929433611885
0.9550118389897395


In [64]:
from sklearn.metrics import precision_score
print(precision_score(y_check,forest_pred4,average='binary'))

0.8488249736934409
