In [1]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

train = pd.read_csv(r'Data/new_train_data.csv', dtype = {'node1': 'int32', 'node2': 'int32', 'label': 'int32'})
test = pd.read_csv(r'Data/new_test_data.csv', dtype = {'node1': 'int32', 'node2': 'int32'})

from core import Graph

graph_out = Graph()
graph_in  = Graph()

for _, row in train[train['label'] == 1].iterrows():
    graph_out.add_edge(row['node1'], row['node2'])
    graph_in.add_edge(row['node2'], row['node1'])

def cal_fun_score(df, graph_data, direction='out'):

    print(f'Calculating {direction} features...')
    
    if direction == 'out':
        node1 = 'node1'
        node2 = 'node2'
    else:
        node1 = 'node2'
        node2 = 'node1'
    
    df[f'node1_{direction}'] = df['node1'].apply(lambda node: graph_data.get_neighbors_size(node))
    df[f'node2_{direction}'] = df['node2'].apply(lambda node: graph_data.get_neighbors_size(node))
    
    df[f'node_cn_{direction}'] = df.apply(lambda row: graph_data.common_neighbors(row[node1], row[node2]), axis=1)
    df[f'node_jc_{direction}'] = df.apply(lambda row: graph_data.jaccard_coefficient(row[node1], row[node2]), axis=1)
    df[f'node_ks_{direction}'] = df.apply(lambda row: graph_data.katz_score(row[node1], row[node1]), axis=1)
    df[f'node_pa_{direction}'] = df.apply(lambda row: graph_data.preferential_attachment(row[node1], row[node2]), axis=1)
    df[f'node_aa_{direction}'] = df.apply(lambda row: graph_data.adamic_adar(row[node1], row[node2]), axis=1)

In [2]:
cal_fun_score(df=train, graph_data=graph_out, direction='out')
cal_fun_score(df=train, graph_data=graph_in, direction='in')

cal_fun_score(df=test, graph_data=graph_out, direction='out')
cal_fun_score(df=test, graph_data=graph_in, direction='in')

Calculating out features...
Calculating in features...
Calculating out features...
Calculating in features...


In [3]:
train.label.value_counts()

0    12000
1    12000
Name: label, dtype: int64

In [6]:
len(set(graph_out.get_nodes))

10230

In [7]:
graph_out.get_average_degree

2.1417097983223274

In [8]:
graph_in.get_average_degree

1.9746585486259667

In [19]:
new = set(set(test.node1) | set(test.node2))
old = set(graph_in.get_nodes)

In [23]:
len(new.difference(old))

1880

In [25]:
len(new)

5512

In [27]:
len(set(test.node1).union(set(test.node2)))

5512

In [28]:
1880/5512

0.34107402031930334

In [29]:
train

Unnamed: 0,node1,node2,label,node1_out,node2_out,node_cn_out,node_jc_out,node_ks_out,node_pa_out,node_aa_out,node1_in,node2_in,node_cn_in,node_jc_in,node_ks_in,node_pa_in,node_aa_in
0,9112,38149,0,1,2,0,0.0,1.000000e+00,2,0.0,-1,6,-1,-1.000000,5.598600e+04,-6,-1.0
1,38751,38824,1,7,1,0,0.0,9.607990e+05,7,0.0,11,9,0,0.000000,4.358480e+08,99,0.0
2,23013,7184,0,9,-1,-1,-1.0,4.358480e+08,-9,-1.0,-1,-1,-1,-1.000000,-1.000000e+00,1,-1.0
3,38000,38145,1,13,-1,-1,-1.0,3.281147e+14,-13,-1.0,1,8,0,0.000000,1.917396e+07,8,0.0
4,37109,8452,0,2,13,0,0.0,6.000000e+00,26,0.0,-1,1,-1,-1.000000,1.000000e+00,-1,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,28115,28124,1,1,-1,-1,-1.0,1.000000e+00,-1,-1.0,-1,1,-1,-1.000000,1.000000e+00,-1,-1.0
23996,38135,38141,1,8,5,0,0.0,1.917396e+07,40,0.0,4,9,1,0.083333,4.358480e+08,36,-1.0
23997,3970,38226,0,1,3,0,0.0,1.000000e+00,3,0.0,-1,13,-1,-1.000000,3.281147e+14,-13,-1.0
23998,11727,38264,0,2,3,0,0.0,6.000000e+00,6,0.0,1,14,0,0.000000,1.196678e+16,14,0.0
