In [18]:
def cal_func_score(df, graph_data, direction='out', name=''):
    """Append neighbourhood-based link features for every (node1, node2) row.

    Four columns are written onto ``df`` itself; the frame is modified in
    place and the very same object is returned:

    * ``node1_{direction}_{name}``   -- ``graph_data.get_neighbors_size`` of the
      ``node1`` column (always ``node1``, whatever ``direction`` is).
    * ``node_cn_{direction}_{name}`` -- ``graph_data.common_neighbors``.
    * ``node_jc_{direction}_{name}`` -- ``graph_data.jaccard_coefficient``.
    * ``node_pa_{direction}_{name}`` -- ``graph_data.preferential_attachment``.

    Args:
        df: DataFrame with ``node1`` and ``node2`` columns.
        graph_data: object exposing the four methods named above.
        direction: ``'out'`` or ``'all'`` hand each pair to the graph as
            (node1, node2); ``'in'`` hands it over swapped, as (node2, node1).
        name: suffix appended to every generated column name.

    Returns:
        ``df`` with the feature columns added.

    Raises:
        ValueError: if ``direction`` is not ``'out'``, ``'in'`` or ``'all'``.
    """
    # Validate before touching df so a bad direction cannot leave it half-updated.
    if direction in ('out', 'all'):
        first_col, second_col = 'node1', 'node2'
    elif direction == 'in':
        first_col, second_col = 'node2', 'node1'
    else:
        raise ValueError(
            f"direction must be 'out', 'in' or 'all', got {direction!r}"
        )

    print(f'Calculating {name} features...')

    suffix = f'{direction}_{name}'

    # Iterate the two columns instead of df.apply(axis=1): row-wise apply
    # coerces each row to one dtype (node ids turn into floats once a float
    # feature column exists), is slow, and misbehaves on an empty frame.
    pairs = list(zip(df[first_col], df[second_col]))

    df[f'node1_{suffix}'] = [graph_data.get_neighbors_size(node) for node in df['node1']]
    df[f'node_cn_{suffix}'] = [graph_data.common_neighbors(u, v) for u, v in pairs]
    df[f'node_jc_{suffix}'] = [graph_data.jaccard_coefficient(u, v) for u, v in pairs]
    # Adamic-Adar and shortest-path features were sketched here earlier but are disabled.
    df[f'node_pa_{suffix}'] = [graph_data.preferential_attachment(u, v) for u, v in pairs]
    return df

def cal_func_score_in_out_all(df, graph_out, graph_in, graph_all):
    """Add the 'out', 'in' and 'all' feature sets to ``df`` and return it.

    ``cal_func_score`` hands back the frame it was given with the new columns
    attached, so the three passes are chained and the result is returned once.
    Concatenating the three return values side by side would repeat every
    column three times, because they are all the same frame.

    No ``name`` is passed, so the generated columns end in a bare underscore
    (for example ``node_cn_out_``).

    Args:
        df: DataFrame with ``node1`` and ``node2`` columns.
        graph_out: graph used for the ``'out'`` features.
        graph_in: graph used for the ``'in'`` features.
        graph_all: graph used for the ``'all'`` features.

    Returns:
        The frame carrying the feature columns of all three directions.
    """
    passes = ((graph_out, 'out'), (graph_in, 'in'), (graph_all, 'all'))
    for graph_data, direction in passes:
        df = cal_func_score(df, graph_data, direction=direction)
    return df

def generate_graph(df):
    """Build three graphs from an edge list held in ``node1``/``node2`` columns.

    For every row:

    * ``graph_out`` receives ``add_edge(node1, node2)``;
    * ``graph_in`` receives ``add_edge(node2, node1)``;
    * ``graph_all`` receives both of the above.

    Args:
        df: DataFrame with ``node1`` and ``node2`` columns, one edge per row.

    Returns:
        Tuple ``(graph_out, graph_in, graph_all)`` of ``Graph`` objects.
    """
    graph_out, graph_in, graph_all = Graph(), Graph(), Graph()

    # Read the two node columns directly. iterrows() would build a Series per
    # row and coerce the whole row to one dtype, so node ids could arrive as
    # floats whenever the frame also carries a float column.
    sources = df['node1'].to_numpy()
    targets = df['node2'].to_numpy()

    for src, dst in zip(sources, targets):
        graph_out.add_edge(src, dst)
        graph_in.add_edge(dst, src)
        graph_all.add_edge(src, dst)
        graph_all.add_edge(dst, src)

    return graph_out, graph_in, graph_all

In [19]:
import pandas as pd
import warnings
import datetime

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Training pairs; rows where node1 equals node2 are dropped right after loading.
train = pd.read_csv(
    r'Data/new_train_data.csv',
    dtype={'node1': 'int32', 'node2': 'int32', 'label': 'int32'},
).loc[lambda pairs: pairs.node1 != pairs.node2]

# Test pairs; only the two node columns are given an explicit dtype.
test = pd.read_csv(
    r'Data/new_test_data.csv',
    dtype={'node1': 'int32', 'node2': 'int32'},
)

In [20]:
from core import Graph
from core import DegreeBased
import operator

# Only the pairs labelled 1 are turned into graph edges.
graph_out, graph_in, graph_all = generate_graph(
    train.loc[train['label'].eq(1)]
)

In [32]:
import numpy as np
import itertools

# Comparison operators and degree thresholds handed to DegreeBased.
operations = [operator.le, operator.ge, operator.eq, operator.ne]
# A wider sweep (degrees 1 through 7) was used previously.
degrees = [1, 2]
graphs = [(graph_out, 'out'), (graph_in, 'in'), (graph_all, 'all')]

combinations = list(itertools.product(operations, degrees, graphs))

# Features are accumulated on copies so `train` and `test` stay untouched.
sparsified_train_df = train.copy()
sparsified_test_df = test.copy()

# `direction` instead of `dir`, which would shadow the builtin for the rest of the session.
for op, de, (g, direction) in combinations:
    feature_name = f'{direction}_{de}_{op.__name__}'

    # NOTE(review): `sg` is never used below -- the features are computed on
    # the original graph `g`, so every (degree, operation) pair may yield the
    # same values. Confirm whether `sg` should be passed to cal_func_score.
    sg = DegreeBased(graph=g, degree=de, operation=op).fit()

    # cal_func_score returns the frame it was given with the new columns
    # attached; keep that result instead of discarding a merge.
    sparsified_train_df = cal_func_score(sparsified_train_df, g, direction, name=feature_name)
    sparsified_test_df = cal_func_score(sparsified_test_df, g, direction, name=feature_name)

Calculating out_1_le features...
Calculating out_1_le features...
Calculating in_1_le features...
Calculating in_1_le features...
Calculating all_1_le features...
Calculating all_1_le features...
Calculating out_2_le features...
Calculating out_2_le features...
Calculating in_2_le features...
Calculating in_2_le features...
Calculating all_2_le features...
Calculating all_2_le features...
Calculating out_1_ge features...
Calculating out_1_ge features...
Calculating in_1_ge features...
Calculating in_1_ge features...
Calculating all_1_ge features...
Calculating all_1_ge features...
Calculating out_2_ge features...
Calculating out_2_ge features...
Calculating in_2_ge features...
Calculating in_2_ge features...
Calculating all_2_ge features...
Calculating all_2_ge features...
Calculating out_1_eq features...
Calculating out_1_eq features...
Calculating in_1_eq features...
Calculating in_1_eq features...
Calculating all_1_eq features...
Calculating all_1_eq features...
Calculating out_2_eq

In [33]:
# Target column.
y_col = 'label'

# Feature columns: every column except the two node ids and the target.
x_col = sparsified_train_df.columns.to_list()
for non_feature in ('node1', 'node2', 'label'):
    x_col.remove(non_feature)

In [40]:
# Training design matrix and target vector.
X_train = sparsified_train_df.loc[:, x_col]
y_train = sparsified_train_df.loc[:, y_col]

# Same feature columns, taken from the test frame.
X_test = sparsified_test_df.loc[:, x_col]

In [45]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so repeated runs build the same trees.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=20,
    random_state=0,
)
rf.fit(X_train, y_train)

In [48]:
# Display the training frame: node pair, label and the generated feature columns.
sparsified_train_df

Unnamed: 0,node1,node2,label,node1_out_out_1_le,node_cn_out_out_1_le,node_jc_out_out_1_le,node_pa_out_out_1_le,node1_in_in_1_le,node_cn_in_in_1_le,node_jc_in_in_1_le,...,node_jc_out_out_2_ne,node_pa_out_out_2_ne,node1_in_in_2_ne,node_cn_in_in_2_ne,node_jc_in_in_2_ne,node_pa_in_in_2_ne,node1_all_all_2_ne,node_cn_all_all_2_ne,node_jc_all_all_2_ne,node_pa_all_all_2_ne
0,9112,38149,0,1,0,0.0,2,-1,-1,-1.000000,...,0.0,2,-1,-1,-1.000000,-6,1,0,0.00,8
1,38751,38824,1,7,0,0.0,7,11,0,0.000000,...,0.0,7,11,0,0.000000,99,18,0,0.00,180
2,23013,7184,0,9,-1,-1.0,-9,-1,-1,-1.000000,...,-1.0,-9,-1,-1,-1.000000,1,9,-1,-1.00,-9
3,38000,38145,1,13,-1,-1.0,-13,1,0,0.000000,...,-1.0,-13,1,0,0.000000,8,14,0,0.00,112
4,37109,8452,0,2,0,0.0,26,-1,-1,-1.000000,...,0.0,26,-1,-1,-1.000000,-1,2,0,0.00,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,28115,28124,1,1,-1,-1.0,-1,-1,-1,-1.000000,...,-1.0,-1,-1,-1,-1.000000,-1,1,0,0.00,1
23996,38135,38141,1,8,0,0.0,40,4,1,0.083333,...,0.0,40,4,1,0.083333,36,12,1,0.04,168
23997,3970,38226,0,1,0,0.0,3,-1,-1,-1.000000,...,0.0,3,-1,-1,-1.000000,-13,1,0,0.00,16
23998,11727,38264,0,2,0,0.0,6,1,0,0.000000,...,0.0,6,1,0,0.000000,14,3,0,0.00,51


In [1]:
# Sample record. It is bound to `a` so the json.dump call further down has an
# object to write; the bare `a` on the last line keeps the cell echoing it.
a = {
    "Name": "RN Lee",
    "Age": 22,
    "Q1": "Yes",
    "Q2": [
        "cat",
        "dog",  # this comma was missing: "dog" "pig" silently became "dogpig"
        "pig",
    ],
}
a

In [2]:
import json

In [5]:
# Use a context manager so the file is closed (and its contents flushed) as
# soon as the dump finishes, instead of whenever the handle is collected.
with open('test.json', 'w') as json_file:
    json.dump(a, json_file)