In [1]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

train = pd.read_csv(r'Data/new_train_data.csv', dtype = {'node1': 'int32', 'node2': 'int32', 'label': 'int32'})
test = pd.read_csv(r'Data/new_test_data.csv', dtype = {'node1': 'int32', 'node2': 'int32'})

In [2]:
from core import Graph

graph_out = Graph()
graph_in  = Graph()

for _, row in train[train['label'] == 1].iterrows():
    graph_out.add_edge(row['node1'], row['node2'])
    graph_in.add_edge(row['node2'], row['node1'])

In [3]:
def cal_fun_score(df, graph_data, direction='out'):

    print(f'Calculating {direction} features...')
    
    if direction == 'out':
        node1 = 'node1'
        node2 = 'node2'
    else:
        node1 = 'node2'
        node2 = 'node1'
    
    df[f'node1_{direction}'] = df['node1'].apply(lambda node: graph_data.get_neighbors_size(node))
    df[f'node2_{direction}'] = df['node2'].apply(lambda node: graph_data.get_neighbors_size(node))
    
    df[f'node_cn_{direction}'] = df.apply(lambda row: graph_data.common_neighbors(row[node1], row[node2]), axis=1)
    df[f'node_jc_{direction}'] = df.apply(lambda row: graph_data.jaccard_coefficient(row[node1], row[node2]), axis=1)
    df[f'node_ks_{direction}'] = df.apply(lambda row: graph_data.katz_score(row[node1], row[node1]), axis=1)
    df[f'node_pa_{direction}'] = df.apply(lambda row: graph_data.preferential_attachment(row[node1], row[node2]), axis=1)
    df[f'node_aa_{direction}'] = df.apply(lambda row: graph_data.adamic_adar(row[node1], row[node2]), axis=1)

In [4]:
cal_fun_score(df=train, graph_data=graph_out, direction='out')
cal_fun_score(df=train, graph_data=graph_in, direction='in')

cal_fun_score(df=test, graph_data=graph_out, direction='out')
cal_fun_score(df=test, graph_data=graph_in, direction='in')

Calculating out features...
Calculating in features...
Calculating out features...
Calculating in features...


In [5]:
ii = train.node1_out >= graph_out.get_average_degree

In [7]:
train[ii]

Unnamed: 0,node1,node2,label,node1_out,node2_out,node_cn_out,node_jc_out,node_ks_out,node_pa_out,node_aa_out,node1_in,node2_in,node_cn_in,node_jc_in,node_ks_in,node_pa_in,node_aa_in
1,38751,38824,1,7,1,0,0.000000,9.607990e+05,7,0.000000,11,9,0,0.000000,4.358480e+08,99,0.0
2,23013,7184,0,9,-1,-1,-1.000000,4.358480e+08,-9,-1.000000,-1,-1,-1,-1.000000,-1.000000e+00,1,-1.0
3,38000,38145,1,13,-1,-1,-1.000000,3.281147e+14,-13,-1.000000,1,8,0,0.000000,1.917396e+07,8,0.0
6,38041,38100,1,9,7,0,0.000000,4.358480e+08,63,0.000000,2,7,1,0.125000,9.607990e+05,14,-1.0
7,38517,38602,1,11,6,0,0.000000,3.138428e+11,66,0.000000,1,7,0,0.000000,9.607990e+05,7,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23991,38679,38790,1,10,4,1,0.076923,1.111111e+10,40,0.621335,4,20,0,0.000000,1.103764e+26,80,0.0
23992,40014,26051,0,3,-1,-1,-1.000000,3.900000e+01,-3,-1.000000,-1,1,-1,-1.000000,1.000000e+00,-1,-1.0
23993,16479,17086,1,4,-1,-1,-1.000000,3.400000e+02,-4,-1.000000,-1,2,-1,-1.000000,6.000000e+00,-2,-1.0
23994,29963,1296,0,21,1,0,0.000000,6.134716e+27,21,0.000000,-1,1,-1,-1.000000,1.000000e+00,-1,-1.0


In [13]:
# DegreeBased

train_node1_out_gt = train[train.node1_out >= graph_out.get_average_degree]
train_node2_out_gt = train[train.node2_out >= graph_out.get_average_degree]

train_node1_in_gt = train[train.node1_in >= graph_in.get_average_degree]
train_node2_in_gt = train[train.node2_in >= graph_in.get_average_degree]

train_node1_out_l = train[train.node1_out < graph_out.get_average_degree]
train_node1_in_l = train[train.node1_in < graph_in.get_average_degree]

In [6]:
x_col = train.columns[3:].to_list()
y_col = 'label'

In [7]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
rfc.fit(train[x_col], train[y_col])

xgb = XGBClassifier(n_estimators=100, max_depth=10, random_state=0)
xgb.fit(train[x_col], train[y_col])

lr = LogisticRegression(random_state=0)
lr.fit(train[x_col], train[y_col])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
def submit(model, filename):
    s = pd.DataFrame(
        {
            "node_pair_id": list(test.node_pair_id),
            "ans": list(model.predict(test[x_col]))
        }
    )
    s.to_csv(f"{filename}.csv", index=False)

In [9]:
submit(rfc, 'rfc')
submit(xgb, 'xgb')
submit(lr, 'lr')

In [10]:
s = pd.DataFrame(
    {
        "rfc": rfc.predict(test[x_col]),
        "xgb": xgb.predict(test[x_col]),
        "lr": lr.predict(test[x_col])
    }
)

In [11]:
s.rfc.value_counts(), s.xgb.value_counts(), s.lr.value_counts()

(0    3310
 1    2690
 Name: rfc, dtype: int64,
 0    3570
 1    2430
 Name: xgb, dtype: int64,
 1    3299
 0    2701
 Name: lr, dtype: int64)

In [12]:
s['node_pair_id'] = test.node_pair_id
s['ans'] = s[['rfc', 'xgb', 'lr']].apply(lambda x: 1 if x.sum() >= 2 else 0, axis = 1)

In [13]:
# s[['node_pair_id', 'ans']].to_csv('ensemble.csv', index = False)

In [14]:
s.ans.value_counts()

0    3459
1    2541
Name: ans, dtype: int64

In [15]:
s[s.rfc != s.xgb]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
9,1,0,1,9,1
20,1,0,0,20,0
22,1,0,1,22,1
28,0,1,1,28,1
34,0,1,1,34,1
...,...,...,...,...,...
5951,1,0,1,5951,1
5952,1,0,0,5952,0
5967,1,0,1,5967,1
5969,1,0,1,5969,1


In [16]:
s[s.rfc != s.lr]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
0,0,0,1,0,0
2,1,1,0,2,1
8,1,1,0,8,1
11,0,0,1,11,0
12,0,0,1,12,0
...,...,...,...,...,...
5981,0,0,1,5981,0
5989,1,1,0,5989,1
5990,0,0,1,5990,0
5996,0,0,1,5996,0


In [17]:
s[s.xgb != s.lr]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
0,0,0,1,0,0
2,1,1,0,2,1
8,1,1,0,8,1
9,1,0,1,9,1
11,0,0,1,11,0
...,...,...,...,...,...
5986,1,0,1,5986,1
5989,1,1,0,5989,1
5990,0,0,1,5990,0
5996,0,0,1,5996,0


In [18]:
x_col

['node1_out',
 'node_cn_out',
 'node_jc_out',
 'node_pa_out',
 'node_aa_out',
 'node1_in',
 'node_cn_in',
 'node_jc_in',
 'node_pa_in',
 'node_aa_in']