In [33]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

train = pd.read_csv(r'Data/new_train_data.csv', dtype = {'node1': 'int32', 'node2': 'int32', 'label': 'int32'})
test = pd.read_csv(r'Data/new_test_data.csv', dtype = {'node1': 'int32', 'node2': 'int32'})

In [54]:
train = train[train.node1 != train.node2]
test = test[test.node1 != test.node2]

In [55]:
from core import Graph

graph_out, graph_in, graph_all = Graph(), Graph(), Graph()

for _, row in train[train['label'] == 1].iterrows():
    graph_out.add_edge(row['node1'], row['node2'])
    graph_in.add_edge(row['node2'], row['node1'])
    graph_all.add_edge(row['node1'], row['node2'])
    graph_all.add_edge(row['node2'], row['node1'])

In [56]:
def cal_fun_score(df, graph_data, direction='out'):

    print(f'Calculating {direction} features...')
    
    if direction == 'out' or direction == 'all':
        node1 = 'node1'
        node2 = 'node2'
    elif direction == 'in':
        node1 = 'node2'
        node2 = 'node1'

    df[f'node1_{direction}'] = df['node1'].apply(lambda node: graph_data.get_neighbors_size(node))
    df[f'node_cn_{direction}'] = df.apply(lambda row: graph_data.common_neighbors(row[node1], row[node2]), axis=1)
    df[f'node_jc_{direction}'] = df.apply(lambda row: graph_data.jaccard_coefficient(row[node1], row[node2]), axis=1)
    # df[f'node_aa_{direction}'] = graph_data.apply(lambda row: graph_out.adamic_adar(row[node1], row[node1]), axis=1)
    df[f'node_pa_{direction}'] = df.apply(lambda row: graph_data.preferential_attachment(row[node1], row[node2]), axis=1)
    # df[f'node_sp_{direction}'] = df.apply(lambda row: graph_data.shortest_path(row[node1], row[node2]), axis=1)

In [57]:
cal_fun_score(df=train, graph_data=graph_out, direction='out')
cal_fun_score(df=train, graph_data=graph_in, direction='in')
# cal_fun_score(df=train, graph_data=graph_all, direction='all')

cal_fun_score(df=test, graph_data=graph_out, direction='out')
cal_fun_score(df=test, graph_data=graph_in, direction='in')
# cal_fun_score(df=test, graph_data=graph_all, direction='all')

Calculating out features...
Calculating in features...
Calculating out features...
Calculating in features...


In [58]:
x_col = train.columns[3:].to_list()
y_col = 'label'

In [59]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
rfc.fit(train[x_col], train[y_col])

xgb = XGBClassifier(n_estimators=100, max_depth=10, random_state=0)
xgb.fit(train[x_col], train[y_col])

lr = LogisticRegression(random_state=0)
lr.fit(train[x_col], train[y_col])

In [60]:
def submit(model, filename):
    s = pd.DataFrame(
        {
            "node_pair_id": list(test.node_pair_id),
            "ans": list(model.predict(test[x_col]))
        }
    )
    s.to_csv(f"{filename}.csv", index=False)

In [61]:
submit(rfc, 'rfc')
submit(xgb, 'xgb')
submit(lr, 'lr')

In [62]:
s = pd.DataFrame(
    {
        "rfc": rfc.predict(test[x_col]),
        "xgb": xgb.predict(test[x_col]),
        "lr": lr.predict(test[x_col])
    }
)

In [63]:
s.rfc.value_counts(), s.xgb.value_counts(), s.lr.value_counts()

(0    3371
 1    2628
 Name: rfc, dtype: int64,
 0    3617
 1    2382
 Name: xgb, dtype: int64,
 1    3288
 0    2711
 Name: lr, dtype: int64)

In [43]:
s.rfc.value_counts(), s.xgb.value_counts(), s.lr.value_counts()

(0    3345
 1    2655
 Name: rfc, dtype: int64,
 0    3606
 1    2394
 Name: xgb, dtype: int64,
 1    3300
 0    2700
 Name: lr, dtype: int64)

In [44]:
s.rfc.value_counts(), s.xgb.value_counts(), s.lr.value_counts()

(0    3345
 1    2655
 Name: rfc, dtype: int64,
 0    3606
 1    2394
 Name: xgb, dtype: int64,
 1    3300
 0    2700
 Name: lr, dtype: int64)

In [45]:
s.rfc.value_counts(), s.xgb.value_counts(), s.lr.value_counts()

(0    3345
 1    2655
 Name: rfc, dtype: int64,
 0    3606
 1    2394
 Name: xgb, dtype: int64,
 1    3300
 0    2700
 Name: lr, dtype: int64)

In [46]:
s['node_pair_id'] = test.node_pair_id
s['ans'] = s[['rfc', 'xgb', 'lr']].apply(lambda x: 1 if x.sum() >= 3 else 0, axis = 1)

In [47]:
# s[['node_pair_id', 'ans']].to_csv('ensemble.csv', index = False)

In [48]:
s.ans.value_counts()

0    4366
1    1634
Name: ans, dtype: int64

In [49]:
s[s.rfc != s.xgb]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
9,1,0,1,9,0
15,1,0,0,15,0
16,1,0,1,16,0
20,1,0,0,20,0
22,1,0,1,22,0
...,...,...,...,...,...
5967,1,0,1,5967,0
5969,1,0,1,5969,0
5978,1,0,0,5978,0
5986,1,0,1,5986,0


In [50]:
s[s.rfc != s.lr]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
0,0,0,1,0,0
8,1,1,0,8,0
11,0,0,1,11,0
12,0,0,1,12,0
15,1,0,0,15,0
...,...,...,...,...,...
5981,0,0,1,5981,0
5989,1,1,0,5989,0
5990,0,0,1,5990,0
5996,0,0,1,5996,0


In [51]:
s[s.xgb != s.lr]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
0,0,0,1,0,0
8,1,1,0,8,0
9,1,0,1,9,0
11,0,0,1,11,0
12,0,0,1,12,0
...,...,...,...,...,...
5989,1,1,0,5989,0
5990,0,0,1,5990,0
5991,1,0,1,5991,0
5996,0,0,1,5996,0


In [52]:
x_col

['node1_out',
 'node_cn_out',
 'node_jc_out',
 'node_pa_out',
 'node1_in',
 'node_cn_in',
 'node_jc_in',
 'node_pa_in']

In [53]:
train.describe()

Unnamed: 0,node1,node2,label,node1_out,node_cn_out,node_jc_out,node_pa_out,node1_in,node_cn_in,node_jc_in,node_pa_in
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,25907.957083,29898.270208,0.5,5.506958,-0.463958,-0.509745,16.40575,1.82425,-0.459958,-0.505612,16.15275
std,14876.741795,13657.498777,0.50001,5.095115,0.600388,0.504042,37.35335,3.75906,0.597351,0.503904,37.71904
min,454.0,458.0,0.0,-1.0,-2.0,-1.0,-21.0,-1.0,-2.0,-1.0,-21.0
25%,8596.75,17031.75,0.0,1.0,-1.0,-1.0,-2.0,-1.0,-1.0,-1.0,-1.0
50%,32137.0,38136.0,0.5,4.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,1.0
75%,38542.0,38772.0,1.0,9.0,0.0,0.0,20.0,4.0,0.0,0.0,18.0
max,52845.0,52841.0,1.0,21.0,4.0,0.666667,378.0,20.0,3.0,0.666667,323.0


In [20]:
from collections import deque
q = deque()

In [21]:
q

deque([])

In [22]:
q.append(("1111", 1))

In [23]:
q

deque([('1111', 1)])

In [24]:
q.popleft()

('1111', 1)

In [25]:
q

deque([])

In [26]:
my_dict = {"k": [1, 2,3,4], "B": [1,3, 5,7]}

unique_list = list(set(sum(my_dict.values(), [])) | set(my_dict.keys()))

print(unique_list)  # output: ['k', 1, 2, 3, 4, 5, 7, 'B']


[1, 2, 3, 4, 5, 'B', 7, 'k']
