In [1]:
import pandas as pd

# Read the training pairs (node1, node2, label) and the test pairs
# (node1, node2), forcing the listed columns to 32-bit integers.
train = pd.read_csv(
    r'Data/new_train_data.csv',
    dtype={'node1': 'int32', 'node2': 'int32', 'label': 'int32'},
)
test = pd.read_csv(
    r'Data/new_test_data.csv',
    dtype={'node1': 'int32', 'node2': 'int32'},
)

In [2]:
from core import Graph

# Two graphs are built from the positive (label == 1) training pairs only:
# `graph_out` stores every edge as node1 -> node2, `graph_in` stores the same
# edge reversed (node2 -> node1).
graph_out = Graph()
graph_in  = Graph()

# Iterate over plain (node1, node2) tuples of just the two needed columns;
# `iterrows()` would construct a full Series object for every row.
# NOTE(review): node ids now reach `add_edge` as Python ints (iterrows yielded
# NumPy int32 scalars). They hash and compare equal, but confirm that Graph
# does not depend on the scalar type.
positive_pairs = train.loc[train['label'] == 1, ['node1', 'node2']]
for src, dst in positive_pairs.itertuples(index=False, name=None):
    graph_out.add_edge(src, dst)
    graph_in.add_edge(dst, src)

In [3]:
def cal_fun_score(df, graph_data, direction='out'):
    """Add graph-derived feature columns to ``df`` in place.

    Only one feature is currently active: ``node1_{direction}``, which holds
    ``graph_data.get_neighbors_size(node)`` for every value of ``df['node1']``.
    The pairwise features listed at the bottom are disabled.

    Args:
        df: DataFrame with a ``node1`` column. It is modified in place.
        graph_data: Object exposing ``get_neighbors_size(node)``.
        direction: ``'out'`` or ``'in'``. Used as the suffix of the created
            column name and in the progress message.

    Returns:
        None. The new column is written directly into ``df``.

    Raises:
        ValueError: If ``direction`` is neither ``'out'`` nor ``'in'``.
    """
    # Reject typos early; previously any other value was silently handled as
    # 'in' and produced a stray ``node1_<typo>`` column.
    if direction not in ('out', 'in'):
        raise ValueError(f"direction must be 'out' or 'in', got {direction!r}")

    print(f'Calculating {direction} features...')

    # Passing the bound method straight to ``map`` avoids a lambda wrapper.
    df[f'node1_{direction}'] = df['node1'].map(graph_data.get_neighbors_size)

    # Disabled pairwise features. They need the column selection below so
    # that, for the 'in' direction, each pair is looked up as (node2, node1):
    # node1, node2 = ('node1', 'node2') if direction == 'out' else ('node2', 'node1')
    # df[f'node_cn_{direction}'] = df.apply(lambda row: graph_data.common_neighbors(row[node1], row[node2]), axis=1)
    # df[f'node_jc_{direction}'] = df.apply(lambda row: graph_data.jaccard_coefficient(row[node1], row[node2]), axis=1)
    # df[f'node_aa_{direction}'] = df.apply(lambda row: graph_data.adamic_adar(row[node1], row[node2]), axis=1)
    # df[f'node_pa_{direction}'] = df.apply(lambda row: graph_data.preferential_attachment(row[node1], row[node2]), axis=1)
    # df[f'node_sp_{direction}'] = df.apply(lambda row: graph_data.shortest_path(row[node1], row[node2]), axis=1)

In [4]:
# Compute the out- and in-direction features for the training frame first,
# then for the test frame.
for frame in (train, test):
    for direction, graph in (('out', graph_out), ('in', graph_in)):
        cal_fun_score(df=frame, graph_data=graph, direction=direction)

Calculating out features...
Calculating in features...
Calculating out features...
Calculating in features...




In [5]:
# Target column name, and the feature columns: every column of `train` from
# the fourth position onwards (positional slice).
y_col = 'label'
x_col = list(train.columns[3:])

In [6]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Fit three classifiers on the same feature matrix and target. `fit` returns
# the estimator itself, so construction and fitting are chained.
rfc = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=0
).fit(train[x_col], train[y_col])

xgb = XGBClassifier(
    n_estimators=100, max_depth=10, random_state=0
).fit(train[x_col], train[y_col])

lr = LogisticRegression(random_state=0).fit(train[x_col], train[y_col])

In [7]:
def submit(model, filename):
    """Write ``{filename}.csv`` with ``model``'s predictions for ``test``.

    Reads the notebook-level ``test`` frame and ``x_col`` feature list. The
    file has two columns, ``node_pair_id`` and ``ans``, and no index column.

    Args:
        model: Fitted estimator exposing ``predict``.
        filename: Output path without the ``.csv`` extension.
    """
    pair_ids = list(test.node_pair_id)
    predictions = list(model.predict(test[x_col]))
    submission = pd.DataFrame({"node_pair_id": pair_ids, "ans": predictions})
    submission.to_csv(f"{filename}.csv", index=False)

In [8]:
# Write one submission file per fitted model, named after the model.
for name, model in (('rfc', rfc), ('xgb', xgb), ('lr', lr)):
    submit(model, name)

In [9]:
# One column of test-set predictions per model, in the order rfc, xgb, lr.
s = pd.DataFrame(
    {
        name: model.predict(test[x_col])
        for name, model in (("rfc", rfc), ("xgb", xgb), ("lr", lr))
    }
)

In [10]:
# Predicted-class counts for each model, displayed as a 3-tuple.
tuple(s[name].value_counts() for name in ('rfc', 'xgb', 'lr'))

(1    3369
 0    2631
 Name: rfc, dtype: int64,
 1    3213
 0    2787
 Name: xgb, dtype: int64,
 0    3758
 1    2242
 Name: lr, dtype: int64)

In [12]:
# Same expression as the earlier value_counts cell. The counts saved below
# differ from that cell's saved output, so the two were presumably captured
# from different runs — re-run the notebook top to bottom to confirm.
s.rfc.value_counts(), s.xgb.value_counts(), s.lr.value_counts()

(0    3395
 1    2605
 Name: rfc, dtype: int64,
 0    3537
 1    2463
 Name: xgb, dtype: int64,
 0    3952
 1    2048
 Name: lr, dtype: int64)

In [11]:
s['node_pair_id'] = test.node_pair_id
# Sum the three 0/1 votes per row in one vectorised pass (instead of a Python
# lambda called once per row) and set `ans` to 1 when the sum reaches 3,
# i.e. only when all three models predict 1.
# NOTE(review): the saved disagreement tables further down show ans == 1 for
# rows with only two positive votes, which corresponds to a >= 2 majority
# rule — confirm which threshold is intended.
s['ans'] = s[['rfc', 'xgb', 'lr']].sum(axis=1).ge(3).astype('int64')

In [12]:
# s[['node_pair_id', 'ans']].to_csv('ensemble.csv', index = False)

In [13]:
# Count how many test pairs the ensemble labels 0 and 1.
s.ans.value_counts()

0    4380
1    1620
Name: ans, dtype: int64

In [16]:
# Rows where the `rfc` and `xgb` predictions disagree.
s[s['rfc'].ne(s['xgb'])]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
2,1,0,1,2,1
9,1,0,1,9,1
20,1,0,0,20,0
28,0,1,0,28,0
34,0,1,0,34,0
...,...,...,...,...,...
5956,1,0,1,5956,1
5967,1,0,1,5967,1
5969,1,0,1,5969,1
5978,0,1,0,5978,0


In [17]:
# Rows where the `rfc` and `lr` predictions disagree.
s[s['rfc'].ne(s['lr'])]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
1,1,1,0,1,1
3,0,0,1,3,0
8,1,1,0,8,1
10,1,1,0,10,1
13,0,0,1,13,0
...,...,...,...,...,...
5991,1,1,0,5991,1
5993,1,1,0,5993,1
5994,1,1,0,5994,1
5995,1,1,0,5995,1


In [18]:
# Rows where the `xgb` and `lr` predictions disagree.
s[s['xgb'].ne(s['lr'])]

Unnamed: 0,rfc,xgb,lr,node_pair_id,ans
1,1,1,0,1,1
2,1,0,1,2,1
3,0,0,1,3,0
8,1,1,0,8,1
9,1,0,1,9,1
...,...,...,...,...,...
5991,1,1,0,5991,1
5993,1,1,0,5993,1
5994,1,1,0,5994,1
5995,1,1,0,5995,1


In [19]:
# Display the feature columns used for training and prediction.
# NOTE(review): the saved output lists node_cn_*, node_jc_* and node_pa_*
# columns, but the visible cal_fun_score only creates node1_{direction} (the
# other feature lines are commented out). Presumably those columns already
# exist in the CSV or come from an earlier run — confirm on a fresh kernel.
x_col

['node1_out',
 'node1_in',
 'node_cn_out',
 'node_cn_in',
 'node_jc_out',
 'node_jc_in',
 'node_pa_out',
 'node_pa_in']

In [20]:
# Summary statistics (count, mean, std, quartiles, min/max) of `train`.
train.describe()

Unnamed: 0,node1,node2,label,node1_out,node1_in,node_cn_out,node_cn_in,node_jc_out,node_jc_in,node_pa_out,node_pa_in
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,25907.957083,29898.270208,0.5,5.541,2.310667,0.050958,0.05,0.003851,0.003652,20.416958,19.994708
std,14876.741795,13657.498777,0.50001,5.054706,3.410677,0.262896,0.242904,0.025371,0.023573,35.320628,35.830475
min,454.0,458.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8596.75,17031.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
50%,32137.0,38136.0,0.5,4.0,1.0,0.0,0.0,0.0,0.0,6.0,6.0
75%,38542.0,38772.0,1.0,9.0,4.0,0.0,0.0,0.0,0.0,20.0,18.0
max,52845.0,52841.0,1.0,21.0,20.0,10.0,6.0,1.0,1.0,378.0,323.0


In [None]:
from collections import deque
# Scratch experiment: create an empty double-ended queue. `q` is not used by
# any of the modelling cells above.
q = deque()

In [None]:
# Display the queue right after creation.
q

In [None]:
# Push a (str, int) tuple onto the right end of the queue.
q.append(("1111", 1))

In [None]:
# Display the queue after the append.
q

In [None]:
# Remove and return the item at the left end of the queue.
q.popleft()

In [None]:
# Display the queue after the popleft.
q

In [None]:
from itertools import chain

my_dict = {"k": [1, 2,3,4], "B": [1,3, 5,7]}

# Collect every distinct list value followed by every key.
# `chain.from_iterable` flattens the lists in linear time (`sum(lists, [])`
# copies the accumulator on every step), and `dict.fromkeys` removes
# duplicates while keeping first-seen order, so the result is deterministic.
# A plain set gives no stable order for mixed str/int elements.
unique_list = list(
    dict.fromkeys(chain(chain.from_iterable(my_dict.values()), my_dict))
)

print(unique_list)  # output: [1, 2, 3, 4, 5, 7, 'k', 'B']
