In [1]:
from chore import *
from core import Graph
import warnings

# NOTE(review): this silences every warning for the rest of the session, which
# can hide real problems (convergence, chained assignment, deprecations).
warnings.filterwarnings("ignore")

train, test = load_data()

# Three graphs built from the same positive pairs: edges as listed ("out"),
# edges reversed ("in"), and both directions together ("all").
graph_out, graph_in, graph_all = Graph(), Graph(), Graph()

# Only rows with label == 1 contribute edges. The two id columns are walked
# directly instead of using DataFrame.iterrows(), which builds a Series per
# row and upcasts all values of a row to one common dtype.
positive_links = train[train['label'] == 1]
for src, dst in zip(positive_links['node1'], positive_links['node2']):
    graph_out.add_edge(src, dst)
    graph_in.add_edge(dst, src)
    graph_all.add_edge(src, dst)
    graph_all.add_edge(dst, src)

# Same call order as the unrolled version: train first, then test; within
# each frame out -> in -> all. The return value of cal_fun_score is not used.
for frame in (train, test):
    for graph_view, direction in ((graph_out, 'out'), (graph_in, 'in'), (graph_all, 'all')):
        cal_fun_score(df=frame, graph_data=graph_view, direction=direction)

Calculating out features...
Calculating in features...
Calculating all features...
Calculating out features...
Calculating in features...
Calculating all features...


In [2]:
def _degree_rules(direction):
    """Build the eight degree-threshold rule templates for one direction.

    ``direction`` is the column suffix ("out" or "in"). For each comparison
    (greater-than, then less-than) four rules are produced, in this order:
    node1 only, node2 only, both nodes (``&``) and either node (``|``).
    Each ``{}`` stays in the returned strings as a literal placeholder.
    """
    tag = direction[0].upper() + "D"  # "out" -> "OD", "in" -> "ID"
    rules = {}
    for label, op in (("GT", ">"), ("L", "<")):
        first, second = (
            f"self.{{}}.node{idx}_{direction} {op} self.graph.get_average_degree"
            for idx in (1, 2)
        )
        rules[f"N1_{label}{tag}"] = first
        rules[f"N2_{label}{tag}"] = second
        rules[f"N1N2I_{label}{tag}"] = f"({first}) & ({second})"
        rules[f"N1N2U_{label}{tag}"] = f"({first}) | ({second})"
    return rules


# Rule name -> expression template, one mapping per edge direction.
degree_based_node_out_group = _degree_rules("out")
degree_based_node_in_group = _degree_rules("in")

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

# Feature columns are chosen by position: everything after the first three
# columns of `train`.
# NOTE(review): presumably the first three are the id/label columns - confirm.
x_col = train.columns[3:].to_list()
y_col = 'label'

# Candidate classifiers handed to the sparsification step. The tree-based and
# boosting estimators are stochastic, so they are seeded (same seed value as
# the final models fitted later in this notebook) to make a
# Restart & Run All reproducible.
models = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    KNeighborsClassifier(),
]

In [3]:
from core.sparsification import DegreeBased

# Keyword arguments that both DegreeBased runs receive unchanged.
shared_kwargs = dict(x_col=x_col, y_col=y_col, models=models)

# First run: out-degree rules on the out-graph, starting from the
# notebook-level train/test frames.
degree_based_node_out = DegreeBased(
    graph=graph_out, train=train, test=test,
    **shared_kwargs,
    **degree_based_node_out_group,
)
degree_based_node_out.fit()

# Second run: in-degree rules on the in-graph, fed with the train/test frames
# held by the first run.
degree_based_node_in = DegreeBased(
    graph=graph_in,
    train=degree_based_node_out.train,
    test=degree_based_node_out.test,
    **shared_kwargs,
    **degree_based_node_in_group,
)
degree_based_node_in.fit()

1    6786
0    6589
Name: label, dtype: int64
0    4006
1    3996
Name: label, dtype: int64
1    3928
0    2249
Name: label, dtype: int64
0    8346
1    6854
Name: label, dtype: int64
0    5411
1    5214
Name: label, dtype: int64
1    8004
0    7994
Name: label, dtype: int64
1    5146
0    3654
Name: label, dtype: int64
0    9751
1    8072
Name: label, dtype: int64
1    6786
0    6589
Name: label, dtype: int64
0    4006
1    3996
Name: label, dtype: int64
1    3928
0    2249
Name: label, dtype: int64
0    8346
1    6854
Name: label, dtype: int64
0    5411
1    5214
Name: label, dtype: int64
1    8004
0    7994
Name: label, dtype: int64
1    5146
0    3654
Name: label, dtype: int64
0    9751
1    8072
Name: label, dtype: int64
1    6786
0    6589
Name: label, dtype: int64
0    4006
1    3996
Name: label, dtype: int64
1    3928
0    2249
Name: label, dtype: int64
0    8346
1    6854
Name: label, dtype: int64
0    5411
1    5214
Name: label, dtype: int64
1    8004
0    7994
Name: label, d

In [4]:
# Take independent copies of the frames held by the second DegreeBased run.
# Later cells assign scaled values into `train` / `test` in place; without the
# copy those writes would also modify the frames owned by
# `degree_based_node_in`.
train = degree_based_node_in.train.copy()
test = degree_based_node_in.test.copy()

In [5]:
y_col = 'label'

# Every column of `train` is a feature except `node1`, `node2` and the label.
# list.remove raises ValueError if one of them is missing.
x_col = train.columns.tolist()
for excluded in ('node1', 'node2', y_col):
    x_col.remove(excluded)

In [6]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits. The feature columns are overwritten in place.
ss = RobustScaler().fit(train[x_col])

for split in (train, test):
    split[x_col] = ss.transform(split[x_col])

In [7]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Settings shared by the two tree ensembles.
ensemble_params = {"n_estimators": 100, "max_depth": 20, "random_state": 0}

rfc = RandomForestClassifier(**ensemble_params)
xgb = XGBClassifier(**ensemble_params)
lr = LogisticRegression(random_state=0)

# Fit all three on the same training features and labels, in the same order.
for estimator in (rfc, xgb, lr):
    estimator.fit(train[x_col], train[y_col])

In [8]:
def submit(model, filename, data=None, features=None):
    """Write a model's predictions to ``<filename>.csv``.

    Parameters
    ----------
    model
        Any object with a ``predict(X)`` method.
    filename : str
        Output path without extension; ``.csv`` is appended.
    data : DataFrame, optional
        Frame to predict on. It must contain a ``node_pair_id`` column and
        the feature columns. Defaults to the notebook-level ``test`` frame.
    features : list, optional
        Feature column names passed to ``model.predict``. Defaults to the
        notebook-level ``x_col``.

    The file has two columns, ``node_pair_id`` and ``ans``, and no index
    column. Nothing is returned.
    """
    # Fall back to the notebook globals so existing two-argument calls behave
    # exactly as before.
    if data is None:
        data = test
    if features is None:
        features = x_col

    # Both columns are converted to plain lists so they are paired by position
    # rather than by index label.
    s = pd.DataFrame(
        {
            "node_pair_id": list(data.node_pair_id),
            "ans": list(model.predict(data[features]))
        }
    )
    s.to_csv(f"{filename}.csv", index=False)

In [9]:
# One submission file per fitted model, named after the model variable.
for stem, fitted in (('rfc', rfc), ('xgb', xgb), ('lr', lr)):
    submit(fitted, stem)

In [10]:
# One column of test-set predictions per model, in the order rfc, xgb, lr.
test_features = test[x_col]
s = pd.DataFrame(
    {
        label: clf.predict(test_features)
        for label, clf in (("rfc", rfc), ("xgb", xgb), ("lr", lr))
    }
)

In [11]:
# Class balance of each model's predictions, shown as a 3-tuple.
tuple(s[name].value_counts() for name in ('rfc', 'xgb', 'lr'))

(0    3580
 1    2420
 Name: rfc, dtype: int64,
 0    3564
 1    2436
 Name: xgb, dtype: int64,
 1    3093
 0    2907
 Name: lr, dtype: int64)

In [14]:
# Attach the ids by position. Assigning the Series directly would align on
# index labels, and `s` (built from prediction arrays) has a fresh 0..n-1
# index that need not match the index of `test`.
s['node_pair_id'] = list(test.node_pair_id)

# Majority vote across the three models: predict 1 when at least two agree.
# Vectorised row sum instead of a Python-level apply over every row.
s['ans'] = (s[['rfc', 'xgb', 'lr']].sum(axis=1) >= 2).astype(int)

In [15]:
# Class balance of the majority-vote answer.
s['ans'].value_counts()

0    3546
1    2454
Name: ans, dtype: int64