In [1]:
def cal_neighbors_size(row, **kwargs):
    """Score a row by the neighbor-set size of its first node.

    Args:
        row: Mapping-like record indexed by ``"node1"`` (only that key is read).
        **kwargs: Must contain ``graph``, an object exposing
            ``get_neighbors_size(node)``; any other keywords are ignored.

    Returns:
        Whatever ``graph.get_neighbors_size`` returns for ``row["node1"]``.

    Raises:
        KeyError: If ``graph`` is not supplied in ``kwargs``.
    """
    size_of = kwargs['graph'].get_neighbors_size
    source_node = row["node1"]
    return size_of(source_node)

def cal_common_neighbors(row, **kwargs):
    """Score a node pair with the graph's ``common_neighbors`` measure.

    Args:
        row: Mapping-like record indexed by ``"node1"`` and ``"node2"``.
        **kwargs: Must contain ``graph``, an object exposing
            ``common_neighbors(node1, node2)``; other keywords are ignored.

    Returns:
        Whatever ``graph.common_neighbors`` returns for the pair.

    Raises:
        KeyError: If ``graph`` is not supplied in ``kwargs``.
    """
    scorer = kwargs['graph'].common_neighbors
    first = row["node1"]
    second = row["node2"]
    return scorer(first, second)

def cal_jaccard_coefficient(row, **kwargs):
    """Score a node pair with the graph's ``jaccard_coefficient`` measure.

    Args:
        row: Mapping-like record indexed by ``"node1"`` and ``"node2"``.
        **kwargs: Must contain ``graph``, an object exposing
            ``jaccard_coefficient(node1, node2)``; other keywords are ignored.

    Returns:
        Whatever ``graph.jaccard_coefficient`` returns for the pair.

    Raises:
        KeyError: If ``graph`` is not supplied in ``kwargs``.
    """
    scorer = kwargs['graph'].jaccard_coefficient
    first = row["node1"]
    second = row["node2"]
    return scorer(first, second)

def cal_preferential_attachment(row, **kwargs):
    """Score a node pair with the graph's ``preferential_attachment`` measure.

    Args:
        row: Mapping-like record indexed by ``"node1"`` and ``"node2"``.
        **kwargs: Must contain ``graph``, an object exposing
            ``preferential_attachment(node1, node2)``; other keywords are
            ignored.

    Returns:
        Whatever ``graph.preferential_attachment`` returns for the pair.

    Raises:
        KeyError: If ``graph`` is not supplied in ``kwargs``.
    """
    scorer = kwargs['graph'].preferential_attachment
    first = row["node1"]
    second = row["node2"]
    return scorer(first, second)

# Registry of scoring callables keyed by name; it is unpacked as keyword
# arguments into ScoreFuncPipeline further down.
# NOTE(review): the variable name is a misspelling of "score_func", kept
# because other cells reference it; the "dir" key is registered for the
# neighbor-size scorer — presumably short for "direct", confirm.
socre_func = dict(
    dir=cal_neighbors_size,
    common_neighbors=cal_common_neighbors,
    jaccard_coefficient=cal_jaccard_coefficient,
    preferential_attachment=cal_preferential_attachment,
)

# **Load Data**

In [2]:
import pandas as pd
import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, including any emitted by the model-fitting cells below.
warnings.filterwarnings("ignore")

# Read the labelled training pairs and the unlabelled test pairs with compact
# integer dtypes for the node/label columns.
train = pd.read_csv(
    r'Data/new_train_data.csv',
    dtype={'node1': 'int32', 'node2': 'int32', 'label': 'int32'},
)
test = pd.read_csv(
    r'Data/new_test_data.csv',
    dtype={'node1': 'int32', 'node2': 'int32'},
)

# Drop training rows where both endpoints are the same node (self-pairs).
# The original row index is kept as-is (no reset).
train = train.loc[train['node1'] != train['node2']]

# **Create Graph**

In [3]:
from core import Graph
from core import ScoreFuncPipeline
from core import DegreeBased

# Feature pipeline built from the registered scoring callables.
sp = ScoreFuncPipeline(**socre_func)

# Two graphs over the same positive pairs: `graph_out` stores node1 -> node2,
# `graph_in` stores the reversed direction node2 -> node1.
graph_out = Graph()
graph_in  = Graph()

# Only rows labelled 1 contribute edges. The two node columns are iterated
# directly instead of via iterrows(): iterrows() builds a Series per row
# (slow) and does not preserve dtypes across rows, so node ids could be
# upcast if the frame ever carried a non-integer column.
positive_pairs = train.loc[train['label'] == 1, ['node1', 'node2']]
for src, dst in zip(positive_pairs['node1'].to_numpy(), positive_pairs['node2'].to_numpy()):
    graph_out.add_edge(src, dst)
    graph_in.add_edge(dst, src)

# **Sparsification**

## **Degree Based**

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# Column bookkeeping used when splitting frames into features and target.
# Target column(s) of the training frame.
y_train = ["label"]
# Non-feature columns removed from the training frame before fitting.
x_train_drop = ["node1", "node2", *y_train]
# Non-feature columns removed from the test frame before predicting.
x_test_drop = ["node_pair_id", "node1", "node2"]

In [5]:
import pandas as pd

# One column of predictions per fitted model: `train_ensemble` holds the
# predictions on the training features, `test_ensemble` those on the test
# features. Both start empty and are filled in place by `train_model`.
train_ensemble = pd.DataFrame()
test_ensemble = pd.DataFrame()

def train_model(model, x_train, y_train, x_test, name):
    """Fit ``model`` and record its predictions under column ``name``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        x_train: Training features passed to ``fit`` and ``predict``.
        y_train: Training target passed to ``fit``.
        x_test: Test features passed to ``predict``.
        name: Column name written into both module-level ensemble frames;
            an existing column with the same name is overwritten.

    Returns:
        None. The results are stored as a side effect in ``train_ensemble``
        and ``test_ensemble``.
    """
    model.fit(x_train, y_train)
    # Train predictions are recorded first, then test predictions.
    for ensemble, features in ((train_ensemble, x_train), (test_ensemble, x_test)):
        ensemble[name] = model.predict(features)

def train_model_out_in(train, test, name, rs=0, lr_penalty='l2', lr_solver='lbfgs'):
    """Fit a logistic regression on one feature set and record its predictions.

    The feature matrices are ``train`` / ``test`` with the bookkeeping columns
    listed in the module-level ``x_train_drop`` / ``x_test_drop`` removed; the
    target is taken from the column(s) named in the module-level ``y_train``.
    Predictions are stored by ``train_model`` under the column ``lr_<name>``.

    Args:
        train: Training frame containing the feature columns plus the columns
            in ``x_train_drop``.
        test: Test frame containing the feature columns plus the columns in
            ``x_test_drop``.
        name: Suffix for the prediction column name.
        rs: ``random_state`` forwarded to ``LogisticRegression``.
        lr_penalty: ``penalty`` forwarded to ``LogisticRegression``.
        lr_solver: ``solver`` forwarded to ``LogisticRegression``.

    Returns:
        None.
    """
    # `train[y_train]` is a one-column DataFrame, i.e. shape (n, 1). The
    # estimator expects a 1-D target, so flatten it here instead of relying
    # on scikit-learn's implicit (and warning-emitting) conversion.
    target = train[y_train].to_numpy().ravel()

    train_model(
        model=LogisticRegression(random_state=rs, penalty=lr_penalty, solver=lr_solver),
        x_train=train.drop(columns=x_train_drop),
        y_train=target,
        x_test=test.drop(columns=x_test_drop),
        name=f"lr_{name}"
    )

    # Disabled alternative: the same call with an AdaBoost classifier.
    # train_model(
    #     model=AdaBoostClassifier(),
    #     x_train=train.drop(columns=x_train_drop),
    #     y_train=target,
    #     x_test=test.drop(columns=x_test_drop),
    #     name=f"abc_{name}"
    # )

In [6]:
# (solver, penalty) pairs tried for every logistic-regression run, grouped by
# solver; the flattened order is the order in which models are fitted.
# NOTE(review): the penalty string "none" is only accepted by older
# scikit-learn releases (newer ones expect None) — confirm against the
# installed version.
lr_params = [
    (solver, penalty)
    for solver, penalties in (
        ("lbfgs", ("l2", "none")),
        ("liblinear", ("l1", "l2")),
        ("newton-cg", ("l2",)),
        ("sag", ("l2",)),
        ("saga", ("l1", "l2")),  # "elasticnet" is intentionally left out
    )
    for penalty in penalties
]

In [7]:
import itertools
import operator

# degrees = [i for i in range(1, 10)]
degrees = [1]
operations = [operator.le, operator.ge, operator.eq, operator.ne]

# Full (degree, operation, (solver, penalty)) grid, kept as a module-level
# name for any other cell that inspects it.
degree_based_combinations = list(itertools.product(degrees, operations, lr_params))

for degree, operation in itertools.product(degrees, operations):
    # The sparsified graphs and the derived feature frames are built from
    # (degree, operation) only, so they are computed once here rather than
    # once per solver/penalty pair.
    # NOTE(review): assumes DegreeBased(...).fit() and sp.transform() are
    # deterministic and do not modify their inputs — TODO confirm in `core`.
    graph_out_db = DegreeBased(graph=graph_out, degree=degree, operation=operation).fit()
    train_db_out, test_db_out = sp.transform(graph=graph_out_db, df_train=train, df_test=test)

    graph_in_db = DegreeBased(graph=graph_in, degree=degree, operation=operation).fit()
    train_db_in, test_db_in = sp.transform(graph=graph_in_db, df_train=train, df_test=test)

    # One logistic regression per (solver, penalty) and per edge direction;
    # the printed names and the prediction column order are unchanged.
    for s, p in lr_params:
        run_name = f"degree_based_{degree}_{operation.__name__}_{p}_{s}"
        print(run_name)

        train_model_out_in(train_db_out, test_db_out, f"{run_name}_out", lr_penalty=p, lr_solver=s)
        train_model_out_in(train_db_in, test_db_in, f"{run_name}_in", lr_penalty=p, lr_solver=s)

degree_based_1_le_l2_lbfgs
degree_based_1_le_none_lbfgs
degree_based_1_le_l1_liblinear
degree_based_1_le_l2_liblinear
degree_based_1_le_l2_newton-cg
degree_based_1_le_l2_sag
degree_based_1_le_l1_saga
degree_based_1_le_l2_saga
degree_based_1_ge_l2_lbfgs
degree_based_1_ge_none_lbfgs
degree_based_1_ge_l1_liblinear
degree_based_1_ge_l2_liblinear
degree_based_1_ge_l2_newton-cg
degree_based_1_ge_l2_sag
degree_based_1_ge_l1_saga
degree_based_1_ge_l2_saga
degree_based_1_eq_l2_lbfgs
degree_based_1_eq_none_lbfgs
degree_based_1_eq_l1_liblinear
degree_based_1_eq_l2_liblinear
degree_based_1_eq_l2_newton-cg
degree_based_1_eq_l2_sag
degree_based_1_eq_l1_saga
degree_based_1_eq_l2_saga
degree_based_1_ne_l2_lbfgs
degree_based_1_ne_none_lbfgs
degree_based_1_ne_l1_liblinear
degree_based_1_ne_l2_liblinear
degree_based_1_ne_l2_newton-cg
degree_based_1_ne_l2_sag
degree_based_1_ne_l1_saga
degree_based_1_ne_l2_saga


## **Random Walk**

In [8]:
from core import RandomWalk

# random_states = [i for i in range(0, 2)]
# node1_dropouts = [i/100 for i in range(1, 20)]
# neighbors_dropouts = [i/50 for i in range(10, 35, 5)]
random_states = list(range(0, 1))
node1_dropouts = [0.1]
neighbors_dropouts = [0.1]

# Every (random state, node1 dropout, neighbor dropout, (solver, penalty)) run.
dropout_combinations = list(
    itertools.product(random_states, node1_dropouts, neighbors_dropouts, lr_params)
)

for rs, node1_dropout, neighbor_dropout, (s, p) in dropout_combinations:
    run_name = f"random_walk_{node1_dropout}_{neighbor_dropout}_{rs}_{p}_{s}"
    print(run_name)

    # NOTE(review): `rs` is only forwarded to the logistic regression below;
    # RandomWalk.fit receives no seed here, so whether the sparsified graph is
    # reproducible depends on RandomWalk itself — TODO confirm.
    graph_out_rw = RandomWalk.fit(
        graph=graph_out,
        node1_dropout=node1_dropout,
        neighbor_dropout=neighbor_dropout,
    )
    train_rw_out, test_rw_out = sp.transform(graph=graph_out_rw, df_train=train, df_test=test)

    graph_in_rw = RandomWalk.fit(
        graph=graph_in,
        node1_dropout=node1_dropout,
        neighbor_dropout=neighbor_dropout,
    )
    train_rw_in, test_rw_in = sp.transform(graph=graph_in_rw, df_train=train, df_test=test)

    # Fit the "out" model first, then the "in" model, with identical settings.
    train_model_out_in(
        train_rw_out, test_rw_out, f"{run_name}_out",
        rs=rs, lr_penalty=p, lr_solver=s,
    )
    train_model_out_in(
        train_rw_in, test_rw_in, f"{run_name}_in",
        rs=rs, lr_penalty=p, lr_solver=s,
    )
    

random_walk_0.1_0.1_0_l2_lbfgs
random_walk_0.1_0.1_0_none_lbfgs
random_walk_0.1_0.1_0_l1_liblinear
random_walk_0.1_0.1_0_l2_liblinear
random_walk_0.1_0.1_0_l2_newton-cg
random_walk_0.1_0.1_0_l2_sag
random_walk_0.1_0.1_0_l1_saga
random_walk_0.1_0.1_0_l2_saga


In [9]:
# Majority vote across all model columns: a row is labelled 1 only when
# strictly more than half of the models predicted 1 (ties resolve to 0).
# Computed with a vectorized row sum instead of a per-row Python lambda.
(test_ensemble.sum(axis=1) > len(test_ensemble.columns) / 2).astype("int64").value_counts()

1    3325
0    2675
dtype: int64

In [10]:
# Submission frame: one row per test pair id with the ensemble's vote.
ans = pd.DataFrame()
ans['node_pair_id'] = test['node_pair_id'].to_list()
# Majority vote across all model columns: 1 only when strictly more than half
# of the models predicted 1 (ties resolve to 0). Vectorized row sum instead of
# a per-row Python lambda.
ans['ans'] = (test_ensemble.sum(axis=1) > len(test_ensemble.columns) / 2).astype("int64")
ans['ans'].value_counts()

1    3325
0    2675
Name: ans, dtype: int64

In [11]:
# Write the submission file to the working directory, without the index column.
ans.to_csv(path_or_buf='ans.csv', index=False)