In [1]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
import networkx as nx
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import pickle

### Loading Sampled Graph

In [15]:
# Read the sampled graph from its GraphML file; node ids are parsed with int.
G = nx.read_graphml(
    "data/20240302_1504_sampled_graph.graphml",
    node_type=int,
)
# Print the number of edges in the loaded graph as a quick sanity check.
print(G.number_of_edges())

28121


### Loading Trained Model

In [3]:
# Restore the pickled model object from disk.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("data/20240302_1509_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [4]:
# Display the loaded model's coefficient array (`coef_`).
# NOTE(review): presumably one weight per feature column built further down -- confirm.
model.coef_

array([[27.1642302 ,  8.44685511,  2.58483057,  1.28505366,  1.2912489 ,
         1.73058473,  4.07498997]])

### Loading the Scaler

In [6]:
# Restore the pickled scaler object from disk.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("data/20240302_1509_scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

### Loading Test Data

In [7]:
# Read the candidate pairs to score from CSV, then preview the first three rows.
test_df = pd.read_csv("test.csv")
test_df.head(n=3)

Unnamed: 0,Id,From,To
0,1,3360982,4457271
1,2,4761876,4698439
2,3,4198430,3615486


### Creating Features

In [18]:
def _scores_only(scored_pairs):
    """Collect the third item of each (u, v, score) triple, preserving order."""
    return [score for _, _, score in scored_pairs]


# Candidate (From, To) pairs and the set of every node they mention.
sources = test_df['From'].tolist()
targets = test_df['To'].tolist()
test_edge_list = list(zip(sources, targets))
test_nodes_list = list(set(sources) | set(targets))

# Only the nodes are added to G -- the candidate edges themselves are not --
# so every feature below is computed from the edges already in the graph.
G.add_nodes_from(test_nodes_list)

# Degree of each endpoint and the number of neighbours the two endpoints share.
test_df['source_degree'] = [G.degree(node) for node in sources]
test_df['sink_degree'] = [G.degree(node) for node in targets]
test_df['common_neighbours'] = [
    sum(1 for _ in nx.common_neighbors(G, u, v)) for u, v in test_edge_list
]

# networkx link-prediction scores, one value per candidate pair.
# The columns are added in a fixed order: ra_index, jaccard_coef, aa_index,
# pref_attach.
ra_index = _scores_only(nx.resource_allocation_index(G, test_edge_list))
test_df['ra_index'] = ra_index

jaccard_coef = _scores_only(nx.jaccard_coefficient(G, test_edge_list))
test_df['jaccard_coef'] = jaccard_coef

aa_index = _scores_only(nx.adamic_adar_index(G, test_edge_list))
test_df['aa_index'] = aa_index

pref_attach = _scores_only(nx.preferential_attachment(G, test_edge_list))
test_df['pref_attach'] = pref_attach

# A Katz-centrality feature ('katz_cent') is intentionally not computed here.

test_df.head(3)

Unnamed: 0,Id,From,To,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index,pref_attach
0,1,3360982,4457271,1,5,0,0.0,0.0,0.0,5
1,2,4761876,4698439,3,1,0,0.0,0.0,0.0,3
2,3,4198430,3615486,5,1,0,0.0,0.0,0.0,5


In [19]:
# Everything except the identifier and endpoint columns is passed to the scaler.
feature_frame = test_df.drop(columns=['From', 'To', 'Id'])
X_test = scaler.transform(feature_frame)

In [20]:
# Build the output frame as a new DataFrame: test_df minus the endpoint and
# feature columns. test_df itself is left unmodified.
columns_to_remove = [
    'From', 'To',
    'source_degree', 'common_neighbours', 'sink_degree',
    'ra_index', 'jaccard_coef', 'aa_index', 'pref_attach',
]
output_df = test_df.drop(columns=columns_to_remove)

### Prediction

In [21]:
# Store the second column (index 1) of the predict_proba output as the score
# for each row.
class_probabilities = np.asarray(model.predict_proba(X_test))
output_df['Predictions'] = class_probabilities[:, 1]

In [22]:
# Tally how many rows were assigned to each predicted label.
predicted_labels = model.predict(X_test)
pd.Series(predicted_labels).value_counts()

0    1296
1     704
Name: count, dtype: int64

In [23]:
# Display the array of predicted labels for the scaled test features.
model.predict(X_test)

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [24]:
# Preview the first rows of the frame that is written out as the submission.
output_df.head()

Unnamed: 0,Id,Predictions
0,1,0.426774
1,2,0.491334
2,3,0.660347
3,4,0.578126
4,5,0.578126


In [25]:
# Timestamp (YYYYMMDD_HHMM) embedded in the submission file name.
today_date = datetime.now().strftime("%Y%m%d_%H%M")
# Use "/" as the separator: the previous "\{" was an invalid escape sequence
# and a backslash is only a path separator on Windows, so on other platforms
# the file was written to the working directory with a literal backslash in
# its name. "/" is accepted on every platform, Windows included.
# The 'submissions' directory must already exist; to_csv does not create it.
output_df.to_csv(f'submissions/{today_date}_submission.csv', index = False)