In [53]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
import networkx as nx
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import pickle

### Loading Sampled Graph

In [54]:
# Load the sampled graph produced earlier; node_type=int asks networkx to
# convert the node identifiers to integers while reading.
graph_path = "data/20240228_0646_sampled_graph.graphml"
G = nx.read_graphml(graph_path, node_type=int)

# Sanity check: report how many edges were loaded.
print(G.number_of_edges())

342258


### Loading Trained Model

In [64]:
# Restore the previously trained classifier from disk.
# NOTE: unpickling executes arbitrary code embedded in the file, so only load
# artifacts that were produced by a trusted training run.
model_path = "data/20240228_1429_model.pkl"
with open(model_path, mode="rb") as model_file:
    model = pickle.load(model_file)

In [65]:
# Display the fitted coefficient array as a quick check that the model loaded.
model.coef_

array([[17.23542418, 11.40524542, 26.5937535 , 14.21980516]])

### Loading the Scaler

In [66]:
# Restore the feature scaler that was saved alongside the model.
# NOTE: unpickling executes arbitrary code embedded in the file, so only load
# artifacts that were produced by a trusted training run.
scaler_path = "data/20240228_1429_scaler.pkl"
with open(scaler_path, mode="rb") as scaler_file:
    scaler = pickle.load(scaler_file)

### Loading Test Data

In [67]:
# Read the node pairs to be scored; every later feature column is added to this frame.
test_df = pd.read_csv('test.csv')
# Preview the first three rows.
test_df.head(3)

Unnamed: 0,Id,From,To
0,1,3360982,4457271
1,2,4761876,4698439
2,3,4198430,3615486


### Creating Features

In [68]:
# Candidate (source, sink) pairs, kept in the same row order as test_df.
sources = test_df['From'].tolist()
sinks = test_df['To'].tolist()
test_edge_list = list(zip(sources, sinks))

# Insert every candidate pair into G as an edge, so both endpoints are present
# in the graph for the lookups below. As a consequence, the degrees computed
# next include the candidate edge itself.
# NOTE(review): presumably the training features were built the same way -
# confirm against the training notebook before changing this.
G.add_edges_from(test_edge_list)

# Endpoint degrees, measured after the candidate edges were inserted.
test_df['source_degree'] = [G.degree(node) for node in sources]
test_df['sink_degree'] = [G.degree(node) for node in sinks]

# Number of distinct nodes adjacent to both endpoints of each pair.
test_df['common_neighbours'] = [
    len(set(nx.common_neighbors(G, src, dst))) for src, dst in test_edge_list
]

# Each networkx helper below yields 3-tuples for the requested pairs; only the
# last element (the score) is kept.
ra_index = [score for _u, _v, score in nx.resource_allocation_index(G, test_edge_list)]
test_df['ra_index'] = ra_index

jaccard_coef = [score for _u, _v, score in nx.jaccard_coefficient(G, test_edge_list)]
test_df['jaccard_coef'] = jaccard_coef

aa_index = [score for _u, _v, score in nx.adamic_adar_index(G, test_edge_list)]
test_df['aa_index'] = aa_index

# Preview the engineered columns.
test_df.head(3)

Unnamed: 0,Id,From,To,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index
0,1,3360982,4457271,13,13,0,0.0,0.0,0.0
1,2,4761876,4698439,17,1,0,0.0,0.0,0.0
2,3,4198430,3615486,16,1,0,0.0,0.0,0.0


In [69]:
# Identifier and degree columns are removed before scaling; what remains are
# the common-neighbour, resource-allocation, Jaccard and Adamic-Adar columns.
non_feature_columns = ['From', 'To', 'Id', 'source_degree', 'sink_degree']
X_test = scaler.transform(test_df.drop(columns=non_feature_columns))

In [70]:
# Build the submission frame as a new object (test_df is left untouched) by
# removing the endpoint and feature columns.
columns_to_remove = [
    'From', 'To',
    'source_degree', 'common_neighbours', 'sink_degree',
    'ra_index', 'jaccard_coef', 'aa_index',
]
output_df = test_df.drop(columns=columns_to_remove)

### Prediction

In [71]:
# predict_proba returns one column per class; keep the second column
# (index 1) as the score written to the submission.
class_probabilities = model.predict_proba(X_test)
output_df['Predictions'] = class_probabilities[:, 1]

In [72]:
# Count how many test pairs fall into each predicted class label.
# NOTE(review): if every row lands in a single class, inspect the predicted
# probabilities and the feature scaling before trusting the submission.
pd.Series(model.predict(X_test)).value_counts()

1    2000
Name: count, dtype: int64

In [50]:
# Show the raw predicted class labels for the test rows.
# NOTE(review): this repeats the predict call of the value_counts cell, and the
# saved outputs of the two cells disagree while the execution counts are out of
# order - restart the kernel and run all cells to confirm which result is current.
model.predict(X_test)

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [51]:
# Preview the submission frame before it is written to disk.
output_df.head()

Unnamed: 0,Id,Predictions
0,1,0.671506
1,2,0.48783
2,3,0.609446
3,4,0.609446
4,5,0.528935


In [52]:
# Timestamp (YYYYMMDD_HHMM) embedded in the file name so that successive
# submissions do not overwrite each other.
today_date = datetime.now().strftime("%Y%m%d_%H%M")

# Use a forward slash as the separator: the previous '\{' was an invalid escape
# sequence (a SyntaxWarning on Python 3.12+), and a backslash is only a path
# separator on Windows, so elsewhere the file landed in the working directory
# with a literal backslash in its name. The 'submissions' directory must exist.
output_df.to_csv(f'submissions/{today_date}_submission.csv', index=False)