## Imports

In [1]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import networkx as nx

## File Paths

In [2]:
train_file = 'train.csv'

## Functions

In [3]:
def create_edges(adjacency_list : dict):
    edges = []
    for node in adjacency_list.keys():
        for neighbour in adjacency_list[node]:
            edges.append([node, neighbour])

    return edges

## Adjacency List

In [4]:
# adjacency_list = {}
# with open(train_file, 'r') as csvfile:
#     reader = csv.reader(csvfile)
#     for row in reader:
#         if len(row) > 1:
#             adjacency_list[row[0]] = row[1:]
#         else:
#             adjacency_list[row[0]] = []

## Adjacency List Sampled

In [5]:
seed_value = 16
random.seed(seed_value)

adjacency_list_sampled = {}
sampling_ratio = 0.001

with open(train_file, 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row_string in reader:
        row = [int(x) for x in row_string]
        num_samples = max(int(len(row) * sampling_ratio), min(20,len(row)-1)) #sample atleast 20 edges from each node
        if len(row) > 1:
            adjacency_list_sampled[row[0]] = set(random.sample(row[1:], num_samples))
        else:
            #To handle nodes with no neighbors
            adjacency_list_sampled[row[0]] = set([])

## Sampled Edges Dataframe

### Creating from sampled adjacency list

In [6]:
sampled_edges = create_edges(adjacency_list_sampled)

In [7]:
sampled_edges_df = pd.DataFrame(sampled_edges, columns = ['source', 'sink'])
sampled_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,4763554
1,687794,1224868
2,687794,32423


In [8]:
sampled_edges_df.shape

(377684, 2)

In [9]:
sampled_edges_df.source.nunique() #430 nodes have no edges

19570

### Adding Labels column

In [10]:
sampled_edges_df['label'] = 1
sampled_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,4763554,1
1,687794,1224868,1
2,687794,32423,1


### Add False Edges (using nodes with degree = 0)

In [11]:
false_edges_adjacency_list = {}

sinks = list(set(sampled_edges_df['sink'].values).union(set(sampled_edges_df['source'].values)))

for node in adjacency_list_sampled.keys():
    if len(adjacency_list_sampled[node]) == 0:
        degree = random.randint(5, 50)
        sink_nodes = set(random.sample(sinks, degree))
        false_edges_adjacency_list[node] = sink_nodes
    else:
        degree = random.randint(5, 50)
        sink_nodes = set(random.sample(sinks, degree))
        sink_nodes = [x for x in sink_nodes if x not in adjacency_list_sampled[node]]
        false_edges_adjacency_list[node] = sink_nodes        

In [12]:
false_edges = create_edges(false_edges_adjacency_list)

In [13]:
false_edges_df = pd.DataFrame(false_edges, columns = ['source', 'sink'])
false_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,1565376
1,687794,1868864
2,687794,4506663


In [14]:
false_edges_df['label'] = 0
false_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,1565376,0
1,687794,1868864,0
2,687794,4506663,0


In [15]:
train_df = pd.concat([sampled_edges_df, false_edges_df])
train_df.head()

Unnamed: 0,source,sink,label
0,687794,4763554,1
1,687794,1224868,1
2,687794,32423,1
3,687794,1198888,1
4,687794,1822375,1


In [16]:
train_df.shape

(925318, 3)

In [17]:
train_df.label.value_counts()

label
0    547634
1    377684
Name: count, dtype: int64

## Making a Graph

In [27]:
train_df = train_df.loc[~(train_df['source'] == train_df['sink'])] #to remove self loops

In [28]:
edge_list = list(zip(train_df['source'].tolist(), train_df['sink'].tolist()))

In [31]:
G = nx.Graph(edge_list)

## Features

### Feature 1: Source Degree

In [33]:
train_df['source_degree'] = train_df['source'].apply(lambda x : G.degree(x))
train_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['source_degree'] = train_df['source'].apply(lambda x : G.degree(x))


Unnamed: 0,source,sink,label,source_degree,sink_degree
0,687794,4763554,1,69,89
1,687794,1224868,1,69,86
2,687794,32423,1,69,54


### Feature 2: Sink Degree

In [34]:
train_df['sink_degree'] = train_df['sink'].apply(lambda x : G.degree(x))
train_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['sink_degree'] = train_df['sink'].apply(lambda x : G.degree(x))


Unnamed: 0,source,sink,label,source_degree,sink_degree
0,687794,4763554,1,69,89
1,687794,1224868,1,69,86
2,687794,32423,1,69,54


### Feature 3: Common Neighbours

In [35]:
train_df['common_neighbours'] = [len(set(nx.common_neighbors(G, u, v))) for u, v in edge_list]
train_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['common_neighbours'] = [len(set(nx.common_neighbors(G, u, v))) for u, v in edge_list]


Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours
0,687794,4763554,1,69,89,0
1,687794,1224868,1,69,86,1
2,687794,32423,1,69,54,1


### Feature 4: Resource Allocation Index

The resource_allocation_index is a measure in graph theory that quantifies the similarity between two nodes in a network by considering the common neighbors they share. It is a local index used in the context of link prediction or node similarity in social networks, biological networks, and other complex systems

In [36]:
ra_index_generator = nx.resource_allocation_index(G, edge_list)
ra_index = [x for _,_,x in ra_index_generator]

In [37]:
train_df['ra_index'] = ra_index
train_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['ra_index'] = ra_index


Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index
0,687794,4763554,1,69,89,0,0.0
1,687794,1224868,1,69,86,1,0.008
2,687794,32423,1,69,54,1,0.014085


### Feature 5: Jaccard Coefficient

In [38]:
jaccard_coef_generator = nx.jaccard_coefficient(G, edge_list)
jaccard_coef = [x for _,_,x in jaccard_coef_generator]

In [39]:
train_df['jaccard_coef'] = jaccard_coef
train_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['jaccard_coef'] = jaccard_coef


Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef
0,687794,4763554,1,69,89,0,0.0,0.0
1,687794,1224868,1,69,86,1,0.008,0.006494
2,687794,32423,1,69,54,1,0.014085,0.008197


### Feature 6: Adamic Agar Index

In [40]:
aa_index_generator = nx.adamic_adar_index(G, edge_list)
aa_index = [x for _,_,x in aa_index_generator]

In [41]:
train_df['aa_index'] = aa_index
train_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['aa_index'] = aa_index


Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index
0,687794,4763554,1,69,89,0,0.0,0.0,0.0
1,687794,1224868,1,69,86,1,0.008,0.006494,0.207112
2,687794,32423,1,69,54,1,0.014085,0.008197,0.234594


## Sample Weights

In [42]:
#sample_weights = train_df['label'].apply(lambda x : 1 if x == 1 else 0.7)

## Model

In [48]:
X = train_df.drop(['source','sink','label'], axis=1)
y = train_df['label']

In [49]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [50]:
model = LogisticRegression(C=1, class_weight = 'balanced')

In [51]:
model.fit(X,y)

In [52]:
model.coef_

array([[ 0.86062812, 22.83906591, 13.0674437 ,  7.94719745, 33.70844069,
        13.00832443]])

## Load Test Data

In [70]:
test_df = pd.read_csv('test.csv')
test_df.head(3)

Unnamed: 0,Id,From,To
0,1,3360982,4457271
1,2,4761876,4698439
2,3,4198430,3615486


In [71]:
test_edge_list = list(zip(test_df['From'].tolist(), test_df['To'].tolist()))
TG = G.copy()
TG.add_edges_from(test_edge_list)

test_df['source_degree'] = test_df['From'].apply(lambda x : TG.degree(x))
test_df['sink_degree'] = test_df['To'].apply(lambda x : TG.degree(x))
test_df['common_neighbours'] = [len(set(nx.common_neighbors(TG, u, v))) for u, v in test_edge_list]

ra_index_generator = nx.resource_allocation_index(TG, test_edge_list)
ra_index = [x for _,_,x in ra_index_generator]
test_df['ra_index'] = ra_index

jaccard_coef_generator = nx.jaccard_coefficient(TG, test_edge_list)
jaccard_coef = [x for _,_,x in jaccard_coef_generator]
test_df['jaccard_coef'] = jaccard_coef

aa_index_generator = nx.adamic_adar_index(TG, test_edge_list)
aa_index = [x for _,_,x in aa_index_generator]
test_df['aa_index'] = aa_index

test_df.head(3)

Unnamed: 0,Id,From,To,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index
0,1,3360982,4457271,35,61,0,0.0,0.0,0.0
1,2,4761876,4698439,41,1,0,0.0,0.0,0.0
2,3,4198430,3615486,33,1,0,0.0,0.0,0.0


In [72]:
X_test = scaler.transform(test_df.drop(['From','To','Id'], axis=1))

In [74]:
output_df = test_df.copy()
output_df.drop(['From' ,'To','source_degree','common_neighbours','sink_degree', 'ra_index', 'jaccard_coef', 'aa_index'], axis = 1, inplace = True)

In [75]:
output_df['Predictions'] = np.transpose(model.predict_proba(X_test))[1]

In [76]:
pd.Series(model.predict(X_test)).value_counts()

0    1612
1     388
Name: count, dtype: int64

In [77]:
model.predict(X_test)

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [78]:
output_df.head()

Unnamed: 0,Id,Predictions
0,1,0.656078
1,2,0.40636
2,3,0.40511
3,4,0.423408
4,5,0.438478


In [79]:
today_date = datetime.now().strftime("%Y%m%d_%H%M")
output_df.to_csv(f'submissions\{today_date}_submission.csv', index = False)

## Playground

In [None]:
get_ra_index(train_df, ['687794', '1212121'])

In [None]:
# Define the number of folds
k = 5

# Initialize the KFold object
kf = KFold(n_splits=k, shuffle=True, stratify = y)

# Initialize lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Iterate over each fold
for train_index, val_index in kf.split(X):
    # Split the data into training and validation sets
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Define and train your machine learning model
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Make predictions on the validation set
    predictions = model.predict(X_val)
    
    # Collect the predictions and true labels for each fold
    all_predictions.extend(predictions)
    all_true_labels.extend(y_val)

# Compute the confusion matrix
cm = confusion_matrix(all_true_labels, all_predictions)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()