## Imports

In [1]:
import csv
import random
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

## File Paths

In [2]:
train_file = 'train.csv'

## Functions

In [3]:
def create_edges(adjacency_list : dict):
    """Flatten an adjacency mapping into a list of ``[node, neighbour]`` pairs.

    Parameters
    ----------
    adjacency_list : dict
        Maps each node to a record whose ``'neighbours'`` entry is an
        iterable of neighbouring nodes.

    Returns
    -------
    list
        One two-element list per (node, neighbour) combination, grouped by
        node in the dictionary's iteration order.
    """
    return [
        [origin, target]
        for origin, record in adjacency_list.items()
        for target in record['neighbours']
    ]

In [4]:
def get_common_neighbours(edge_pair, adjacency=None):
    """Count the neighbours shared by both endpoints of an edge.

    Parameters
    ----------
    edge_pair : sequence
        ``edge_pair[0]`` is the source node and ``edge_pair[1]`` the sink.
    adjacency : dict, optional
        Mapping from node to a record whose ``'neighbours'`` entry is a set.
        Defaults to the notebook-level ``adjacency_list_sampled`` so existing
        one-argument callers (e.g. ``np.apply_along_axis``) keep working.

    Returns
    -------
    int
        Size of the intersection of the two neighbour sets, or 0 when either
        endpoint has no entry in the mapping.
    """
    if adjacency is None:
        # Resolved at call time, so the function may be defined before the
        # sampled adjacency list has been built.
        adjacency = adjacency_list_sampled

    source_record = adjacency.get(edge_pair[0])
    sink_record = adjacency.get(edge_pair[1])
    if source_record is None or sink_record is None:
        return 0
    return len(source_record['neighbours'].intersection(sink_record['neighbours']))

## Adjacency List

In [5]:
# adjacency_list = {}
# with open(train_file, 'r') as csvfile:
#     reader = csv.reader(csvfile)
#     for row in reader:
#         if len(row) > 1:
#             adjacency_list[row[0]] = row[1:]
#         else:
#             adjacency_list[row[0]] = []

## Adjacency List Sampled

In [6]:
adjacency_list_sampled = {}
sampling_ratio = 0.01
# Floor on the number of sampled neighbours per node; it is the binding limit
# for nodes with fewer than 5000 neighbours (1% of 5000 = 50).
min_samples = 50

# A dedicated, seeded generator makes the sampled graph identical on every
# Restart & Run All without touching the global `random` state.
sampling_rng = random.Random(42)

with open(train_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; there is no node id to record.
            continue

        node, neighbours = row[0], row[1:]
        # Size the sample from the neighbour count only (row[0] is the node
        # itself): sampling_ratio of the neighbours, but at least min_samples
        # of them, or all of them when the node has fewer than that.
        num_samples = max(int(len(neighbours) * sampling_ratio), min(min_samples, len(neighbours)))
        adjacency_list_sampled[node] = {
            # Sampling 0 items returns [], so nodes without neighbours get an empty set.
            'neighbours': set(sampling_rng.sample(neighbours, num_samples)),
            # 'degree' is the full, unsampled neighbour count read from the file.
            'degree': len(neighbours)
        }

## Sampled Edges Dataframe

### Creating from sampled adjacency list

In [7]:
sampled_edges = create_edges(adjacency_list_sampled)

In [8]:
sampled_edges_df = pd.DataFrame(sampled_edges, columns = ['source', 'sink'])
sampled_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,2973900
1,687794,3091489
2,687794,2623624


In [9]:
sampled_edges_df.shape

(987445, 2)

In [10]:
sampled_edges_df.source.nunique() #430 nodes have no edges

19570

### Adding Labels column

In [11]:
sampled_edges_df['label'] = 1
sampled_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,2973900,1
1,687794,3091489,1
2,687794,2623624,1


### Add False Edges (using nodes with degree = 0)

In [12]:
false_edges_adjacency_list = {}

# Candidate endpoints: every node that appears in a sampled edge. Sorted because
# set iteration order is not stable for strings across interpreter runs, which
# would make the draws below differ from run to run.
sinks = sorted(set(sampled_edges_df['sink'].values).union(set(sampled_edges_df['source'].values)))

# Dedicated seeded generator so the negative examples are reproducible.
negative_rng = random.Random(42)

for node, record in adjacency_list_sampled.items():
    # Draw between 5 and 100 candidates, capped so sample() cannot be asked for
    # more items than exist.
    degree = min(negative_rng.randint(5, 100), len(sinks))
    candidates = negative_rng.sample(sinks, degree)

    # Drop self-pairs and pairs that are known (sampled) true edges. For nodes
    # with degree 0 the neighbour set is empty, so only the self-pair is removed.
    # NOTE(review): only *sampled* neighbours are excluded, so a pair kept here
    # can still be a real edge that was dropped during sampling.
    known_neighbours = record['neighbours']
    false_edges_adjacency_list[node] = {
        'neighbours': [x for x in candidates if x != node and x not in known_neighbours]
    }

In [13]:
false_edges = create_edges(false_edges_adjacency_list)

In [14]:
false_edges_df = pd.DataFrame(false_edges, columns = ['source', 'sink'])
false_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,1357898
1,687794,378139
2,687794,2848006


In [15]:
false_edges_df['label'] = 0
false_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,1357898,0
1,687794,378139,0
2,687794,2848006,0


In [16]:
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# row labels of both inputs are kept, so the labels of false_edges_df repeat
# labels already used by sampled_edges_df and label-based lookups become ambiguous.
train_df = pd.concat([sampled_edges_df, false_edges_df], ignore_index=True)
train_df.head()

Unnamed: 0,source,sink,label
0,687794,2973900,1
1,687794,3091489,1
2,687794,2623624,1
3,687794,1554274,1
4,687794,1224868,1


In [17]:
train_df.shape

(2036747, 3)

In [18]:
train_df.label.value_counts()

label
0    1049302
1     987445
Name: count, dtype: int64

### Feature 1: Source Degree

In [19]:
# Every source node is a key of adjacency_list_sampled, so a direct lookup is safe;
# 'degree' holds the full neighbour count, not the size of the sample.
train_df['source_degree'] = train_df['source'].map(
    lambda node: adjacency_list_sampled[node]['degree']
)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree
0,687794,2973900,1,143
1,687794,3091489,1,143
2,687794,2623624,1,143


### Feature 2: Sink Degree

In [20]:
# A sink may never appear as a key of adjacency_list_sampled; such nodes get degree 0.
train_df['sink_degree'] = train_df['sink'].map(
    lambda node: adjacency_list_sampled[node]['degree'] if node in adjacency_list_sampled else 0
)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree
0,687794,2973900,1,143,1458
1,687794,3091489,1,143,31
2,687794,2623624,1,143,767


### Feature 3: Common Neighbours

In [21]:
# One (source, sink) row per edge; get_common_neighbours is applied to each row.
edge_pairs = train_df[['source', 'sink']].values
train_df['common_neighbours'] = np.apply_along_axis(get_common_neighbours, 1, edge_pairs)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours
0,687794,2973900,1,143,1458,0
1,687794,3091489,1,143,31,2
2,687794,2623624,1,143,767,1


### Feature 4: Resource Allocation Index

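The Resource Allocation (RA) index is a local similarity measure from graph theory, used for link prediction and node similarity in social networks, biological networks, and other complex systems. For a pair of nodes $x$ and $y$ it sums the inverse degree of every neighbour they share, $RA(x, y) = \sum_{z \in \Gamma(x) \cap \Gamma(y)} \frac{1}{|\Gamma(z)|}$, so a shared neighbour with few connections counts for more than a shared hub. **TODO:** this feature is described here but is not yet computed or included in the model features below.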

## Model

In [23]:
# Everything except the edge endpoints and the target column is a model feature.
non_feature_columns = ['source', 'sink', 'label']
X = train_df.drop(columns=non_feature_columns)
y = train_df.label

In [24]:
scaler = MinMaxScaler()
# Fit on the raw feature columns taken straight from train_df rather than on X
# itself. `X = scaler.fit_transform(X)` would, on a second run of this cell,
# refit the scaler on already-scaled values and leave it unable to scale the
# test features correctly.
X = scaler.fit_transform(train_df.drop(['source', 'sink', 'label'], axis=1))

In [25]:
model = LogisticRegression()

In [26]:
model.fit(X,y)

In [27]:
model.coef_

array([[ 34.03107751,  50.48320425, 106.12307585]])

## Load Test Data

In [28]:
test_df = pd.read_csv('test.csv')
test_df.head(3)

Unnamed: 0,Id,From,To
0,1,3360982,4457271
1,2,4761876,4698439
2,3,4198430,3615486


In [29]:
# The keys of adjacency_list_sampled come from csv.reader and are therefore
# strings; cast both endpoint columns so lookups against it can match.
for endpoint_column in ('From', 'To'):
    test_df[endpoint_column] = test_df[endpoint_column].map(str)

In [30]:
def _node_degree(node):
    """Return the recorded degree of ``node``, or 0 if it has no entry in adjacency_list_sampled."""
    record = adjacency_list_sampled.get(node)
    return record['degree'] if record is not None else 0

# Same three features, in the same column order, as the training frame. Both
# endpoints use the same fallback, so an unseen 'From' node yields degree 0
# instead of raising KeyError (get_common_neighbours already returns 0 for it).
test_df['source_degree'] = test_df['From'].map(_node_degree)
test_df['sink_degree'] = test_df['To'].map(_node_degree)
test_df['common_neighbours'] = np.apply_along_axis(get_common_neighbours, axis=1, arr=test_df[['From', 'To']].values)
test_df.head(3)

Unnamed: 0,Id,From,To,source_degree,sink_degree,common_neighbours
0,1,3360982,4457271,1964,242,0
1,2,4761876,4698439,123,0,0
2,3,4198430,3615486,6996,0,0


In [31]:
# Drop the identifier columns; what remains are the feature columns the scaler was fitted on.
test_features = test_df.drop(columns=['From', 'To', 'Id'])
X_test = scaler.transform(test_features)

In [32]:
# Start the submission frame from test_df minus the endpoint and feature columns.
# drop() returns a new frame, so test_df itself is left unchanged.
output_df = test_df.drop(columns=['From', 'To', 'source_degree', 'sink_degree', 'common_neighbours'])

In [46]:
# predict_proba returns one column per class; index 1 is the second class
# (label 1, given the 0/1 labels used for training).
positive_class_proba = model.predict_proba(X_test)[:, 1]
output_df['Predictions'] = positive_class_proba

In [49]:
pd.Series(model.predict(X_test)).value_counts()

0    1782
1     218
Name: count, dtype: int64

In [50]:
output_df.head()

Unnamed: 0,Id,Predictions
0,1,0.437269
1,2,0.413329
2,3,0.488963
3,4,0.415122
4,5,0.426192


In [51]:
today_date = datetime.now().strftime("%Y%m%d_%H%M")
# Join the path with pathlib so the platform's separator is used: a hard-coded
# backslash before '{' is an invalid escape sequence in a Python string and is
# not a directory separator outside Windows.
submission_dir = Path('submissions')
# to_csv does not create missing directories.
submission_dir.mkdir(parents=True, exist_ok=True)
output_df.to_csv(submission_dir / f'{today_date}_submission.csv', index = False)

## Playground

In [71]:
# One (source, sink) tuple per row of train_df, in row order.
edge_list = [(source, sink) for source, sink in zip(train_df['source'], train_df['sink'])]

In [72]:
# Toy graph for trying out networkx drawing. It uses its own variable name so
# the edge_list built from train_df is not overwritten, and relies on the
# networkx import in the imports cell at the top of the notebook.
demo_edges = [(1, 2), (2, 3), (3, 4), (4, 1), (4, 2)]
G = nx.Graph(demo_edges)

nx.draw(G, with_labels=True, font_weight='bold')
plt.show()

In [74]:
G = nx.Graph()
G.add_edges_from([(1, 2), (1, 3), (2, 3), (2, 4)])

# Compute the resource allocation index for four node pairs, which are the
# same pairs used as the graph's edges above (the pair (1, 4) is not among them).
ra_index = nx.resource_allocation_index(G, [(1, 2), (1, 3), (2, 3), (2, 4)])
# Each item yielded is unpacked as (u, v, p): the two nodes and their score.
for u, v, p in ra_index:
    print(f"Resource Allocation Index between {u} and {v}: {p}")

Resource Allocation Index between 1 and 2: 0.5
Resource Allocation Index between 1 and 3: 0.3333333333333333
Resource Allocation Index between 2 and 3: 0.5
Resource Allocation Index between 2 and 4: 0


In [None]:
# Define the number of folds
k = 5

# StratifiedKFold keeps the 0/1 label ratio the same in every fold. Plain KFold
# has no `stratify` argument, so passing one raises a TypeError.
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# X is the ndarray returned by scaler.fit_transform (no .iloc), and y is a
# Series whose index labels need not be positions. Converting both to arrays
# makes the positional fold indices valid for either input type.
# NOTE(review): X was scaled with a scaler fitted on all rows, so validation
# folds influence the scaling of the training folds.
X_values = np.asarray(X)
y_values = np.asarray(y)

# Initialize lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Iterate over each fold; the labels are required for stratification
for train_index, val_index in kf.split(X_values, y_values):
    # Split the data into training and validation sets
    X_train, X_val = X_values[train_index], X_values[val_index]
    y_train, y_val = y_values[train_index], y_values[val_index]

    # Train a fresh model per fold under its own name, so the `model` fitted on
    # the full training set is not replaced.
    fold_model = LogisticRegression()
    fold_model.fit(X_train, y_train)

    # Make predictions on the validation set
    predictions = fold_model.predict(X_val)

    # Collect the predictions and true labels for each fold
    all_predictions.extend(predictions)
    all_true_labels.extend(y_val)

# Compute the confusion matrix over the pooled out-of-fold predictions
cm = confusion_matrix(all_true_labels, all_predictions)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()