## Imports

In [1]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import networkx as nx

## File Paths

In [2]:
train_file = 'train.csv'

## Functions

In [3]:
def create_edges(adjacency_list : dict):
    edges = []
    for node in adjacency_list.keys():
        for neighbour in adjacency_list[node]['neighbours']:
            edges.append([node, neighbour])

    return edges

In [4]:
def get_common_neighbours(edge_pair):
    source = edge_pair[0]
    sink = edge_pair[1]
    if source in adjacency_list_sampled.keys() and sink in adjacency_list_sampled.keys():
        return len(adjacency_list_sampled[source]['neighbours'].intersection(adjacency_list_sampled[sink]['neighbours']))
    else:
        return 0

In [None]:
def get_ra_index(edge_pair):
    pass

## Adjacency List

In [5]:
# adjacency_list = {}
# with open(train_file, 'r') as csvfile:
#     reader = csv.reader(csvfile)
#     for row in reader:
#         if len(row) > 1:
#             adjacency_list[row[0]] = row[1:]
#         else:
#             adjacency_list[row[0]] = []

## Adjacency List Sampled

In [6]:
seed_value = 16
random.seed(seed_value)

adjacency_list_sampled = {}
sampling_ratio = 0.001

with open(train_file, 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        num_samples = max(int(len(row) * sampling_ratio), min(20,len(row)-1)) #sample atleast 20 edges from each node
        if len(row) > 1:
            adjacency_list_sampled[row[0]] = {
                'neighbours': set(random.sample(row[1:], num_samples)),
                'degree': len(row[1:])
            }
        else:
            #To handle nodes with no neighbors
            adjacency_list_sampled[row[0]] = {
                'neighbours': set([]),
                'degree': 0
            }

## Sampled Edges Dataframe

### Creating from sampled adjacency list

In [7]:
sampled_edges = create_edges(adjacency_list_sampled)

In [8]:
sampled_edges_df = pd.DataFrame(sampled_edges, columns = ['source', 'sink'])
sampled_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,1198888
1,687794,366718
2,687794,2675692


In [9]:
sampled_edges_df.shape

(377684, 2)

In [10]:
sampled_edges_df.source.nunique() #430 nodes have no edges

19570

### Adding Labels column

In [11]:
sampled_edges_df['label'] = 1
sampled_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,1198888,1
1,687794,366718,1
2,687794,2675692,1


### Add False Edges (using nodes with degree = 0)

In [12]:
false_edges_adjacency_list = {}

sinks = list(set(sampled_edges_df['sink'].values).union(set(sampled_edges_df['source'].values)))

for node in adjacency_list_sampled.keys():
    if adjacency_list_sampled[node]['degree'] == 0:
        degree = random.randint(5, 50)
        sink_nodes = set(random.sample(sinks, degree))
        false_edges_adjacency_list[node] = {'neighbours' : sink_nodes}
    else:
        degree = random.randint(5, 50)
        sink_nodes = set(random.sample(sinks, degree))
        sink_nodes = [x for x in sink_nodes if x not in adjacency_list_sampled[node]['neighbours']]
        false_edges_adjacency_list[node] = {'neighbours' : sink_nodes}        

In [13]:
false_edges = create_edges(false_edges_adjacency_list)

In [14]:
false_edges_df = pd.DataFrame(false_edges, columns = ['source', 'sink'])
false_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,3073569
1,687794,4676953
2,687794,4001041


In [15]:
false_edges_df['label'] = 0
false_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,3073569,0
1,687794,4676953,0
2,687794,4001041,0


In [16]:
train_df = pd.concat([sampled_edges_df, false_edges_df])
train_df.head()

Unnamed: 0,source,sink,label
0,687794,1198888,1
1,687794,366718,1
2,687794,2675692,1
3,687794,1822375,1
4,687794,3273020,1


In [17]:
train_df.shape

(925332, 3)

In [18]:
train_df.label.value_counts()

label
0    547648
1    377684
Name: count, dtype: int64

## Making a Graph

In [19]:
edge_list = list(zip(train_df['source'].tolist(), train_df['sink'].tolist()))

In [20]:
G = nx.Graph(edge_list)

## Features

### Feature 1: Source Degree

In [21]:
train_df['source_degree'] = train_df['source'].apply(lambda x : adjacency_list_sampled[x]['degree'])
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree
0,687794,1198888,1,143
1,687794,366718,1,143
2,687794,2675692,1,143


### Feature 2: Sink Degree

In [22]:
train_df['sink_degree'] = train_df['sink'].apply(lambda x : adjacency_list_sampled[x]['degree'] if x in adjacency_list_sampled.keys() else 0)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree
0,687794,1198888,1,143,124
1,687794,366718,1,143,769
2,687794,2675692,1,143,67


### Feature 3: Common Neighbours

In [23]:
train_df['common_neighbours'] = np.apply_along_axis(get_common_neighbours, axis=1, arr=train_df[['source', 'sink']].values)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours
0,687794,1198888,1,143,124,0
1,687794,366718,1,143,769,0
2,687794,2675692,1,143,67,0


### Feature 4: Resource Allocation Index

The resource_allocation_index is a measure in graph theory that quantifies the similarity between two nodes in a network by considering the common neighbors they share. It is a local index used in the context of link prediction or node similarity in social networks, biological networks, and other complex systems

In [42]:
ra_index_generator = nx.resource_allocation_index(G, edge_list)
ra_index = [x for _,_,x in ra_index_generator]

In [43]:
train_df['ra_index'] = ra_index
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index
0,687794,1198888,1,143,124,0,0.0
1,687794,366718,1,143,769,0,0.411586
2,687794,2675692,1,143,67,0,0.052455


## Model

In [44]:
X = train_df.drop(['source','sink','label'], axis=1)
y = train_df['label']

In [45]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [46]:
model = LogisticRegression(C=1, class_weight = 'balanced')

In [47]:
model.fit(X,y)

In [48]:
model.coef_

array([[10.56058284, 33.97419888, 56.07514957,  8.98402667]])

## Load Test Data

In [49]:
test_df = pd.read_csv('test.csv')
test_df.head(3)

Unnamed: 0,Id,From,To
0,1,3360982,4457271
1,2,4761876,4698439
2,3,4198430,3615486


In [50]:
test_df['From'] = test_df['From'].apply(lambda x : str(x))
test_df['To'] = test_df['To'].apply(lambda x : str(x))

In [51]:
test_df['source_degree'] = test_df['From'].apply(lambda x : adjacency_list_sampled[x]['degree'])
test_df['sink_degree'] = test_df['To'].apply(lambda x : adjacency_list_sampled[x]['degree'] if x in adjacency_list_sampled.keys() else 0)
test_df['common_neighbours'] = np.apply_along_axis(get_common_neighbours, axis=1, arr=test_df[['From', 'To']].values)
#test_df['ra_index'] = ?
test_df.head(3)

Unnamed: 0,Id,From,To,source_degree,sink_degree,common_neighbours
0,1,3360982,4457271,1964,242,0
1,2,4761876,4698439,123,0,0
2,3,4198430,3615486,6996,0,0


In [52]:
X_test = scaler.transform(test_df.drop(['From','To','Id'], axis=1))

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- ra_index


In [33]:
output_df = test_df.copy()
output_df.drop(['From' ,'To','source_degree','sink_degree','common_neighbours'], axis = 1, inplace = True)

In [34]:
output_df['Predictions'] = np.transpose(model.predict_proba(X_test))[1]

In [35]:
pd.Series(model.predict(X_test)).value_counts()

0    1849
1     151
Name: count, dtype: int64

In [37]:
model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [36]:
output_df.head()

Unnamed: 0,Id,Predictions
0,1,0.481128
1,2,0.472092
2,3,0.496006
3,4,0.472669
4,5,0.476213


In [38]:
today_date = datetime.now().strftime("%Y%m%d_%H%M")
output_df.to_csv(f'submissions\{today_date}_submission.csv', index = False)

## Playground

In [None]:
G_play = nx.Graph()
G_play.add_edges_from([(1, 2), (1, 3), (2, 3), (2, 4)])

# Compute resource allocation index between nodes 1 and 4
ra_index = nx.resource_allocation_index(G_play, [(1, 2), (1, 3), (2, 3), (2, 4)])
# for u, v, p in ra_index:
#     print(f"Resource Allocation Index between {u} and {v}: {p}")

In [None]:
ra_index = [x for _,_,x in ra_index]

In [None]:
ra_index

In [None]:
# Define the number of folds
k = 5

# Initialize the KFold object
kf = KFold(n_splits=k, shuffle=True, stratify = y)

# Initialize lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Iterate over each fold
for train_index, val_index in kf.split(X):
    # Split the data into training and validation sets
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Define and train your machine learning model
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Make predictions on the validation set
    predictions = model.predict(X_val)
    
    # Collect the predictions and true labels for each fold
    all_predictions.extend(predictions)
    all_true_labels.extend(y_val)

# Compute the confusion matrix
cm = confusion_matrix(all_true_labels, all_predictions)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()