## Imports

In [1]:
import pandas as pd
import csv
import numpy as np
import random

#import networkx as nx

## File Paths

In [2]:
# Path (relative to the working directory) of the training CSV; the sampling
# cell reads each row as a node id followed by that node's neighbour ids.
train_file = 'train.csv'

## Functions

In [3]:
def create_edges(adjacency_list : dict):
    """Flatten an adjacency mapping into a list of ``[source, sink]`` pairs.

    Args:
        adjacency_list: Mapping of node -> record, where every record stores an
            iterable of neighbouring nodes under the ``'neighbours'`` key.

    Returns:
        A list with one ``[node, neighbour]`` entry per neighbour of each node,
        visited in the mapping's iteration order. Nodes whose neighbour
        collection is empty contribute no entries.
    """
    return [
        [source, sink]
        for source, record in adjacency_list.items()
        for sink in record['neighbours']
    ]

## Adjacency List

In [4]:
# Disabled: unsampled variant that keeps every neighbour of every node.
# adjacency_list = {}
# with open(train_file, 'r') as csvfile:
#     reader = csv.reader(csvfile)
#     for row in reader:
#         if len(row) > 1:
#             adjacency_list[row[0]] = row[1:]
#         else:
#             adjacency_list[row[0]] = []

## Adjacency List Sampled

In [42]:
# Build a down-sampled adjacency list from the training CSV.
#
# Every row is "<node id>,<neighbour id>,<neighbour id>,...". For each node we
# keep a random subset of its neighbours plus its full (pre-sampling) degree.
adjacency_list_sampled = {}
sampling_ratio = 0.01  # fraction of a node's neighbours to keep
min_samples = 5        # floor on the sample size (capped at the node's degree)
RANDOM_SEED = 42

# Seed the global RNG so the sampled graph is the same on every
# Restart & Run All.
random.seed(RANDOM_SEED)

# newline='' is what the csv module asks for when it is handed a file object.
with open(train_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; there is no node id to store.
            continue

        source, neighbours = row[0], row[1:]
        degree = len(neighbours)

        # Keep sampling_ratio of the neighbours, but at least
        # min(min_samples, degree) of them, and never more than the node has.
        # The ratio is applied to the degree, not to len(row), which would
        # count the source id itself as a neighbour.
        num_samples = min(
            degree,
            max(int(degree * sampling_ratio), min(min_samples, degree)),
        )

        # Nodes with no neighbours fall out naturally: sampling 0 items from an
        # empty list gives an empty set and a degree of 0.
        adjacency_list_sampled[source] = {
            'neighbours': set(random.sample(neighbours, num_samples)),
            'degree': degree
        }

## Sampled Edges Dataframe

### Creating from sampled adjacency list

In [72]:
# One [source, sink] pair for every neighbour kept in the sampled adjacency list.
sampled_edges = create_edges(adjacency_list_sampled)

In [73]:
# Tabulate the sampled pairs: first element of each pair -> 'source', second -> 'sink'.
sampled_edges_df = pd.DataFrame(sampled_edges, columns = ['source', 'sink'])
sampled_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,234477
1,687794,2482027
2,687794,1213190


In [74]:
# (number of sampled edges, number of columns)
sampled_edges_df.shape

(289518, 2)

In [75]:
# Distinct nodes that appear as the source of at least one sampled edge.
sampled_edges_df.source.nunique() # zero-degree nodes (430, per the count in the Playground section) have no edges, so they never appear as a source

19570

### Adding Labels column

In [76]:
# Every sampled pair was read from the training adjacency list, so it is a positive example.
sampled_edges_df['label'] = 1
sampled_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,234477,1
1,687794,2482027,1
2,687794,1213190,1


### Add False Edges (using nodes with degree = 0)

In [83]:
# Build negative examples: pair every zero-degree node with randomly drawn sinks.
# Such a node has no outgoing edges in the training data, so any pair that
# starts from it is a non-edge.
false_edges_adjacency_list = {}
false_edges_per_node = 50  # negative sinks drawn for each zero-degree source

# Candidate sinks: every node seen at either end of a sampled edge.
# Sorted so the draw depends only on the RNG state; iterating a set of strings
# gives a different order in each interpreter session (hash randomisation),
# which would make the sample unrepeatable even with a fixed seed.
sinks = sorted(set(sampled_edges_df['sink'].values).union(sampled_edges_df['source'].values))

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request on small graphs.
num_false_sinks = min(false_edges_per_node, len(sinks))

for node in adjacency_list_sampled.keys():
    if adjacency_list_sampled[node]['degree'] == 0:
        sink_nodes = set(random.sample(sinks, num_false_sinks))
        # Same record shape that create_edges() expects.
        false_edges_adjacency_list[node] = {'neighbours' : sink_nodes}

In [84]:
# Expand each zero-degree source and its randomly drawn sinks into [source, sink] pairs.
false_edges = create_edges(false_edges_adjacency_list)

In [85]:
# Tabulate the negative pairs using the same column names as sampled_edges_df.
false_edges_df = pd.DataFrame(false_edges, columns = ['source', 'sink'])
false_edges_df.head(3)

Unnamed: 0,source,sink
0,819948,1547384
1,819948,749557
2,819948,3441940


In [86]:
# Randomly generated pairs are the negative class (label 0); sampled true edges carry label 1.
false_edges_df['label'] = 0
false_edges_df.head(3)

Unnamed: 0,source,sink,label
0,819948,1547384,0
1,819948,749557,0
2,819948,3441940,0


In [87]:
# Stack positive and negative examples into one training table.
# Both inputs carry their own 0..n-1 index, so a plain concat would leave
# duplicate index labels; ignore_index=True renumbers the rows 0..n-1 so that
# label-based selection and assignment address exactly one row.
train_df = pd.concat([sampled_edges_df, false_edges_df], ignore_index=True)
train_df.head()

Unnamed: 0,source,sink,label
0,687794,234477,1
1,687794,2482027,1
2,687794,1213190,1
3,687794,402872,1
4,687794,78828,1


In [88]:
# (rows, columns) after stacking the positive and negative edges.
train_df.shape

(311018, 3)

### Feature 1: Source Degree

In [89]:
def _lookup_source_degree(node):
    """Return the full (pre-sampling) neighbour count stored for ``node``."""
    return adjacency_list_sampled[node]['degree']

# NOTE(review): negatives are generated only from zero-degree sources, while
# positives can only come from nodes with at least one neighbour, so this
# feature separates the two labels exactly. Confirm that this is intended.
train_df['source_degree'] = train_df['source'].map(_lookup_source_degree)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree
0,687794,234477,1,143
1,687794,2482027,1,143
2,687794,1213190,1,143


### Feature 2: Sink Degree

## Playground

In [50]:
# How many sampled edges point at a sink that also has its own row (key) in the adjacency list.
sampled_edges_df['sink'].isin(list(adjacency_list_sampled)).sum()

36025

In [19]:
# Number of nodes whose recorded degree is below 1, i.e. nodes with no neighbours.
zero_degree_count = sum(
    1 for record in adjacency_list_sampled.values() if record['degree'] < 1
)

print(zero_degree_count)

430


In [49]:
# Preview the first few entries only: displaying the whole mapping prints
# every node's record and floods the notebook output.
dict(list(adjacency_list_sampled.items())[:3])

{'687794': {'neighbours': ['1854450',
   '2288064',
   '4589838',
   '893965',
   '406619'],
  'degree': 143},
 '2712371': {'neighbours': ['4274637',
   '2665148',
   '4571819',
   '2923913',
   '239119'],
  'degree': 21},
 '314520': {'neighbours': ['1731278',
   '2994467',
   '3520789',
   '2342667',
   '2814798',
   '537013',
   '893379',
   '4275830',
   '4788034',
   '4862803',
   '821917',
   '550826',
   '1145689',
   '3718433',
   '4486433',
   '3893225',
   '2220252',
   '3550559',
   '2574413',
   '4707629',
   '442807',
   '4268621',
   '2808240',
   '438759',
   '3201540',
   '1513488',
   '745969',
   '4798615',
   '4726575',
   '2599898',
   '481265',
   '1942397',
   '2745580',
   '2532272',
   '500488',
   '4180412',
   '1604277',
   '4798104',
   '2095259',
   '2952605',
   '3336145',
   '3379721',
   '3720286',
   '1377318',
   '3047498',
   '3725286',
   '3805264',
   '2875991',
   '1338968',
   '4849965',
   '921200',
   '3685352',
   '4321083',
   '1332000',
   '515

In [21]:
# Load the candidate pairs to score (path is relative to the working directory).
# The recorded preview below shows the columns Id, From and To.
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,Id,From,To
0,1,3360982,4457271
1,2,4761876,4698439
2,3,4198430,3615486
3,4,2945770,747948
4,5,3950088,3360335


In [47]:
# (rows, columns) of the test set.
test_df.shape

(2000, 3)

In [46]:
# Count test sources that have a row in the training adjacency list.
# Ids are cast to str to match the string keys of adjacency_list_sampled.
test_df['From'].astype(str).isin(list(adjacency_list_sampled)).sum()

2000

In [48]:
# Count test sinks that have a row in the training adjacency list.
# Ids are cast to str to match the string keys of adjacency_list_sampled.
test_df['To'].astype(str).isin(list(adjacency_list_sampled)).sum()

341