## Imports

In [1]:
import pandas as pd
import csv
import numpy as np
import random
#import networkx as nx

## File Paths

In [2]:
# Path of the CSV file that the cells below open and parse with csv.reader.
train_file = 'train.csv'

## Functions

In [6]:
def create_edges(adjacency_list : dict):
    """Flatten an adjacency mapping into a list of two-element edges.

    Args:
        adjacency_list: Mapping from a node to an iterable of its neighbours.

    Returns:
        A list of ``[node, neighbour]`` lists, one per (node, neighbour)
        pair, in the mapping's iteration order. Nodes with no neighbours
        contribute nothing.
    """
    return [
        [source, target]
        for source, targets in adjacency_list.items()
        for target in targets
    ]

## Adjacency List

In [3]:
# Full adjacency list: the first field of every CSV row is the node id and the
# remaining fields are its neighbours (all kept as strings, as read by csv).
adjacency_list = {}
# newline='' is how the csv module documentation says files should be opened.
with open(train_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise IndexError.
            continue
        # row[1:] is already [] for a node without neighbours, so no branch is needed.
        adjacency_list[row[0]] = row[1:]

## Adjacency List Sampled

In [4]:
# Sampled adjacency list: for every node keep a random subset of its
# neighbours whose size is sampling_ratio times the neighbour count
# (truncated towards zero, so low-degree nodes keep no neighbours).
adjacency_list_sampled = {}
sampling_ratio = 0.001
sampling_seed = 42
# Dedicated, seeded generator so re-running the notebook draws the same sample
# without touching the global `random` state.
sampling_rng = random.Random(sampling_seed)

# newline='' is how the csv module documentation says files should be opened.
with open(train_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise IndexError.
            continue
        neighbours = row[1:]
        # Size the sample from the neighbour count only: counting the node id
        # (len(row)) could request more items than exist when the ratio is
        # close to 1, which makes sample() raise ValueError.
        num_samples = int(len(neighbours) * sampling_ratio)
        # sample([], 0) returns [], so nodes without neighbours need no branch.
        adjacency_list_sampled[row[0]] = sampling_rng.sample(neighbours, num_samples)

## Sampled Edges Dataframe

In [7]:
# Flatten the sampled adjacency list into a list of [node, neighbour] pairs.
sampled_edges = create_edges(adjacency_list_sampled)

In [20]:
# One row per sampled edge: the first element of each pair becomes 'source',
# the second becomes 'sink'.
sampled_edges_df = pd.DataFrame(sampled_edges, columns = ['source', 'sink'])
sampled_edges_df.head()

Unnamed: 0,source,sink
0,314520,242345
1,314520,4046894
2,314520,1941441
3,314520,312484
4,314520,4163509


In [24]:
# Neighbour count of each edge's source node, looked up in the full
# (unsampled) adjacency list.
source_degree = sampled_edges_df['source'].apply(lambda node: len(adjacency_list[node]))
sampled_edges_df['source_degree'] = source_degree
sampled_edges_df.head()

Unnamed: 0,source,sink,source_degree
0,314520,242345,764195
1,314520,4046894,764195
2,314520,1941441,764195
3,314520,312484,764195
4,314520,4163509,764195


## Playground

In [5]:
# Spot check: number of neighbours kept for node '314520' after sampling.
len(adjacency_list_sampled['314520'])

764

In [7]:
# Column-oriented feature table: one entry per CSV row holding the node id
# (first field) and its out-degree (number of remaining fields).
features_dict = {
    'node' : [],
    'out_degree' : []
}
# newline='' is how the csv module documentation says files should be opened.
with open(train_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise IndexError.
            continue
        features_dict['node'].append(row[0])
        # Every field after the node id is one neighbour.
        features_dict['out_degree'].append(len(row) - 1)

In [4]:
# Build the feature table indexed by node id; the 'node' column is kept as
# well, so the id appears both as the index and as a column.
features_df = pd.DataFrame(features_dict, index = features_dict['node'])
features_df.head()

Unnamed: 0,node,out_degree
687794,687794,143
2712371,2712371,21
314520,314520,764195
49007,49007,297
4505542,4505542,3808


In [21]:
# Number of distinct node ids that appear as the first field of a row.
len(adjacency_list)

20000

In [14]:
# Spot check: number of neighbours of node '314520' in the full adjacency list.
len(adjacency_list['314520'])

764195

In [38]:
# Count the nodes whose out-degree is strictly greater than 1500.
(features_df['out_degree'] > 1500).sum()

2099

In [6]:
import networkx as nx

In [None]:
# Parse the training file as a comma-delimited adjacency list, reusing the
# shared train_file path instead of repeating the literal.
# NOTE(review): read_adjlist builds an undirected nx.Graph by default, while the
# rest of the notebook treats rows as source -> sink; pass
# create_using=nx.DiGraph if edge direction matters — confirm intent.
G = nx.read_adjlist(train_file, delimiter=",")

In [9]:
# Display the graph object's repr.
G

<networkx.classes.graph.Graph at 0x2049b2bf6d0>

In [10]:
# Number of nodes in the parsed graph.
G.number_of_nodes()

20000

In [11]:
# Number of edges in the parsed graph.
# NOTE(review): the stored output (0 edges for 20000 nodes) presumably comes from
# a run made before delimiter="," was passed to read_adjlist — re-run the
# graph-loading cell and this one to confirm.
G.number_of_edges()

0

In [8]:
import sys

# Build a list of 700000 consecutive integers starting at 0.
element_count = 700000
large_list = list(range(element_count))

# sys.getsizeof reports the size of the list object itself in bytes; the
# integer objects it refers to are not included.
list_size_bytes = sys.getsizeof(large_list)
print("Size of the list: {} bytes".format(list_size_bytes))

Size of the list: 5600056 bytes
