In [1]:
import csv
import random
from datetime import datetime
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd

### File Paths

In [2]:
# Input CSV read by the sampling step: each row is parsed as integers, with the
# first value taken as a node and the remaining values as its neighbours.
train_file = 'train.csv'

### Functions

In [3]:
def create_edges(adjacency_list : dict):
    """Flatten an adjacency mapping into a list of ``[node, neighbour]`` pairs.

    Parameters
    ----------
    adjacency_list : dict
        Maps each node to an iterable of its neighbours.

    Returns
    -------
    list
        One two-element list per (node, neighbour) pair, produced in the
        mapping's iteration order. Nodes whose neighbour collection is empty
        contribute no pairs.
    """
    return [
        [source, sink]
        for source, sink_collection in adjacency_list.items()
        for sink in sink_collection
    ]

### Adjacency List - True Edges

In [4]:
seed_value = 16
random.seed(seed_value)  # fixed seed so the sampled edges are reproducible

adjacency_list_sampled = {}
sampling_ratio = 0.001
min_samples = 20  # sample at least this many edges per node (or all of them if it has fewer)

# newline='' is the documented way to open a file that is handed to csv.reader.
with open(train_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row_string in reader:
        if not row_string:
            # csv.reader yields [] for a blank line; there is no node id to record.
            continue
        row = [int(x) for x in row_string]
        node, neighbours = row[0], row[1:]
        # Take sampling_ratio of the row length, but at least min_samples when
        # the node has that many neighbours ...
        num_samples = max(int(len(row) * sampling_ratio), min(min_samples, len(neighbours)))
        # ... and never more than exist, so random.sample cannot raise
        # ValueError if sampling_ratio is ever raised towards 1.
        num_samples = min(num_samples, len(neighbours))
        if neighbours:
            adjacency_list_sampled[node] = set(random.sample(neighbours, num_samples))
        else:
            # Node listed without any neighbours: keep it with an empty set.
            adjacency_list_sampled[node] = set()

### True Edges Dataframe

In [5]:
# Flatten the sampled adjacency list into [node, neighbour] pairs.
true_edges = create_edges(adjacency_list_sampled)

In [6]:
# One row per sampled true edge; the first element of each pair becomes
# 'source' and the second 'sink'.
true_edges_df = pd.DataFrame(true_edges, columns = ['source', 'sink'])
true_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,4763554
1,687794,1224868
2,687794,32423


In [7]:
# (rows, columns) of the sampled true-edge frame.
true_edges_df.shape

(377684, 2)

In [8]:
# Number of distinct source nodes that kept at least one sampled edge; nodes
# with an empty neighbour set produce no rows and are therefore not counted.
true_edges_df.source.nunique() # 430 nodes have no edges

19570

### Adding Labels column

In [9]:
# Label 1 marks a row as a true (sampled) edge. This adds the column to
# true_edges_df in place.
true_edges_df['label'] = 1
true_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,4763554,1
1,687794,1224868,1
2,687794,32423,1


### Adding False Edges

In [11]:
false_edges_adjacency_list = {}

# Candidate endpoints: every node that appears in at least one sampled true edge.
sinks = list(set(true_edges_df['sink'].values).union(set(true_edges_df['source'].values)))

for node in adjacency_list_sampled.keys():
    # Draw between 5 and 50 candidates, capped at the number of available
    # candidates so random.sample cannot raise ValueError on a small graph.
    degree = min(random.randint(5, 50), len(sinks))
    sink_nodes = set(random.sample(sinks, degree))
    # Drop candidates that are sampled true neighbours of this node. For a node
    # with no sampled neighbours the filter removes nothing, so one code path
    # covers both cases and every value is stored as a list.
    # NOTE(review): candidates are only checked against the *sampled*
    # neighbours, so an edge that exists in train_file but was not sampled can
    # still end up here as a false edge - confirm this is acceptable.
    # A node can also draw itself; such self loops are filtered out of the
    # combined frame later in the notebook.
    sink_nodes = [x for x in sink_nodes if x not in adjacency_list_sampled[node]]
    false_edges_adjacency_list[node] = sink_nodes

### False Edges Dataframe

In [12]:
# Flatten the false-edge adjacency list into [node, sink] pairs.
false_edges = create_edges(false_edges_adjacency_list)

In [13]:
# One row per generated false edge, with the same column layout as the
# true-edge frame.
false_edges_df = pd.DataFrame(false_edges, columns = ['source', 'sink'])
false_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,1565376
1,687794,1868864
2,687794,4506663


In [14]:
# Label 0 marks a row as a generated false edge. This adds the column to
# false_edges_df in place.
false_edges_df['label'] = 0
false_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,1565376,0
1,687794,1868864,0
2,687794,4506663,0


### Sampled Edges Dataframe (True and False Edges)

In [15]:
# Stack the true (label 1) and false (label 0) edges into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own 0-based labels and the combined index contains
# duplicates, which makes label-based selection ambiguous.
edges_df = pd.concat([true_edges_df, false_edges_df], ignore_index=True)
edges_df.head()

Unnamed: 0,source,sink,label
0,687794,4763554,1
1,687794,1224868,1
2,687794,32423,1
3,687794,1198888,1
4,687794,1822375,1


In [16]:
# (rows, columns) after stacking the true and false edges.
edges_df.shape

(925318, 3)

In [17]:
# Rows per label (1 = true edge, 0 = false edge).
edges_df.label.value_counts()

label
0    547634
1    377684
Name: count, dtype: int64

### Making an Undirected Graph

In [18]:
# Remove self loops: keep only the rows whose two endpoints differ.
has_distinct_endpoints = edges_df['source'] != edges_df['sink']
edges_df = edges_df.loc[has_distinct_endpoints]

In [19]:
# (source, sink) tuples for every remaining row; .tolist() converts the column
# values to plain Python objects before they are zipped.
# NOTE(review): edges_df holds both label-1 and label-0 rows at this point, so
# the graphs built from edge_list also contain the generated false edges -
# confirm this is intended.
edge_list = list(zip(edges_df['source'].tolist(), edges_df['sink'].tolist()))

In [20]:
# Undirected graph built from the (source, sink) pairs in edge_list.
G = nx.Graph(edge_list)

### Making a Directed Graph

In [21]:
# Directed graph built from the same pairs; each tuple is read as source -> sink.
DiG = nx.DiGraph(edge_list)

### Saving Edges Dataframe to a CSV file

In [22]:
# Minute-resolution timestamp used to name the files written by this run.
today_date = datetime.now().strftime("%Y%m%d_%H%M")

# Build the destination with pathlib instead of a hard-coded "data\..." string:
# the backslash is a path separator only on Windows (elsewhere it becomes part
# of the file name), and "\{" is an invalid escape sequence in a string literal.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)  # make sure the folder exists before writing
edges_df.to_csv(output_dir / f"{today_date}_sampled_edges.csv", index = False)

### Saving the Graphs

In [23]:
# Write both graphs as GraphML files named with the run's timestamp.
# The path is built with pathlib instead of a hard-coded "data\..." string:
# the backslash is a path separator only on Windows (elsewhere it becomes part
# of the file name), and "\{" is an invalid escape sequence in a string literal.
graph_dir = Path("data")
graph_dir.mkdir(parents=True, exist_ok=True)  # make sure the folder exists before writing
nx.write_graphml(G, str(graph_dir / f"{today_date}_sampled_graph.graphml"))
nx.write_graphml(DiG, str(graph_dir / f"{today_date}_sampled_digraph.graphml"))