In [40]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
from datetime import datetime
import networkx as nx

### File Paths

In [41]:
# Relative path to the training CSV that the sampling cell reads row by row.
train_file = 'train.csv'

### Functions

In [42]:
def create_edges(adjacency_list : dict):
    """Flatten an adjacency mapping into a list of ``[node, neighbour]`` pairs.

    Parameters
    ----------
    adjacency_list : dict
        Maps each node to an iterable of its neighbours.

    Returns
    -------
    list
        One two-element list per (node, neighbour) pair, in the mapping's
        iteration order. Nodes with no neighbours contribute nothing.
    """
    return [
        [source, target]
        for source, targets in adjacency_list.items()
        for target in targets
    ]

### Adjacency List - True Edges

In [43]:
# Seed the RNG so the sampled sub-graph is reproducible across runs.
seed_value = 16
random.seed(seed_value)

adjacency_list_sampled = {}
sampling_ratio = 0.001
min_samples = 20  # sample at least this many edges from each node (when it has that many)

# newline='' is the documented way to open a file handed to the csv module.
with open(train_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row_string in reader:
        if not row_string:
            # csv.reader yields [] for blank lines; there is no source node to record.
            continue

        # Each row is: source node id followed by its neighbour ids.
        row = [int(x) for x in row_string]
        neighbours = row[1:]

        if neighbours:
            # Same sample size as before (ratio of the row length, floored at
            # min_samples), then clamped so random.sample can never be asked for
            # more items than the node has neighbours.
            num_samples = max(int(len(row) * sampling_ratio), min(min_samples, len(neighbours)))
            num_samples = min(num_samples, len(neighbours))
            adjacency_list_sampled[row[0]] = set(random.sample(neighbours, num_samples))
        else:
            # Keep nodes with no neighbours as keys with an empty neighbour set.
            adjacency_list_sampled[row[0]] = set()

### True Edges Dataframe

In [44]:
# Flatten the sampled adjacency list into [source, sink] pairs.
true_edges = create_edges(adjacency_list_sampled)

In [45]:
# One row per sampled true edge; preview the first three rows.
true_edges_df = pd.DataFrame(true_edges, columns = ['source', 'sink'])
true_edges_df.head(3)

Unnamed: 0,source,sink
0,687794,4763554
1,687794,1224868
2,687794,32423


In [46]:
# (number of sampled true edges, number of columns)
true_edges_df.shape

(377684, 2)

In [47]:
# Count distinct source nodes, i.e. nodes that kept at least one sampled edge.
true_edges_df.source.nunique() # author's note: 430 nodes have no edges

19570

### Adding Labels column

In [48]:
# Label 1 marks an edge taken from the training data.
true_edges_df['label'] = 1
true_edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,4763554,1
1,687794,1224868,1
2,687794,32423,1


### Making an Undirected Graph

In [49]:
# Drop self-loops: keep only rows whose two endpoints differ.
true_edges_df = true_edges_df.loc[true_edges_df['source'] != true_edges_df['sink']]

In [50]:
# Pair up the two columns row by row as (source, sink) tuples of plain Python values.
true_edge_list = [
    (source, sink)
    for source, sink in zip(true_edges_df['source'].tolist(), true_edges_df['sink'].tolist())
]

In [51]:
# Undirected graph built from the true edge list.
G = nx.Graph(incoming_graph_data=true_edge_list)

### Making a Directed Graph

In [52]:
# Directed graph built from the same true edge list (source -> sink).
DiG = nx.DiGraph(incoming_graph_data=true_edge_list)

### Adding False Edges

In [53]:
# Number of false (negative) edges to generate.
n = 350000

# Snapshot of the nodes and edges currently in the directed graph.
nodes = list(DiG.nodes())
edges = list(DiG.edges())
# Set copy of the existing edges: O(1) membership tests instead of scanning
# the whole edge list on every candidate.
true_edge_set = set(edges)

# Each unordered node pair can yield at most one false edge, and only when no
# existing edge joins the pair in either direction. Fail fast instead of
# looping forever when n cannot be reached.
num_nodes = len(nodes)
connected_pairs = {frozenset(e) for e in true_edge_set if e[0] != e[1]}
max_false_edges = num_nodes * (num_nodes - 1) // 2 - len(connected_pairs)
if n > max_false_edges:
    raise ValueError(
        f"Cannot create {n} false edges: only {max_false_edges} unconnected node pairs are available."
    )

# Rejection-sample random node pairs; a set keeps the accepted edges unique.
false_edges = set()

while len(false_edges) < n:
    edge = (random.choice(nodes), random.choice(nodes))
    reverse_edge = edge[::-1]

    # Reject self-loops.
    if edge[0] == edge[1]:
        continue
    # Reject pairs already chosen as a false edge (in either direction).
    if edge in false_edges or reverse_edge in false_edges:
        continue
    # Reject pairs that are existing edges in EITHER direction, so an edge
    # that really exists is never emitted as a false edge.
    if edge in true_edge_set or reverse_edge in true_edge_set:
        continue

    false_edges.add(edge)

    # Report progress only when a new edge was accepted, so each milestone
    # prints exactly once.
    if len(false_edges) % 10000 == 0:
        print(f"{len(false_edges)} false edges created...")

# NOTE(review): the false edges are added to both graphs, so anything later
# computed from G / DiG sees negatives as real edges -- confirm this is intended.
G.add_edges_from(false_edges)
DiG.add_edges_from(false_edges)

10000 false edges created...
20000 false edges created...
30000 false edges created...
40000 false edges created...
50000 false edges created...
60000 false edges created...
70000 false edges created...
80000 false edges created...
90000 false edges created...
100000 false edges created...
110000 false edges created...
120000 false edges created...
130000 false edges created...
140000 false edges created...
150000 false edges created...
160000 false edges created...
170000 false edges created...
180000 false edges created...
190000 false edges created...
200000 false edges created...
210000 false edges created...
220000 false edges created...
230000 false edges created...
240000 false edges created...
250000 false edges created...
260000 false edges created...
270000 false edges created...
280000 false edges created...
290000 false edges created...
300000 false edges created...
310000 false edges created...
320000 false edges created...
330000 false edges created...
340000 false edges 

### False Edges Dataframe

In [54]:
# Materialise the set of (source, sink) tuples as a list, then build one row per false edge.
false_edges_df = pd.DataFrame(list(false_edges), columns=['source', 'sink'])
false_edges_df.head(3)

Unnamed: 0,source,sink
0,384931,3496589
1,2386113,620583
2,1872437,833447


In [55]:
# Label 0 marks a randomly generated false edge.
false_edges_df['label'] = 0
false_edges_df.head(3)

Unnamed: 0,source,sink,label
0,384931,3496589,0
1,2386113,620583,0
2,1872437,833447,0


### Sampled Edges Dataframe (True and False Edges)

In [56]:
# Stack true and false edges into one labelled frame. ignore_index gives the
# result a fresh 0..N-1 index instead of repeating index labels from both
# inputs, which would make label-based lookups (.loc) ambiguous.
edges_df = pd.concat([true_edges_df, false_edges_df], ignore_index = True)
edges_df.head()

Unnamed: 0,source,sink,label
0,687794,4763554,1
1,687794,1224868,1
2,687794,32423,1
3,687794,1198888,1
4,687794,1822375,1


In [57]:
# (rows, columns) of the combined true + false edge frame
edges_df.shape

(727683, 3)

In [58]:
# Class balance: number of rows per label value
edges_df.label.value_counts()

label
1    377683
0    350000
Name: count, dtype: int64

### Saving Edges Dataframe to a CSV file

In [59]:
# Timestamp (YYYYMMDD_HHMM) used to version the output file name.
today_date = datetime.now().strftime("%Y%m%d_%H%M")
# Use a forward slash: "\{" is an invalid escape sequence in a string literal,
# and a backslash is only a path separator on Windows. "/" works on every platform.
edges_df.to_csv(f"data/{today_date}_sampled_edges.csv", index = False)

### Saving the Graphs

In [60]:
# Persist both graphs as GraphML under the data directory. Forward slashes
# replace "\{", which is an invalid escape sequence and a Windows-only separator.
nx.write_graphml(G, f"data/{today_date}_sampled_graph.graphml")
nx.write_graphml(DiG, f"data/{today_date}_sampled_digraph.graphml")