In [1]:
import csv
import os
import random
from datetime import datetime

import networkx as nx
import numpy as np
import pandas as pd

### File Paths

In [2]:
# Input CSV. It is read further down with csv.reader, taking the first field
# of each row as a node and the remaining fields as that node's neighbours.
train_file = 'train.csv'

### Functions

In [3]:
def create_edges(adjacency_list : dict):
    """Flatten an adjacency mapping into a list of ``[node, neighbour]`` pairs.

    Parameters
    ----------
    adjacency_list : dict
        Maps each node to an iterable of its neighbours.

    Returns
    -------
    list
        One two-element list per (node, neighbour) pair, following the
        mapping's iteration order. Nodes with no neighbours contribute nothing.
    """
    return [
        [source, sink]
        for source, sinks in adjacency_list.items()
        for sink in sinks
    ]

### Adjacency List - True Edges

In [4]:
# Seed Python's `random` module so the random choices made below start from a fixed state
seed_value = 16
random.seed(seed_value)

# adjacency_list_sampled = {}
# sampling_ratio = 0.001

# with open(train_file, 'r') as csvfile:
#     reader = csv.reader(csvfile)
#     for row_string in reader:
#         row = [int(x) for x in row_string]
#         num_samples = max(int(len(row) * sampling_ratio), min(20,len(row)-1)) #sample atleast 20 edges from each node
#         if len(row) > 1:
#             adjacency_list_sampled[row[0]] = set(random.sample(row[1:], num_samples))
#         else:
#             #To handle nodes with no neighbors
#             adjacency_list_sampled[row[0]] = set([])

def random_walk(graph, start_node, walk_length):
    """Take a random walk over ``graph`` beginning at ``start_node``.

    Parameters
    ----------
    graph : dict
        Maps a node to a sequence of its neighbours. Nodes missing from the
        mapping are treated as having no neighbours.
    start_node
        First node of the walk.
    walk_length : int
        Maximum number of steps to take.

    Returns
    -------
    list
        The visited nodes in order, beginning with ``start_node``. It holds
        fewer than ``walk_length + 1`` nodes when the walk reaches a node
        with no neighbours.
    """
    visited = [start_node]

    for _step in range(walk_length):
        candidates = graph.get(visited[-1], [])
        if not candidates:
            # Dead end: nothing to step to, so the walk stops early.
            return visited
        visited.append(random.choice(candidates))

    return visited

# Adjacency-list CSV to sample from: each row is a node followed by its neighbours
file_path = train_file

# Number of random walks to run and the maximum number of steps in each
num_walks = 20000
walk_length = 10

# Build the graph as node -> list of neighbours. Values stay as the strings
# that csv.reader returns; nothing is converted to int here.
adjacency_list = {}
# newline='' is what the csv module asks for when it is handed a file object
with open(file_path, 'r', newline='') as file:
    reader = csv.reader(file)

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; row[0] would raise IndexError
            continue
        node = row[0]
        neighbors = row[1:]
        adjacency_list[node] = neighbors

# Materialise the candidate start nodes once rather than on every walk
start_nodes = list(adjacency_list.keys())
if not start_nodes:
    raise ValueError(f"No nodes were read from {file_path!r}; cannot sample random walks")

# Edges traversed by the walks, as (source, sink) tuples; repeats are kept
sampled_edges = []

for _ in range(num_walks):
    start_node = random.choice(start_nodes)
    walk = random_walk(adjacency_list, start_node, walk_length)

    # Each pair of consecutive nodes in the walk is one traversed edge
    edges = list(zip(walk, walk[1:]))
    sampled_edges.extend(edges)

In [5]:
# Total number of edges collected across all walks (repeated edges are counted each time)
len(sampled_edges)

29003

In [8]:
# Number of distinct node keys read from the CSV
len(adjacency_list)

20000

### True Edges Dataframe

In [11]:
# The true edges are the (source, sink) pairs traversed by the random walks.
# This binds a second name to the same list; it is not a copy.
# Earlier approach, kept for reference: create_edges(adjacency_list_sampled)
true_edges = sampled_edges

In [12]:
# One row per sampled edge; the two tuple positions become the 'source' and 'sink' columns
true_edges_df = pd.DataFrame(true_edges, columns = ['source', 'sink'])
true_edges_df.head(3)

Unnamed: 0,source,sink
0,1276050,4115397
1,376726,652404
2,4243429,2819456


In [13]:
# (rows, columns); the row count equals the number of sampled edges
true_edges_df.shape

(29003, 2)

In [14]:
# Number of distinct nodes that appear as the source of at least one sampled edge
true_edges_df.source.nunique()

14035

### Adding Labels column

In [15]:
# Every sampled edge was traversed along a neighbour listed in adjacency_list,
# so all of these rows are positive examples (label 1)
true_edges_df['label'] = 1
true_edges_df.head(3)

Unnamed: 0,source,sink,label
0,1276050,4115397,1
1,376726,652404,1
2,4243429,2819456,1


### Adding False Edges

In [None]:
# Negative sampling. The previous version of this cell had its logic commented
# out and referred to `adjacency_list_sampled`, which is no longer built, so
# the mapping stayed empty and no label-0 rows were ever produced. The same
# logic is applied here to `adjacency_list`, the graph the true edges came from.

# Candidate sinks: every node that appears in a sampled true edge. Sorted so
# random.sample sees the same order on every run (set iteration order is not
# stable across interpreter runs for strings).
sinks = sorted(set(true_edges_df['sink']) | set(true_edges_df['source']))

false_edges_adjacency_list = {}

# NOTE(review): this draws 5-50 candidates for every node in adjacency_list,
# which can far exceed the number of true edges - confirm the intended
# positive/negative ratio.
for node, true_neighbours in adjacency_list.items():
    known = set(true_neighbours)  # set for O(1) membership tests
    # Never ask random.sample for more items than the pool holds
    degree = min(random.randint(5, 50), len(sinks))
    candidates = random.sample(sinks, degree)
    # Drop candidates that are real neighbours of this node in the training graph
    false_edges_adjacency_list[node] = [x for x in candidates if x not in known]

### False Edges Dataframe

In [None]:
# Flatten the node -> sinks mapping into [source, sink] pairs
false_edges = create_edges(false_edges_adjacency_list)

In [None]:
# Same two-column layout as true_edges_df so the frames can be concatenated
false_edges_df = pd.DataFrame(false_edges, columns = ['source', 'sink'])
false_edges_df.head(3)

In [None]:
# Mark every row of the false-edge frame as a negative example (label 0)
false_edges_df['label'] = 0
false_edges_df.head(3)

### Sampled Edges Dataframe (True and False Edges)

In [None]:
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs keep their own 0-based labels and the index contains duplicates.
edges_df = pd.concat([true_edges_df, false_edges_df], ignore_index=True)
edges_df.head()

In [None]:
# (rows, columns) of the combined true + false edge frame
edges_df.shape

In [None]:
# Row count per label value: 1 for true edges, 0 for false edges
edges_df.label.value_counts()

### Making an Undirected Graph

In [None]:
# Keep only the rows whose source differs from their sink
edges_df = edges_df.loc[~(edges_df['source'] == edges_df['sink'])] #to remove self loops

In [None]:
# (source, sink) tuples for every remaining row; the label column is not carried over.
# NOTE(review): edges_df holds both the label-1 and label-0 rows at this point, so
# graphs built from edge_list will contain the false edges too - confirm this is intended.
edge_list = list(zip(edges_df['source'].tolist(), edges_df['sink'].tolist()))

In [None]:
# Undirected graph built from the (source, sink) pairs
G = nx.Graph(edge_list)

### Making a Directed Graph

In [None]:
# Directed graph built from the same pairs, each taken as source -> sink
DiG = nx.DiGraph(edge_list)

### Saving Edges Dataframe to a CSV file

In [None]:
# Timestamp prefix shared by the output file names
today_date = datetime.now().strftime("%Y%m%d_%H%M")
# Build the path with os.path.join rather than a hard-coded "\": the backslash
# is only a separator on Windows, and "\{" is an invalid escape sequence.
# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs("data", exist_ok=True)
edges_df.to_csv(os.path.join("data", f"{today_date}_sampled_edges.csv"), index = False)

### Saving the Graphs

In [None]:
# Build the paths with os.path.join rather than a hard-coded "\": the backslash
# is only a separator on Windows, and "\{" is an invalid escape sequence.
# Make sure the output directory exists before writing.
os.makedirs("data", exist_ok=True)
nx.write_graphml(G, os.path.join("data", f"{today_date}_sampled_graph.graphml"))
nx.write_graphml(DiG, os.path.join("data", f"{today_date}_sampled_digraph.graphml"))