In [1]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
from datetime import datetime
import networkx as nx

### Loading Edges Dataframe

In [2]:
edges_df = pd.read_csv("data/20240303_2343_sampled_edges.csv")
edges_df.head(3)

Unnamed: 0,source,sink,label
0,1000879,3408999,1
1,3109148,216786,1
2,4035827,2217836,1


In [3]:
edges_df.shape

(47515, 3)

In [4]:
edges_df.label.value_counts()

label
1    25000
0    22515
Name: count, dtype: int64

### Loading Sampled Graph

In [5]:
G = nx.read_graphml("data/20240303_2343_sampled_digraph.graphml", node_type = int)
print(len(G.edges()))

25000


### Features

In [6]:
train_df = edges_df.copy()

In [7]:
edges_list = list(zip(train_df['source'].tolist(), train_df['sink'].tolist()))

#### Feature 1: Source In-Degree Density

In [8]:
train_df['source_in_degree_dens'] = train_df['source'].apply(lambda x : G.in_degree(x)/G.degree(x))
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens
0,1000879,3408999,1,0.25
1,3109148,216786,1,0.0
2,4035827,2217836,1,0.5


#### Feature 2: Sink In-Degree Density

In [9]:
train_df['sink_in_degree_dens'] = train_df['sink'].apply(lambda x : G.in_degree(x)/G.degree(x))
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens
0,1000879,3408999,1,0.25,0.5
1,3109148,216786,1,0.0,1.0
2,4035827,2217836,1,0.5,0.266667


#### Feature 3: Source Out-Degree Density

In [10]:
train_df['source_out_degree_dens'] = train_df['source'].apply(lambda x : G.out_degree(x)/G.degree(x))
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens
0,1000879,3408999,1,0.25,0.5,0.75
1,3109148,216786,1,0.0,1.0,1.0
2,4035827,2217836,1,0.5,0.266667,0.5


#### Feature 4: Sink Out-Degree Density

In [11]:
train_df['sink_out_degree_dens'] = train_df['sink'].apply(lambda x : G.out_degree(x)/G.degree(x))
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens
0,1000879,3408999,1,0.25,0.5,0.75,0.5
1,3109148,216786,1,0.0,1.0,1.0,0.0
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333


#### Feature 5: Source Bi-Degree Density

In [12]:
train_df['source_bi_degree_dens'] = train_df['source'].apply(lambda x : len(set(G.predecessors(x)).intersection(set(G.successors(x))))/G.degree(x))
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens,source_bi_degree_dens
0,1000879,3408999,1,0.25,0.5,0.75,0.5,0.0
1,3109148,216786,1,0.0,1.0,1.0,0.0,0.0
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333,0.166667


#### Feature 6: Source Bi-Degree Density

In [13]:
train_df['sink_bi_degree_dens'] = train_df['sink'].apply(lambda x : len(set(G.predecessors(x)).intersection(set(G.successors(x))))/G.degree(x))
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens,source_bi_degree_dens,sink_bi_degree_dens
0,1000879,3408999,1,0.25,0.5,0.75,0.5,0.0,0.0
1,3109148,216786,1,0.0,1.0,1.0,0.0,0.0,0.0
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333,0.166667,0.066667


#### Feature 7: Common In-neighbours

In [14]:
train_df['common_in_neighbours'] = [len(set(G.predecessors(source)).intersection(set(G.predecessors(sink)))) for source, sink in edges_list]
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens,source_bi_degree_dens,sink_bi_degree_dens,common_in_neighbours
0,1000879,3408999,1,0.25,0.5,0.75,0.5,0.0,0.0,0
1,3109148,216786,1,0.0,1.0,1.0,0.0,0.0,0.0,0
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333,0.166667,0.066667,0


#### Feature 8: Common out-neighbours

In [15]:
train_df['common_out_neighbours'] = [len(set(G.successors(source)).intersection(set(G.successors(sink)))) for source, sink in edges_list]
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens,source_bi_degree_dens,sink_bi_degree_dens,common_in_neighbours,common_out_neighbours
0,1000879,3408999,1,0.25,0.5,0.75,0.5,0.0,0.0,0,0
1,3109148,216786,1,0.0,1.0,1.0,0.0,0.0,0.0,0,0
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333,0.166667,0.066667,0,0


#### Feature 9: Total in-neighbours

In [17]:
train_df['total_in_neighbours'] = [len(set(G.predecessors(source)).union(set(G.predecessors(sink)))) for source, sink in edges_list]
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens,source_bi_degree_dens,sink_bi_degree_dens,common_in_neighbours,common_out_neighbours,total_in_neighbours
0,1000879,3408999,1,0.25,0.5,0.75,0.5,0.0,0.0,0,0,5
1,3109148,216786,1,0.0,1.0,1.0,0.0,0.0,0.0,0,0,1
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333,0.166667,0.066667,0,0,7


#### Feature 9: Total out-neighbours

In [18]:
train_df['total_out_neighbours'] = [len(set(G.successors(source)).union(set(G.successors(sink)))) for source, sink in edges_list]
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens,source_bi_degree_dens,sink_bi_degree_dens,common_in_neighbours,common_out_neighbours,total_in_neighbours,total_out_neighbours
0,1000879,3408999,1,0.25,0.5,0.75,0.5,0.0,0.0,0,0,5,13
1,3109148,216786,1,0.0,1.0,1.0,0.0,0.0,0.0,0,0,1,8
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333,0.166667,0.066667,0,0,7,14


#### Feature 10: Source Neighbourhood Subgraph Density

In [19]:
train_df['source_nh_subgraph_dens'] = train_df['source'].apply(lambda x : G.degree(x)/len(G.subgraph(x).nodes()))
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens,source_bi_degree_dens,sink_bi_degree_dens,common_in_neighbours,common_out_neighbours,total_in_neighbours,total_out_neighbours,source_nh_subgraph_dens
0,1000879,3408999,1,0.25,0.5,0.75,0.5,0.0,0.0,0,0,5,13,16.0
1,3109148,216786,1,0.0,1.0,1.0,0.0,0.0,0.0,0,0,1,8,8.0
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333,0.166667,0.066667,0,0,7,14,6.0


#### Feature 11: Sink Neighbourhood Subgraph Density

In [20]:
train_df['sink_nh_subgraph_dens'] = train_df['sink'].apply(lambda x : G.degree(x)/len(G.subgraph(x).nodes()))
train_df.head(3)

Unnamed: 0,source,sink,label,source_in_degree_dens,sink_in_degree_dens,source_out_degree_dens,sink_out_degree_dens,source_bi_degree_dens,sink_bi_degree_dens,common_in_neighbours,common_out_neighbours,total_in_neighbours,total_out_neighbours,source_nh_subgraph_dens,sink_nh_subgraph_dens
0,1000879,3408999,1,0.25,0.5,0.75,0.5,0.0,0.0,0,0,5,13,16.0,2.0
1,3109148,216786,1,0.0,1.0,1.0,0.0,0.0,0.0,0,0,1,8,8.0,1.0
2,4035827,2217836,1,0.5,0.266667,0.5,0.733333,0.166667,0.066667,0,0,7,14,6.0,15.0


#### Feature 12: Average Strongly Connected Components

In [None]:
train_df['sink_nh_subgraph_dens'] = train_df['sink'].apply(lambda x : G.degree(x)/len(G.subgraph(x).nodes()))
train_df.head(3)

### Saving Train Dataframe with Undirected Features to CSV file

In [22]:
today_date = datetime.now().strftime("%Y%m%d_%H%M")
train_df.to_csv(f"data\{today_date}_train_directed.csv", index = False)