In [1]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
from datetime import datetime
import networkx as nx

### Loading Edges Dataframe

In [2]:
# Read the sampled edge table (columns: source, sink, label) and preview it.
edges_csv = "data/20240228_2130_sampled_edges.csv"
edges_df = pd.read_csv(edges_csv)
edges_df.head(3)

Unnamed: 0,source,sink,label
0,687794,4763554,1
1,687794,1224868,1
2,687794,32423,1


In [3]:
# (rows, columns) of the loaded edge table.
edges_df.shape

(925315, 3)

In [4]:
# Class balance: number of rows per value of the label column.
edges_df["label"].value_counts()

label
0    547632
1    377683
Name: count, dtype: int64

### Loading Sampled Graph

In [5]:
# Load the sampled graph; node_type=int makes networkx convert the GraphML
# node ids to integers (presumably to match the integer ids in edges_df).
graph_path = "data/20240228_2130_sampled_graph.graphml"
G = nx.read_graphml(graph_path, node_type=int)
print(G.number_of_edges())

921914


### Features

In [6]:
# Work on a copy so the feature columns added below do not modify edges_df.
train_df = edges_df.copy()

In [7]:
# (source, sink) tuples in row order; passed as the node-pair list to the
# pairwise networkx indices computed further down.
edges_list = list(train_df[["source", "sink"]].itertuples(index=False, name=None))

#### Feature 1: Source Degree

In [8]:
# Degree in G of each row's source node, in row order.
train_df["source_degree"] = [G.degree(node) for node in train_df["source"]]
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree
0,687794,4763554,1,69
1,687794,1224868,1,69
2,687794,32423,1,69


#### Feature 2: Sink Degree

In [9]:
# Degree in G of each row's sink node, in row order.
train_df["sink_degree"] = [G.degree(node) for node in train_df["sink"]]
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree
0,687794,4763554,1,69,89
1,687794,1224868,1,69,86
2,687794,32423,1,69,54


#### Feature 3: Common Neighbours

In [10]:
# Number of nodes adjacent to both endpoints of each (source, sink) pair.
# nx.common_neighbors yields each shared neighbour once, so counting the
# items gives the same value as the size of the set.
train_df["common_neighbours"] = [
    sum(1 for _ in nx.common_neighbors(G, src, dst))
    for src, dst in edges_list
]
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours
0,687794,4763554,1,69,89,0
1,687794,1224868,1,69,86,1
2,687794,32423,1,69,54,1


#### Feature 4: Resource Allocation Index

In [11]:
# networkx yields (u, v, score) triples lazily, one per pair in edges_list;
# keep only the score, preserving the order of edges_list.
ra_index_generator = nx.resource_allocation_index(G, ebunch=edges_list)
ra_index = [score for *_, score in ra_index_generator]

In [12]:
# Attach the resource allocation scores as a column, aligned row-for-row
# with train_df (ra_index is in edges_list order, i.e. row order).
train_df["ra_index"] = pd.Series(ra_index, index=train_df.index)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index
0,687794,4763554,1,69,89,0,0.0
1,687794,1224868,1,69,86,1,0.008
2,687794,32423,1,69,54,1,0.014085


#### Feature 5: Jaccard Coefficient

In [13]:
# networkx yields (u, v, coefficient) triples lazily, one per pair in
# edges_list; keep only the coefficient, preserving the order of edges_list.
jaccard_coef_generator = nx.jaccard_coefficient(G, ebunch=edges_list)
jaccard_coef = [coef for *_, coef in jaccard_coef_generator]

In [14]:
# Attach the Jaccard coefficients as a column, aligned row-for-row with
# train_df (jaccard_coef is in edges_list order, i.e. row order).
train_df["jaccard_coef"] = pd.Series(jaccard_coef, index=train_df.index)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef
0,687794,4763554,1,69,89,0,0.0,0.0
1,687794,1224868,1,69,86,1,0.008,0.006494
2,687794,32423,1,69,54,1,0.014085,0.008197


#### Feature 6: Adamic-Adar Index

In [15]:
# networkx yields (u, v, score) triples lazily, one per pair in edges_list;
# keep only the Adamic-Adar score, preserving the order of edges_list.
aa_index_generator = nx.adamic_adar_index(G, ebunch=edges_list)
aa_index = [score for *_, score in aa_index_generator]

In [16]:
# Attach the Adamic-Adar scores as a column, aligned row-for-row with
# train_df (aa_index is in edges_list order, i.e. row order).
train_df["aa_index"] = pd.Series(aa_index, index=train_df.index)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index
0,687794,4763554,1,69,89,0,0.0,0.0,0.0
1,687794,1224868,1,69,86,1,0.008,0.006494,0.207112
2,687794,32423,1,69,54,1,0.014085,0.008197,0.234594


### Saving Train Dataframe with Undirected Features to CSV file

In [17]:
# Timestamp (YYYYMMDD_HHMM) used to version the output file name.
today_date = datetime.now().strftime("%Y%m%d_%H%M")
# Use a forward slash, matching the input paths above: it works on every OS,
# whereas "data\{...}" relies on an invalid "\{" escape and, outside Windows,
# writes a file literally named "data\<timestamp>_..." instead of into data/.
train_df.to_csv(f"data/{today_date}_train_undirected.csv", index=False)