In [1]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
from datetime import datetime
import networkx as nx

### Loading Edges Dataframe

In [2]:
# Read the sampled edge table (one row per candidate edge) and preview the first rows.
edges_df = pd.read_csv(filepath_or_buffer="data/20240303_2343_sampled_edges.csv")
edges_df.head(n=3)

Unnamed: 0,source,sink,label
0,1000879,3408999,1
1,3109148,216786,1
2,4035827,2217836,1


In [3]:
# (number of rows, number of columns) of the loaded edge table.
edges_df.shape

(47515, 3)

In [4]:
# Class balance: how many rows carry each value of the `label` column.
edges_df["label"].value_counts()

label
1    25000
0    22515
Name: count, dtype: int64

### Loading Sampled Graph

In [5]:
# Load the sampled graph, parsing GraphML node ids as ints, then report its edge count.
G = nx.read_graphml("data/20240303_2343_sampled_graph.graphml", node_type=int)
print(G.number_of_edges())

24948


### Features

In [6]:
# Work on an independent copy so feature columns never leak back into edges_df.
train_df = edges_df.copy(deep=True)

In [7]:
# (source, sink) pairs as plain tuples, in row order; reused by every pairwise feature below.
edges_list = list(train_df[['source', 'sink']].itertuples(index=False, name=None))

#### Feature 1: Source Degree

In [8]:
# Degree in G of each row's source node.
train_df['source_degree'] = [G.degree(node) for node in train_df['source'].tolist()]
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree
0,1000879,3408999,1,16
1,3109148,216786,1,8
2,4035827,2217836,1,5


#### Feature 2: Sink Degree

In [9]:
# Degree in G of each row's sink node.
train_df['sink_degree'] = train_df['sink'].map(lambda node: G.degree(node))
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree
0,1000879,3408999,1,16,2
1,3109148,216786,1,8,1
2,4035827,2217836,1,5,14


#### Feature 3: Common Neighbours

In [10]:
# Number of nodes adjacent to both endpoints of each (source, sink) pair.
common_neighbour_counts = []
for src, dst in edges_list:
    shared = nx.common_neighbors(G, src, dst)
    # Count by iteration: the shared neighbours are distinct nodes, so this equals the set size.
    common_neighbour_counts.append(sum(1 for _ in shared))
train_df['common_neighbours'] = common_neighbour_counts
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours
0,1000879,3408999,1,16,2,0
1,3109148,216786,1,8,1,0
2,4035827,2217836,1,5,14,0


#### Feature 4: Resource Allocation Index

In [11]:
# Resource-allocation index per pair; each yielded item is a 3-tuple whose last entry is the score.
ra_index_generator = nx.resource_allocation_index(G, edges_list)
ra_index = [triple[2] for triple in ra_index_generator]

In [12]:
# Attach the resource-allocation scores (same row order as edges_list) and preview.
train_df = train_df.assign(ra_index=ra_index)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index
0,1000879,3408999,1,16,2,0,0.0
1,3109148,216786,1,8,1,0,0.0
2,4035827,2217836,1,5,14,0,0.0


#### Feature 5: Jaccard Coefficient

In [13]:
# Jaccard coefficient per pair; keep only the score from each yielded 3-tuple.
jaccard_coef_generator = nx.jaccard_coefficient(G, edges_list)
jaccard_coef = [score for *_endpoints, score in jaccard_coef_generator]

In [14]:
# Attach the Jaccard scores (same row order as edges_list) and preview.
train_df = train_df.assign(jaccard_coef=jaccard_coef)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef
0,1000879,3408999,1,16,2,0,0.0,0.0
1,3109148,216786,1,8,1,0,0.0,0.0
2,4035827,2217836,1,5,14,0,0.0,0.0


#### Feature 6: Adamic-Adar Index

In [15]:
# Adamic-Adar index per pair; collect the score (third element) of every yielded tuple.
aa_index_generator = nx.adamic_adar_index(G, edges_list)
aa_index = []
for _src, _dst, score in aa_index_generator:
    aa_index.append(score)

In [16]:
# Attach the Adamic-Adar scores (same row order as edges_list) and preview.
train_df = train_df.assign(aa_index=aa_index)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index
0,1000879,3408999,1,16,2,0,0.0,0.0,0.0
1,3109148,216786,1,8,1,0,0.0,0.0,0.0
2,4035827,2217836,1,5,14,0,0.0,0.0,0.0


#### Feature 7: Preferential Attachment

In [17]:
# Preferential-attachment score per pair; the score is the last field of each yielded tuple.
pa_generator = nx.preferential_attachment(G, edges_list)
pref_attach = [item[-1] for item in pa_generator]

In [18]:
# Attach the preferential-attachment scores (same row order as edges_list) and preview.
train_df = train_df.assign(pref_attach=pref_attach)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index,pref_attach
0,1000879,3408999,1,16,2,0,0.0,0.0,0.0,32
1,3109148,216786,1,8,1,0,0.0,0.0,0.0,8
2,4035827,2217836,1,5,14,0,0.0,0.0,0.0,70


#### Feature 8: Katz Centrality

In [19]:
# NOTE(review): Katz centrality is disabled. The recorded output of this cell is a
# KeyboardInterrupt, so the computation was presumably stopped before finishing - confirm
# whether this feature should be dropped or re-enabled with a cheaper approach.
# kc_generator = nx.katz_centrality_numpy(G)
# katz_cent = {}
# for n,kc in sorted(kc_generator.items()):
#     katz_cent[n] = kc


KeyboardInterrupt



In [None]:
# NOTE(review): disabled; `katz_cent` is only built by commented-out code, so this lookup
# cannot run as-is. As written it would use the source node's value only - confirm intent.
# train_df['katz_cent'] = train_df['source'].apply(lambda x : katz_cent[x])
# train_df.head(3)

### Saving Train Dataframe with Undirected Features to CSV file

In [21]:
# Embed the current date and time (minute resolution) in the output filename.
today_date = datetime.now().strftime("%Y%m%d_%H%M")
# Use a forward slash, matching the "data/..." paths used when loading the inputs.
# The previous "data\{...}" form relied on the invalid escape sequence "\{" and only
# pointed inside the data directory on Windows; elsewhere it created a file whose
# name literally contained a backslash in the working directory.
train_df.to_csv(f"data/{today_date}_train_undirected.csv", index=False)