In [1]:
import pandas as pd
import csv
import numpy as np
import random
import numpy as np
from datetime import datetime
import networkx as nx

### Loading Edges Dataframe

In [37]:
# Load the sampled edge table (source node, sink node, label) from disk.
edges_csv_path = "data/20240301_2119_sampled_edges.csv"
edges_df = pd.read_csv(edges_csv_path)
edges_df.head(3)

Unnamed: 0,source,sink,label
0,1276050,4115397,1
1,376726,652404,1
2,4243429,2819456,1


In [38]:
# (number of rows, number of columns) of the loaded edge table.
edges_df.shape

(143047, 3)

In [39]:
# Class balance: number of rows for each distinct label value.
edges_df['label'].value_counts()

label
1    73047
0    70000
Name: count, dtype: int64

### Loading Sampled Graph

In [40]:
# Read the sampled graph from GraphML, converting node ids to int so they can
# be looked up with the values stored in the edge table.
graph_path = "data/20240301_2119_sampled_graph.graphml"
G = nx.read_graphml(graph_path, node_type=int)
print(G.number_of_edges())

138794


### Features

In [41]:
# Work on a copy so the feature columns added below do not modify edges_df.
train_df = edges_df.copy()

In [42]:
# (source, sink) pairs in row order; passed to the networkx link-prediction
# functions below as the list of node pairs to score.
edges_list = [
    (src, dst)
    for src, dst in zip(train_df['source'].tolist(), train_df['sink'].tolist())
]

#### Feature 1: Source Degree

In [43]:
# Degree in G of the source endpoint of every row.
source_nodes = train_df['source']
train_df['source_degree'] = source_nodes.map(lambda node: G.degree(node))
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree
0,1276050,4115397,1,6
1,376726,652404,1,3
2,4243429,2819456,1,8


#### Feature 2: Sink Degree

In [44]:
# Degree in G of the sink endpoint of every row.
sink_nodes = train_df['sink']
train_df['sink_degree'] = sink_nodes.map(lambda node: G.degree(node))
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree
0,1276050,4115397,1,6,4
1,376726,652404,1,3,4
2,4243429,2819456,1,8,7


#### Feature 3: Common Neighbours

In [45]:
# For each (source, sink) pair, count the nodes that nx.common_neighbors
# reports as shared between the two endpoints.
train_df['common_neighbours'] = [
    sum(1 for _shared in nx.common_neighbors(G, src, dst))
    for src, dst in edges_list
]
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours
0,1276050,4115397,1,6,4,0
1,376726,652404,1,3,4,0
2,4243429,2819456,1,8,7,0


#### Feature 4: Resource Allocation Index

In [46]:
# The call yields one (u, v, score) triple per pair in edges_list; keep only
# the score so the list lines up row-for-row with train_df.
ra_index_generator = nx.resource_allocation_index(G, edges_list)
ra_index = [triple[2] for triple in ra_index_generator]

In [47]:
# Add the resource-allocation scores as a column (one value per row).
train_df = train_df.assign(ra_index=ra_index)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index
0,1276050,4115397,1,6,4,0,0.0
1,376726,652404,1,3,4,0,0.0
2,4243429,2819456,1,8,7,0,0.0


#### Feature 5: Jaccard Coefficient

In [48]:
# The call yields one (u, v, coefficient) triple per pair in edges_list; only
# the coefficient is kept, in the same order as the rows of train_df.
jaccard_coef_generator = nx.jaccard_coefficient(G, edges_list)
jaccard_coef = [score for (_u, _v, score) in jaccard_coef_generator]

In [49]:
# Add the Jaccard coefficients as a column (one value per row).
train_df = train_df.assign(jaccard_coef=jaccard_coef)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef
0,1276050,4115397,1,6,4,0,0.0,0.0
1,376726,652404,1,3,4,0,0.0,0.0
2,4243429,2819456,1,8,7,0,0.0,0.0


#### Feature 6: Adamic-Adar Index

In [50]:
# The call yields one (u, v, score) triple per pair in edges_list; collect the
# scores in row order.
aa_index_generator = nx.adamic_adar_index(G, edges_list)
aa_index = [pair_score[2] for pair_score in aa_index_generator]

In [51]:
# Add the Adamic-Adar scores as a column (one value per row).
train_df = train_df.assign(aa_index=aa_index)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index
0,1276050,4115397,1,6,4,0,0.0,0.0,0.0
1,376726,652404,1,3,4,0,0.0,0.0,0.0
2,4243429,2819456,1,8,7,0,0.0,0.0,0.0


#### Feature 7: Preferential Attachment

In [52]:
# The call yields one (u, v, score) triple per pair in edges_list; keep the
# score only, in row order.
pa_generator = nx.preferential_attachment(G, edges_list)
pref_attach = [score for (_u, _v, score) in pa_generator]

In [53]:
# Add the preferential-attachment scores as a column (one value per row).
train_df = train_df.assign(pref_attach=pref_attach)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index,pref_attach
0,1276050,4115397,1,6,4,0,0.0,0.0,0.0,24
1,376726,652404,1,3,4,0,0.0,0.0,0.0,12
2,4243429,2819456,1,8,7,0,0.0,0.0,0.0,56


#### Feature 8: Katz Centrality (Source Node)

In [27]:
# katz_centrality_numpy returns a node -> centrality mapping (despite the
# *_generator name). Rebuild it with keys in ascending node order.
kc_generator = nx.katz_centrality_numpy(G)
katz_cent = dict(sorted(kc_generator.items()))

In [35]:
# Look up the Katz centrality of each row's source node; a node missing from
# katz_cent raises KeyError.
# NOTE(review): only the source endpoint is used here, unlike the degree
# features which have a source and a sink column — confirm this is intended.
train_df['katz_cent'] = train_df['source'].apply(katz_cent.__getitem__)
train_df.head(3)

Unnamed: 0,source,sink,label,source_degree,sink_degree,common_neighbours,ra_index,jaccard_coef,aa_index,pref_attach,katz_cent
0,1276050,4115397,1,5,4,0,0.0,0.0,0.0,20,0.003418
1,376726,652404,1,2,4,0,0.0,0.0,0.0,8,0.002385
2,4243429,2819456,1,8,2,0,0.0,0.0,0.0,16,0.00472


### Saving Train Dataframe with Undirected Features to CSV file

In [54]:
# Timestamp (YYYYMMDD_HHMM) embedded in the file name so that runs started in
# different minutes do not overwrite each other's output.
today_date = datetime.now().strftime("%Y%m%d_%H%M")

# Use "/" as the separator, matching the "data/..." paths used when loading.
# The previous "data\{...}" form relied on an invalid "\{" escape sequence and,
# outside Windows, wrote a file literally named "data\<timestamp>_..." into the
# working directory instead of into the data/ folder.
train_df.to_csv(f"data/{today_date}_train_undirected.csv", index=False)