In [3]:
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm_notebook as tqdm

In [4]:
# Load the training edge table. Later cells read its 'Source', 'Sink' and
# 'string' columns; 'string' is presumably the "source,sink" key of each
# edge -- TODO confirm against whatever produced df.csv.
trainingdf = pd.read_csv('../data/df.csv')

In [5]:
# Distinct endpoints on each side of the training edges.
unique_sources = trainingdf['Source'].unique()
unique_sinks = trainingdf['Sink'].unique()

# Set of edge keys for O(1) membership tests when generating fake edges.
edgeStringSet = set(trainingdf['string'].unique())

# Report how many distinct sources and sinks there are (sources first).
print(unique_sources.size)
print(unique_sinks.size)

19570
4867136


Fake Edges - Mapping 1 Source -> 10 Sinks, ensuring the generated edges don't exist in the training dataset

In [6]:
# Build negative examples: for every source node, draw 10 random sinks such
# that the (source, sink) pair is not a known training edge. Each record is
# (source, sink, 0), where 0 is the label stored in the 'Value' column.
fakeData = []
num_sinks = len(unique_sinks)
for source_node in unique_sources:
    counter = 0
    # Sinks already used for this source, so the same fake edge is never
    # emitted twice (the random draw is with replacement).
    chosen_sinks = set()
    while counter < 10:
        sink = unique_sinks[np.random.randint(num_sinks)]
        if sink in chosen_sinks:
            continue
        # Assumes entries of edgeStringSet are formatted "source,sink" --
        # TODO confirm against how the 'string' column was built.
        compareEdge = str(source_node) + ',' + str(sink)
        if compareEdge not in edgeStringSet:
            chosen_sinks.add(sink)
            fakeData.append((source_node, sink, 0))
            counter += 1
fakeDataDF = pd.DataFrame.from_records(fakeData, columns=['Source', 'Sink', 'Value'])
fakeDataDF.head()

Unnamed: 0,Source,Sink,Value
0,4066935,1584629,0
1,4066935,2205267,0
2,4066935,633790,0
3,4066935,1771710,0
4,4066935,4378825,0


<a id='the_destination'></a>True Edges - Random sampling from Training Dataset of 24m edges

In [7]:
# Build positive examples: sample 195705 training edges without replacement
# and require at least 195700 distinct edge keys among them.
#
# The seed is advanced on every retry. Re-sampling with the same random_state
# returns the identical sample, so retrying with a fixed seed could never
# succeed after a first failure and would loop forever. The first attempt
# still uses the original seed, so a run that succeeds immediately is unchanged.
base_seed = 470350637
max_attempts = 100
for attempt in range(max_attempts):
    sampled = trainingdf.sample(195705, replace=False, random_state=base_seed + attempt)
    if sampled['string'].nunique() >= 195700:
        break
else:
    raise RuntimeError(
        'Could not draw a sample with at least 195700 unique edges '
        'after %d attempts' % max_attempts
    )

# Drop repeated edges, discard the helper key column and label the rows as real edges.
trueDataDF = (
    sampled
    .drop_duplicates(subset=['string'])
    .drop(columns=['string'])
    .assign(Value=1)
)
trueDataDF.head()

Unnamed: 0,Source,Sink,Value
21810337,1709040,1695026,1
720713,20388,4700562,1
3949681,4067692,2495732,1
19998727,2912811,345595,1
7084186,3077895,3951788,1


Combining both to get the aggregated, randomly generated dataset

In [8]:
# Stack the real edges on top of the fake ones.
generatedDataDF = pd.concat((trueDataDF, fakeDataDF), axis=0)

# Strip the label column, then turn each remaining row into a plain tuple
# so the pairs can be handed to the graph functions below.
generatedDataDFtemp = generatedDataDF.drop(columns=['Value'])
generatedDataList = generatedDataDFtemp.to_records(index=False).tolist()

len(generatedDataList)

391400

NetworkX

In [9]:
# Parse the tab-separated adjacency list: each line is a source id followed
# by the ids of its sinks.
#   node_child_counts: source id (int) -> number of sinks listed on its line
#   node_child_sets:   source id (int) -> set of sink ids, kept as strings
#   items:             (source, sink, 1) triples used to build the graph
node_child_counts = {}
node_child_sets = {}
items = []
with open('../data/train.txt', 'rt') as f:
    for line in f:
        # Strip the line terminator before splitting; otherwise the last
        # sink string of every line keeps a trailing '\n' inside
        # node_child_sets and never matches a clean id string.
        line = line.rstrip('\r\n')
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in int()
        numbers = line.split('\t')
        source = int(numbers[0])
        children = numbers[1:]
        node_child_counts[source] = len(children)
        node_child_sets[source] = set(children)
        # Constant weight of 1 so the graph can be built with weighted edges.
        items.extend((source, int(sink), 1) for sink in children)
len(items)

24004361

Creating DirectedGraph and converting it to an Undirected one for applying functions

In [10]:
# Directed graph of the training edges; each (source, sink, 1) triple becomes
# an edge whose 'weight' attribute is 1.
DG = nx.DiGraph()
DG.add_weighted_edges_from(items, weight='weight')

In [11]:
# Undirected copy of the graph; the Adamic-Adar and common-neighbour cells
# below are run on G.
G = DG.to_undirected()

In [12]:
# Drop the notebook's reference to the directed graph now that G exists.
del DG # Clearing memory

Adamic Adar Coefficient

In [13]:
# Accumulator for (source, sink, score) triples, filled by the next cell.
AAlist = list()

# Iterator of Adamic-Adar scores for every candidate pair; nothing is
# computed until it is consumed.
AApreds = nx.adamic_adar_index(G, ebunch=generatedDataList)

In [None]:
# Materialise the Adamic-Adar scores as (source, sink, score) triples.
#
# The iterator is created here so the cell can be re-run on its own: an
# already-consumed iterator would otherwise yield nothing the second time.
# The progress total comes from the data rather than a hard-coded count, and
# the stray decrement of an unrelated `counter` variable has been removed.
AApreds = nx.adamic_adar_index(G, generatedDataList)
AAlist = [
    (source, sink, prediction)
    for source, sink, prediction in tqdm(AApreds, total=len(generatedDataList))
]

HBox(children=(IntProgress(value=0, max=391400), HTML(value='')))

In [None]:
# Tabulate the Adamic-Adar triples, one row per candidate pair.
aa_columns = ['Source', 'Sink', 'AAprediction']
AAdf = pd.DataFrame.from_records(AAlist, columns=aa_columns)
AAdf.head()

Common Neighbours

In [None]:
# Count the common neighbours in G of every candidate (source, sink) pair.
# The neighbours are counted by iterating rather than via len(list(...)),
# which avoids building a throwaway list per pair.
common_neighbor_list = [
    (source, sink, sum(1 for _ in nx.common_neighbors(G, source, sink)))
    for source, sink in tqdm(generatedDataList, total=len(generatedDataList))
]

# The class is `pd.DataFrame`; the previous spelling `pd.Dataframe` raised
# AttributeError only after the whole loop above had already run.
common_neighbor_df = pd.DataFrame.from_records(
    common_neighbor_list,
    columns=['Source', 'Sink', 'common_neighbor'],
)

In [None]:
# Katz centrality of every node in G, using the numpy-based solver with its
# default parameters.
# NOTE(review): this solver presumably works on a dense adjacency matrix; with
# the millions of nodes reported earlier in this notebook that is unlikely to
# fit in memory -- confirm feasibility before running.
katz = nx.katz_centrality_numpy(G)