In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import time
from tqdm import tqdm_notebook as tqdm

In [5]:
# Load the training edge table; later cells read its 'Source', 'Sink' and 'string' columns.
trainingdf = pd.read_csv('../data/df.csv')

In [6]:
# Distinct node ids appearing on each side of the training edges.
unique_sources = trainingdf['Source'].unique()
unique_sinks = trainingdf['Sink'].unique()
for node_ids in (unique_sources, unique_sinks):
    print(len(node_ids))

# Set of the distinct 'string' values, used for fast membership tests.
# NOTE(review): presumably each value is "source,sink" — confirm against how df.csv was built.
edgeStringSet = set(trainingdf['string'].unique())

19570
4867136


### Fake Edges - Mapping 1 Source -> 10 Sinks, ensuring the generated edges don't exist in the training dataset

In [None]:
# Negative sampling: pair every source node with 10 randomly drawn sinks whose
# "source,sink" key is absent from edgeStringSet, and label each pair with Value 0.
rng = np.random.RandomState(470350637)  # fixed seed so a re-run yields the same fake edges
n_sinks = len(unique_sinks)
fakeData = []
for source_node in unique_sources:
    picked_sinks = set()  # sinks already paired with this source; prevents duplicate fake edges
    counter = 0
    while counter < 10:
        sink = unique_sinks[rng.randint(n_sinks)]
        if sink in picked_sinks:
            continue
        compareEdge = str(source_node) + ',' + str(sink)
        if compareEdge not in edgeStringSet:
            picked_sinks.add(sink)
            fakeData.append((source_node, sink, 0))
            counter += 1
fakeDataDF = pd.DataFrame.from_records(fakeData, columns=['Source','Sink','Value'])
fakeDataDF.head()

### <a id='the_destination'></a>True Edges - Random sampling from Training Dataset of 24m edges

In [None]:
# Positive sampling: draw 195705 training edges and keep the draw only if it
# contains at least 195700 distinct edge keys.
# Each retry uses a different seed (re-sampling with one fixed seed would return
# the identical rows every time), and the number of retries is bounded.
TRUE_SAMPLE_SIZE = 195705
MIN_UNIQUE_EDGES = 195700
BASE_SEED = 470350637
MAX_ATTEMPTS = 10

for attempt in range(MAX_ATTEMPTS):
    sampled = trainingdf.sample(TRUE_SAMPLE_SIZE, replace=False, random_state=BASE_SEED + attempt)
    if sampled['string'].nunique() >= MIN_UNIQUE_EDGES:
        break
else:
    raise RuntimeError(
        'Could not draw %d rows with at least %d unique edges in %d attempts'
        % (TRUE_SAMPLE_SIZE, MIN_UNIQUE_EDGES, MAX_ATTEMPTS)
    )

# De-duplicate on the edge key, drop the helper column, and label the rows as true edges.
trueDataDF = (
    sampled
    .drop_duplicates(subset=['string'])
    .drop(columns=['string'])
    .assign(Value=1)
)
trueDataDF.head()

Combining the true and fake edges into a single generated dataset

In [None]:
# Stack the true (Value 1) and fake (Value 0) edges and snapshot them to disk.
generatedDataDF = pd.concat([trueDataDF,fakeDataDF])
# Written with a '.csv' extension, matching the other exports in this notebook.
generatedDataDF.to_csv('../data/400k_NewDataset' + time.strftime("%c") + '.csv', index = False)

Converting to list for processing against NetworkX models

In [None]:
# Strip the label column, then flatten the remaining columns into a plain
# list of tuples (one tuple per row) for the NetworkX calls.
generatedDataDFtemp = generatedDataDF.drop(columns=['Value'])
generatedDataList = generatedDataDFtemp.to_records(index=False).tolist()
len(generatedDataList)

# NetworkX

In [2]:
# Parse the tab-separated adjacency list: the first field of each line is the
# source node and every following field is one of its sinks.
node_child_counts = {}   # source id -> number of sink fields on its line
node_child_sets = {}     # source id -> set of sink fields (kept as strings)
items = []               # (source, sink, 1) tuples; the 1 is the edge weight
with open('../data/train.txt', 'rt') as f:
    for line in f:
        # Remove the line terminator before splitting so the last sink is not
        # stored in node_child_sets with a trailing '\n'.
        line = line.rstrip()
        if not line:
            continue  # skip blank lines instead of failing in int()
        numbers = line.split('\t')
        source = int(numbers[0])
        node_child_counts[source] = len(numbers)-1
        node_child_sets[source] = set(numbers[1:])
        items.extend((source, int(sink), 1) for sink in numbers[1:])
len(items)

24004361

### Creating a directed graph and converting it to an undirected one for the link-prediction functions

In [3]:
# Build the directed graph from the (source, sink, weight) tuples collected in `items`.
DG = nx.DiGraph()
DG.add_weighted_edges_from(items)

In [7]:
# Undirected copy of DG; the Adamic-Adar and common-neighbour cells below are run on G.
G = DG.to_undirected()

`Clearing memory`

In [None]:
# DG is not referenced by any later cell shown here, so drop the name to let its memory be reclaimed.
del DG

### Adamic Adar Coefficient

In [None]:
# Accumulator for the (source, sink, score) triples collected in the next cell.
AAlist = []
# Adamic-Adar scores for every candidate pair on the undirected graph;
# the result is consumed by iteration in the next cell.
AApreds = nx.adamic_adar_index(G, generatedDataList)

In [None]:
# Drain the Adamic-Adar predictions into AAlist behind a progress bar.
# The bar total is taken from generatedDataList so it always matches the number
# of candidate pairs being scored.
AAlist.extend(tqdm(AApreds, total=len(generatedDataList)))
AAdf = pd.DataFrame.from_records(AAlist,columns=['Source','Sink','AAprediction'])
# Snapshot the scores; the file name carries the current local time.
AAdf.to_csv('../data/400k_NewDataset_AA' + str(time.strftime("%c")) + '.csv', index = False)
AAdf.head()

In [10]:
# Reload the Adamic-Adar scores from disk.
# NOTE(review): the cell that writes these scores appends a time.strftime("%c") stamp to the
# file name, so this un-stamped path presumably points at a manually renamed copy — confirm.
AAdf = pd.read_csv('../data/400k_NewDataset_AA.csv')

### Common Neighbours

In [None]:
# Count the common neighbours of every candidate pair on the undirected graph.
# tqdm wraps the iterable directly, so no manual countdown is needed, and the
# neighbours are counted lazily instead of being materialised into a list.
common_neighbor_list = [
    (source, sink, sum(1 for _ in nx.common_neighbors(G, source, sink)))
    for source, sink in tqdm(generatedDataList, total=len(generatedDataList))
]
common_neighbor_df = pd.DataFrame.from_records(common_neighbor_list,columns=['Source','Sink','common_neighbors'])
# Snapshot the counts; the file name carries the current local time.
common_neighbor_df.to_csv('../data/400k_NewDataset_CN' + str(time.strftime("%c")) + '.csv', index = False)
common_neighbor_df.head()

In [9]:
# Reload the common-neighbour counts from disk.
# NOTE(review): the cell that writes these counts appends a time.strftime("%c") stamp to the
# file name, so this un-stamped path presumably points at a manually renamed copy — confirm.
common_neighbor_df = pd.read_csv('../data/400k_NewDataset_CN.csv')

### Merging

In [7]:
# Outer-join the common-neighbour counts with the Adamic-Adar scores on the edge endpoints.
edge_key = ['Source','Sink']
reducingDF = common_neighbor_df.merge(
    AAdf,
    how='outer',
    left_on=edge_key,
    right_on=edge_key,
)

In [13]:
def validateDF(a, b, edge_strings=None):
    """Return 1 if the edge key "a,b" is present in the edge-key collection, else 0.

    Parameters
    ----------
    a, b
        Source and sink identifiers. Each is converted with ``str`` and the two
        are joined by a comma to form the lookup key.
    edge_strings : container of str, optional
        Collection of edge keys to test against. When omitted, the notebook-level
        ``edgeStringSet`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    int
        1 when the key is found, 0 otherwise.
    """
    # Compare against None (not truthiness) so an explicitly passed empty
    # collection is honoured rather than replaced by the global.
    if edge_strings is None:
        edge_strings = edgeStringSet
    return int((str(a) + ',' + str(b)) in edge_strings)

In [15]:
# Flag each (Source, Sink) pair with 1 if it is a known training edge, else 0.
# The two columns are walked in lockstep, which avoids building a Series per row
# and avoids positional indexing into a label-indexed row.
reducingDF['exist_in_training_dataset'] = [
    validateDF(source, sink)
    for source, sink in zip(reducingDF['Source'], reducingDF['Sink'])
]
# Snapshot the merged feature table; the file name carries the current local time.
reducingDF.to_csv('../data/400k_NewDataset_Final' + str(time.strftime("%c")) + '.csv', index=False)

### Test Dataset

In [8]:
# Load the test-set table (presumably pre-computed NetworkX features, judging by the file name) and preview it.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a saved index —
# confirm whether the file should be read with index_col=0.
testDF = pd.read_csv('../data/testData_NetworkX_analysis_02Sep18.csv')
testDF.head()

Unnamed: 0,Source,Sink,AAprediction,JA,PA,Id,common_neighbors
0,2184483,1300190,0.0,0.0,435,1,0
1,3151356,1452193,0.407705,0.00626,102306,2,4
2,1579396,193159,0.0,0.0,418,3,0
3,1406432,2481036,1.238898,0.0625,2838,4,7
4,2389638,593017,0.802812,0.012072,62196,5,6
