In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm_notebook as tqdm

In [2]:
# Parse the training adjacency list. Each non-empty line is split on
# whitespace: the first number is a source node and the remaining numbers are
# the nodes it points to.
#   node_child_counts : source id (int) -> number of sinks listed on its line
#   node_child_sets   : source id (int) -> set of sink ids, kept as strings
#   items             : flat list of (source, sink) integer edge tuples
node_child_counts = {}
node_child_sets = {}
items = []
with open('../data/train.txt', 'rt') as f:
    for line in f:
        # split() with no argument also discards the trailing newline, so the
        # last sink id is stored as a clean token (previously it kept '\n').
        numbers = line.split()
        if not numbers:
            # Blank line (e.g. at end of file): nothing to parse.
            continue
        source = int(numbers[0])
        sinks = numbers[1:]
        node_child_counts[source] = len(sinks)
        node_child_sets[source] = set(sinks)
        items.extend((source, int(sink)) for sink in sinks)
len(items)

24004361

In [3]:
# Directed graph of the training edges; nodes are created implicitly from the
# (source, sink) tuples collected in `items`.
DG = nx.DiGraph(items)

In [4]:
# Undirected view of the training graph (as a separate graph object); the
# common-neighbour lookups below are run against this graph.
G = DG.to_undirected()

#### Taking the (Source, Sink) tuples from the test data, finding their common neighbors and storing them in a dataframe `test_df`

In [5]:
# Load the (Source, Sink) pairs of the test set as a list of records.
# `usecols` only filters columns and does not control their order, so the
# columns are selected explicitly to guarantee that each record unpacks as
# (source, sink) regardless of the column order in the CSV file.
test_edges = list(
    pd.read_csv('../data/testData_NetworkX_analysis_01Sep18.csv', usecols=['Source', 'Sink'])
    .loc[:, ['Source', 'Sink']]
    .to_records(index=False)
)

In [6]:
# For every test pair, collect the nodes adjacent to both endpoints in the
# undirected graph. Each entry is
# (source, sink, list_of_common_neighbors, number_of_common_neighbors).
common_neighbors = []
for source, sink in tqdm(test_edges, total=len(test_edges)):
    # Materialise the generator once and reuse it for both the list and its
    # length instead of querying the graph twice per pair.
    shared = list(nx.common_neighbors(G, source, sink))
    common_neighbors.append((source, sink, shared, len(shared)))

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [7]:
# Tabulate the per-pair common-neighbour results, one row per test edge.
test_df = pd.DataFrame.from_records(
    data=common_neighbors,
    columns=[
        'Source',
        'Sink',
        'list_of_common_neighbors',
        'number_of_common_neighbors',
    ],
)
test_df.head()

Unnamed: 0,Source,Sink,list_of_common_neighbors,number_of_common_neighbors
0,2184483,1300190,[],0
1,3151356,1452193,"[2120801, 36596, 404943, 2809458]",4
2,1579396,193159,[],0
3,1406432,2481036,"[2152768, 2789436, 4720169, 2307937, 2385853, ...",7
4,2389638,593017,"[20388, 541698, 1565469, 282738, 82340, 2766242]",6


Storing it in a file: Common Neighbors

In [8]:
# Persist the common-neighbour table; the DataFrame index is not written.
test_df.to_csv('../data/testCNB.csv', index = False)

#### Reading the training dataset to build the comparison set of edge strings (column `string`), using it to calculate the `class2features`, and storing them in the DataFrame `class2features_df`

In [9]:
# Load the training table and build a set of its distinct `string` values so
# that the membership tests in the feature loop are constant-time.
train_df = pd.read_csv('../data/df.csv')
edgeStringSet = {edge_string for edge_string in train_df['string'].unique()}

In [10]:
# For every test pair, count how many of its common neighbours satisfy each of
# four edge-string patterns, then normalise the counts by the number of common
# neighbours.
# NOTE(review): assumes `edgeStringSet` holds 'u,v' strings describing directed
# training edges u -> v -- confirm against how df.csv was generated.
#   t1: 'source,neighbor' and 'neighbor,source' are both present
#   t2: 'source,neighbor' and 'sink,neighbor'   are both present
#   t3: 'neighbor,source' and 'neighbor,sink'   are both present
#   t4: 'neighbor,source' and 'sink,neighbor'   are both present
class2features = []
for source, sink, neighbors, num_cnb in tqdm(common_neighbors, total=len(common_neighbors)):
    t1 = t2 = t3 = t4 = 0
    # The endpoint ids do not change inside the inner loop; convert them once.
    source_str = str(source)
    sink_str = str(sink)
    for item in neighbors:
        item_str = str(item)
        # Look each candidate edge string up once and reuse the result.
        has_source2neighbor = (source_str + ',' + item_str) in edgeStringSet
        has_sink2neighbor = (sink_str + ',' + item_str) in edgeStringSet
        has_neighbor2source = (item_str + ',' + source_str) in edgeStringSet
        has_neighbor2sink = (item_str + ',' + sink_str) in edgeStringSet
        if has_source2neighbor and has_neighbor2source:
            t1 += 1
        if has_source2neighbor and has_sink2neighbor:
            t2 += 1
        if has_neighbor2source and has_neighbor2sink:
            t3 += 1
        if has_neighbor2source and has_sink2neighbor:
            t4 += 1
    if num_cnb == 0:
        # No common neighbours: the ratios are defined as 0 to avoid dividing by zero.
        ratios = (0, 0, 0, 0)
    else:
        ratios = (t1 / num_cnb, t2 / num_cnb, t3 / num_cnb, t4 / num_cnb)
    class2features.append((source, sink, neighbors, num_cnb, t1, t2, t3, t4) + ratios)

In [11]:
# One row per test pair: raw pattern counts (t1..t4) followed by the same
# counts divided by the number of common neighbours.
class2features_df = pd.DataFrame.from_records(
    data=class2features,
    columns=[
        'source', 'sink', 'common_neighbors', 'num_cnb',
        't1', 't2', 't3', 't4',
        't1BYnum_cnb', 't2BYnum_cnb', 't3BYnum_cnb', 't4BYnum_cnb',
    ],
)
class2features_df.head()

Unnamed: 0,source,sink,common_neighbors,num_cnb,t1,t2,t3,t4,t1BYnum_cnb,t2BYnum_cnb,t3BYnum_cnb,t4BYnum_cnb
0,2184483,1300190,[],0,0,0,0,0,0.0,0.0,0.0,0.0
1,3151356,1452193,"[2120801, 36596, 404943, 2809458]",4,1,0,3,0,0.25,0.0,0.75,0.0
2,1579396,193159,[],0,0,0,0,0,0.0,0.0,0.0,0.0
3,1406432,2481036,"[2152768, 2789436, 4720169, 2307937, 2385853, ...",7,1,5,1,0,0.142857,0.714286,0.142857,0.0
4,2389638,593017,"[20388, 541698, 1565469, 282738, 82340, 2766242]",6,1,6,0,1,0.166667,1.0,0.0,0.166667


To get ID column for test dataset

In [12]:
# Tab-separated public test file; it supplies the `Id` of each (Source, Sink) pair.
testID = pd.read_csv('../data/test-public.txt', delimiter='\t')
testID.head()

Unnamed: 0,Id,Source,Sink
0,1,2184483,1300190
1,2,3151356,1452193
2,3,1579396,193159
3,4,1406432,2481036
4,5,2389638,593017


In [32]:
# Attach the test `Id` to every feature row by matching on the (source, sink)
# pair; only pairs present in both tables are kept.
class2features_df = class2features_df.merge(
    testID,
    how='inner',
    left_on=['source', 'sink'],
    right_on=['Source', 'Sink'],
)
class2features_df.head()

Unnamed: 0,source,sink,common_neighbors,num_cnb,t1,t2,t3,t4,t1BYnum_cnb,t2BYnum_cnb,t3BYnum_cnb,t4BYnum_cnb,Id,Source,Sink
0,2184483,1300190,[],0,0,0,0,0,0.0,0.0,0.0,0.0,1,2184483,1300190
1,3151356,1452193,"[2120801, 36596, 404943, 2809458]",4,1,0,3,0,0.25,0.0,0.75,0.0,2,3151356,1452193
2,1579396,193159,[],0,0,0,0,0,0.0,0.0,0.0,0.0,3,1579396,193159
3,1406432,2481036,"[2152768, 2789436, 4720169, 2307937, 2385853, ...",7,1,5,1,0,0.142857,0.714286,0.142857,0.0,4,1406432,2481036
4,2389638,593017,"[20388, 541698, 1565469, 282738, 82340, 2766242]",6,1,6,0,1,0.166667,1.0,0.0,0.166667,5,2389638,593017


In [33]:
# The capitalised key columns brought in by the merge duplicate `source`/`sink`.
class2features_df = class2features_df.drop(columns=['Source', 'Sink'])
class2features_df.head()

Unnamed: 0,source,sink,common_neighbors,num_cnb,t1,t2,t3,t4,t1BYnum_cnb,t2BYnum_cnb,t3BYnum_cnb,t4BYnum_cnb,Id
0,2184483,1300190,[],0,0,0,0,0,0.0,0.0,0.0,0.0,1
1,3151356,1452193,"[2120801, 36596, 404943, 2809458]",4,1,0,3,0,0.25,0.0,0.75,0.0,2
2,1579396,193159,[],0,0,0,0,0,0.0,0.0,0.0,0.0,3
3,1406432,2481036,"[2152768, 2789436, 4720169, 2307937, 2385853, ...",7,1,5,1,0,0.142857,0.714286,0.142857,0.0,4
4,2389638,593017,"[20388, 541698, 1565469, 282738, 82340, 2766242]",6,1,6,0,1,0.166667,1.0,0.0,0.166667,5


Saving features to file

In [35]:
# Write the class 2 features (with the test Id) to disk, without the index.
class2features_df.to_csv('../data/2k_TEST_class2features.csv', index=False)

Reading the first 8 features dataset (class 1 features) and removing the unnecessary columns before joining it with the `class2features` set

In [36]:
# Load the previously saved class 1 feature table for the same test pairs.
class1features_df = pd.read_csv('../data/2k_TEST_class1features.csv')

In [37]:
# Remove the columns that are not needed for the join with the class 2 features.
class1features_df = class1features_df.drop(columns=['AAprediction', 'Id', 'common_neighbors'])
class1features_df.head()

Unnamed: 0,Source,Sink,source_in_degree,source_out_degree,sink_in_degree,sink_out_degree,source_outBYin_ratio,source_inBYout_ratio,sink_outBYin_ratio,sink_inBYout_ratio
0,2184483,1300190,102,83,3,0,0.813725,1.228916,0.0,0.0
1,3151356,1452193,39,340,289,0,8.717949,0.114706,0.0,0.0
2,1579396,193159,13,208,2,0,16.0,0.0625,0.0,0.0
3,1406432,2481036,16,84,24,14,5.25,0.190476,0.583333,1.714286
4,2389638,593017,30,267,165,58,8.9,0.11236,0.351515,2.844828


### Merging class 1 and class 2 features

In [38]:
# Join class 2 and class 1 features on the (source, sink) pair.
class1and2features = class2features_df.merge(
    class1features_df,
    how='inner',
    left_on=['source', 'sink'],
    right_on=['Source', 'Sink'],
)

In [39]:
# Drop the right-hand key columns, which duplicate `source`/`sink` after the merge.
class1and2features = class1and2features.drop(columns=['Source', 'Sink'])

In [40]:
# Preview the combined class 1 + class 2 feature table.
class1and2features.head()

Unnamed: 0,source,sink,common_neighbors,num_cnb,t1,t2,t3,t4,t1BYnum_cnb,t2BYnum_cnb,...,t4BYnum_cnb,Id,source_in_degree,source_out_degree,sink_in_degree,sink_out_degree,source_outBYin_ratio,source_inBYout_ratio,sink_outBYin_ratio,sink_inBYout_ratio
0,2184483,1300190,[],0,0,0,0,0,0.0,0.0,...,0.0,1,102,83,3,0,0.813725,1.228916,0.0,0.0
1,3151356,1452193,"[2120801, 36596, 404943, 2809458]",4,1,0,3,0,0.25,0.0,...,0.0,2,39,340,289,0,8.717949,0.114706,0.0,0.0
2,1579396,193159,[],0,0,0,0,0,0.0,0.0,...,0.0,3,13,208,2,0,16.0,0.0625,0.0,0.0
3,1406432,2481036,"[2152768, 2789436, 4720169, 2307937, 2385853, ...",7,1,5,1,0,0.142857,0.714286,...,0.0,4,16,84,24,14,5.25,0.190476,0.583333,1.714286
4,2389638,593017,"[20388, 541698, 1565469, 282738, 82340, 2766242]",6,1,6,0,1,0.166667,1.0,...,0.166667,5,30,267,165,58,8.9,0.11236,0.351515,2.844828


Getting the Adamic-Adar & Jaccard coefficients for the test edges

In [42]:
# Load the per-pair link-prediction scores, keep only the pair keys plus the
# two score columns, and rename `JA` to match the `AAprediction` naming.
link_prediction_coef = (
    pd.read_csv('../data/testData_NetworkX_analysis_01Sep18.csv')
    .drop(columns=['PA', 'Id', 'common_neighbors'])
    .rename(columns={'JA': 'JAprediction'})
)
link_prediction_coef.head()

Unnamed: 0,Source,Sink,AAprediction,JAprediction
0,2184483,1300190,0.0,0.0
1,3151356,1452193,0.407705,0.00626
2,1579396,193159,0.0,0.0
3,1406432,2481036,1.238898,0.0625
4,2389638,593017,0.802812,0.012072


In [43]:
# Add the link-prediction scores to the combined feature table, matching on
# the (source, sink) pair.
test_total = class1and2features.merge(
    link_prediction_coef,
    how='inner',
    left_on=['source', 'sink'],
    right_on=['Source', 'Sink'],
)
test_total.head()

Unnamed: 0,source,sink,common_neighbors,num_cnb,t1,t2,t3,t4,t1BYnum_cnb,t2BYnum_cnb,...,sink_in_degree,sink_out_degree,source_outBYin_ratio,source_inBYout_ratio,sink_outBYin_ratio,sink_inBYout_ratio,Source,Sink,AAprediction,JAprediction
0,2184483,1300190,[],0,0,0,0,0,0.0,0.0,...,3,0,0.813725,1.228916,0.0,0.0,2184483,1300190,0.0,0.0
1,3151356,1452193,"[2120801, 36596, 404943, 2809458]",4,1,0,3,0,0.25,0.0,...,289,0,8.717949,0.114706,0.0,0.0,3151356,1452193,0.407705,0.00626
2,1579396,193159,[],0,0,0,0,0,0.0,0.0,...,2,0,16.0,0.0625,0.0,0.0,1579396,193159,0.0,0.0
3,1406432,2481036,"[2152768, 2789436, 4720169, 2307937, 2385853, ...",7,1,5,1,0,0.142857,0.714286,...,24,14,5.25,0.190476,0.583333,1.714286,1406432,2481036,1.238898,0.0625
4,2389638,593017,"[20388, 541698, 1565469, 282738, 82340, 2766242]",6,1,6,0,1,0.166667,1.0,...,165,58,8.9,0.11236,0.351515,2.844828,2389638,593017,0.802812,0.012072


In [44]:
# Drop the right-hand key columns, which duplicate `source`/`sink` after the merge.
test_total = test_total.drop(columns=['Source', 'Sink'])
test_total.head()

Unnamed: 0,source,sink,common_neighbors,num_cnb,t1,t2,t3,t4,t1BYnum_cnb,t2BYnum_cnb,...,source_in_degree,source_out_degree,sink_in_degree,sink_out_degree,source_outBYin_ratio,source_inBYout_ratio,sink_outBYin_ratio,sink_inBYout_ratio,AAprediction,JAprediction
0,2184483,1300190,[],0,0,0,0,0,0.0,0.0,...,102,83,3,0,0.813725,1.228916,0.0,0.0,0.0,0.0
1,3151356,1452193,"[2120801, 36596, 404943, 2809458]",4,1,0,3,0,0.25,0.0,...,39,340,289,0,8.717949,0.114706,0.0,0.0,0.407705,0.00626
2,1579396,193159,[],0,0,0,0,0,0.0,0.0,...,13,208,2,0,16.0,0.0625,0.0,0.0,0.0,0.0
3,1406432,2481036,"[2152768, 2789436, 4720169, 2307937, 2385853, ...",7,1,5,1,0,0.142857,0.714286,...,16,84,24,14,5.25,0.190476,0.583333,1.714286,1.238898,0.0625
4,2389638,593017,"[20388, 541698, 1565469, 282738, 82340, 2766242]",6,1,6,0,1,0.166667,1.0,...,30,267,165,58,8.9,0.11236,0.351515,2.844828,0.802812,0.012072


In [45]:
# Sanity check: (number of rows, number of columns) of the final feature table.
test_total.shape

(2000, 23)

In [46]:
# List the columns of the final feature table.
test_total.columns

Index(['source', 'sink', 'common_neighbors', 'num_cnb', 't1', 't2', 't3', 't4',
       't1BYnum_cnb', 't2BYnum_cnb', 't3BYnum_cnb', 't4BYnum_cnb', 'Id',
       'source_in_degree', 'source_out_degree', 'sink_in_degree',
       'sink_out_degree', 'source_outBYin_ratio', 'source_inBYout_ratio',
       'sink_outBYin_ratio', 'sink_inBYout_ratio', 'AAprediction',
       'JAprediction'],
      dtype='object')

In [47]:
# Write the complete test feature table to disk, without the index.
test_total.to_csv('../data/2k_TEST_All_features.csv', index=False)