In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import random

In [2]:
%matplotlib inline

In [20]:
# Parse the tab-separated adjacency list: the first field of each line is a
# source node id and the remaining fields are its sink node ids.
#   node_child_counts: source -> number of sinks listed on its line
#   node_child_sets:   source -> set of its sink ids (ints)
#   items:             one (source, sink, 1) record per edge
node_child_counts = {}
node_child_sets = {}
items = []
with open('./train.txt', 'rt') as f:
    for line in f:
        # Drop the line terminator before splitting; otherwise the last sink
        # of every line is stored with a trailing '\n'.
        numbers = line.rstrip('\r\n').split('\t')
        source = int(numbers[0])
        # Convert once so the stored sinks have the same type as the int keys
        # and as the sinks recorded in `items`.
        sinks = [int(sink) for sink in numbers[1:]]
        node_child_counts[source] = len(sinks)
        node_child_sets[source] = set(sinks)
        items.extend((source, sink, 1) for sink in sinks)
len(items)

24004361

For brute force comparing while generating fake nodes

Storing it in a file for future ready access

Reading training dataset from dataframe stored at local file

In [21]:
# Load the pre-built edge table; later cells read its 'Source', 'Sink' and
# 'string' columns.
# NOTE(review): absolute, machine-specific path - will not resolve elsewhere.
df = pd.read_csv('/Users/k/Desktop/data/df.csv')

Creating a set of edge keys so that membership checks during comparison are fast

In [5]:
# Lookup of existing edges, keyed as "<source>,<sink>" - the exact form the
# fake-edge generator builds before testing membership in this set.
# The keys are derived from 'Source' and 'Sink' directly rather than from the
# pre-computed 'string' column: that column appears to hold the two ids joined
# without a separator (see the row displayed later in the notebook), so a
# comma-separated candidate could never match it and no true edge would ever
# be rejected.
combinedStringValues = set(df['Source'].astype(str) + ',' + df['Sink'].astype(str))
len(combinedStringValues)

23946602

In [6]:
# One-column frames holding every distinct edge source and every distinct edge sink.
unique_parent = pd.DataFrame({'Unique_Node': df['Source'].unique()})
unique_children = pd.DataFrame({'Unique_Node': df['Sink'].unique()})

In [7]:
def isParent(x):
    """Return 1 if node id ``x`` occurs as an edge source, else 0.

    Membership is tested against the ``Unique_Node`` column of the global
    ``unique_parent`` frame only. Testing against ``unique_parent.values``
    would scan every column, so once 0/1 flag columns are added to that frame
    an id equal to a flag value would be reported as a parent.
    """
    return int(x in unique_parent['Unique_Node'].values)
    
def isLeafNode(x):
    """Return 1 if node id ``x`` never occurs as an edge source, else 0.

    Only the ``Unique_Node`` column of the global ``unique_parent`` frame is
    consulted; scanning ``unique_parent.values`` would also match the 0/1 flag
    columns that are added to that frame later.
    """
    return int(x not in unique_parent['Unique_Node'].values)
    
def hasParent(x):
    """Return 1 if node id ``x`` occurs as an edge sink, else 0.

    ``x in series`` tests a pandas Series' *index labels*, not its values, so
    the lookup goes through ``.values`` to compare against the sink ids held in
    the ``Unique_Node`` column of the global ``unique_children`` frame.
    """
    return int(x in unique_children['Unique_Node'].values)

## Feature check

In [8]:
# Flag each sink that also appears as a source. `isin` is vectorised and looks
# only at the 'Unique_Node' column of `unique_parent`, so the result does not
# change if this cell is re-run after flag columns have been added to that frame.
unique_children['hasChildren'] = (
    unique_children['Unique_Node'].isin(unique_parent['Unique_Node']).astype(int)
)
# Every row of this frame was taken from the 'Sink' column, so each node has a parent.
unique_children['hasParent'] = 1
unique_children.head()

Unnamed: 0,Unique_Node,hasChildren,hasParent
0,1272125,1,1
1,3105725,1,1
2,2828522,1,1
3,4394015,1,1
4,2367409,1,1


In [9]:
# Number of sinks flagged as having a parent but no children.
len(unique_children[(unique_children['hasChildren']==0)&(unique_children['hasParent']==1)])

4847566

In [10]:
# Flag each source that also appears as a sink. `isin` compares against the
# sink ids themselves (an `in` test on a Series would look at its index labels
# instead of its values).
unique_parent['hasParent'] = (
    unique_parent['Unique_Node'].isin(unique_children['Unique_Node']).astype(int)
)
# Every row of this frame was taken from the 'Source' column, so each node has children.
unique_parent['hasChildren'] = 1
unique_parent.head()

Unnamed: 0,Unique_Node,hasParent,hasChildren
0,4066935,1,1
1,1940058,1,1
2,20388,1,1
3,212805,1,1
4,850459,1,1


How many nodes have parents:
Total number of nodes = <span style="color:red">4867136</span>

#### Following command is for renaming columns of a dataframe:
<br>unique_children.rename(columns={'isParent':'hasChildren'}, inplace=True)

## Generating fake edges according to the logic shown below:
<img src='https://github.com/Vitaly-Yakutenko/sml_project1/blob/master/docs/FakeDataGeneration.jpg?raw=true' height=640 width =480 align=left>

In [11]:
# Candidate sources for fake edges: rows of `unique_parent` flagged as having children.
# NOTE(review): the filter this replaces tested 'hasChildren' == 1 twice; the
# second operand may have been meant to be 'hasParent' - confirm the intent.
set1 = unique_parent.loc[unique_parent['hasChildren'].eq(1)]
set1arr = set1['Unique_Node'].values.flatten()
len(set1arr)

19570

In [12]:
# Candidate sinks for fake edges: nodes with a parent but no children.
set2 = unique_children.loc[
    unique_children['hasChildren'].eq(0) & unique_children['hasParent'].eq(1)
]
set2arr = set2['Unique_Node'].values.flatten()
len(set2arr)

4847566

# FAKE DATA GENERATION

### Generating FAKE EDGES which meet our selection criteria:
<br>< Source, Sink, Label = 0 >

In [13]:
# Rejection-sample fake edges: draw a random source from `set1arr` and a random
# sink from `set2arr`, and keep the pair (label 0) only if its "<source>,<sink>"
# key is not a known edge and has not already been drawn.
counter = 100000
FElist = []
seenFakeKeys = set()  # keys already accepted, so no fake edge is emitted twice
with tqdm(total=counter) as pbar:
    # Compare by value: `counter is not 0` tested object identity, which is not
    # a reliable way to compare integers.
    while counter > 0:
        source = set1arr[random.randrange(len(set1arr))]
        sink = set2arr[random.randrange(len(set2arr))]
        string4Compare = str(source) + ',' + str(sink)
        if string4Compare in combinedStringValues or string4Compare in seenFakeKeys:
            continue
        seenFakeKeys.add(string4Compare)
        FElist.append((source,sink,0))
        counter -= 1
        pbar.update(1)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [14]:
# Tabulate the sampled fake edges; 'Value' holds the label appended to each
# record in FElist.
dfFakeEdge = pd.DataFrame.from_records(FElist, columns=['Source','Sink','Value'])
dfFakeEdge.head()

Unnamed: 0,Source,Sink,Value
0,1403191,3847720,0
1,888595,128444,0
2,1881939,760085,0
3,3676504,3898585,0
4,1712959,3152720,0


In [15]:
# Persist the fake edges without the index column.
# NOTE(review): absolute, machine-specific output path.
dfFakeEdge.to_csv('/Users/k/Desktop/data/fake0.csv', index = False)

### Alternative approach to generate TRUE EDGE data which meets test sink set conditions

In [16]:
# Right-join the edge table onto the sink table (edge 'Sink' == 'Unique_Node'),
# so every edge row carries the flag columns of its sink.
newdf = df.merge(
    unique_children,
    how='right',
    left_on='Sink',
    right_on='Unique_Node',
)

In [17]:
# Keep only edges whose sink has a parent but no children.
newdf = newdf.loc[newdf['hasChildren'].eq(0) & newdf['hasParent'].eq(1)]

In [18]:
# Take a real copy: plain assignment only binds a second name to the same
# frame, so in-place edits to `dk` would also change `newdf`.
dk = newdf.copy()

In [19]:
# Reduce `dk` to the edge columns and label every row as a true edge.
# The frame is rebuilt rather than mutated in place, and missing columns are
# ignored, so re-running this cell yields the same result instead of raising
# on the already-dropped columns.
dk = dk.drop(
    columns=['Unique_Node','hasChildren','hasParent','string'],
    errors='ignore',
).assign(value=1)
dk.head()

Unnamed: 0,Source,Sink,value
20427,4066935,349769,1
20428,2549365,349769,1
20429,4066452,349769,1
20430,2553285,349769,1
20431,428997,349769,1


Rebuilding index after the join operation, we'll be picking data from this dataframe at random

In [20]:
# Replace the index left over from the join/filter with a fresh 0..n-1 range;
# the sampling cell below looks rows up by integer label.
dk = dk.reset_index(drop=True)

In [21]:
# Draw `tcounter` rows of `dk` uniformly at random (repeats are possible) and
# collect each row as a [Source, Sink, value] list.
tcounter = 100000
dataSetSize = len(dk)
trueSet = []
# A fixed-length loop replaces `while tcounter is not 0`, which compared object
# identity rather than value.
for _ in tqdm(range(tcounter)):
    # Label-based lookup; relies on `dk` having a 0..n-1 index.
    tup = list(dk.loc[random.randrange(dataSetSize)])
    trueSet.append(tup)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [22]:
# Tabulate the sampled true edges collected in trueSet.
dfTrueEdge = pd.DataFrame.from_records(trueSet, columns=['Source','Sink','value'])
dfTrueEdge.head()

Unnamed: 0,Source,Sink,value
0,4096612,2983822,1
1,2739890,3529140,1
2,137719,1389087,1
3,1167936,489392,1
4,4202167,4434432,1


In [23]:
# Persist the sampled true edges without the index column.
# NOTE(review): absolute, machine-specific output path.
dfTrueEdge.to_csv('/Users/k/Desktop/data/true1.csv', index = False)

### Test Dataset

In [24]:
# Load the tab-separated public test set (the displayed columns are Id, Source, Sink).
testdf = pd.read_csv('./test-public.txt', sep='\t')
testdf.head()

Unnamed: 0,Id,Source,Sink
0,1,2184483,1300190
1,2,3151356,1452193
2,3,1579396,193159
3,4,1406432,2481036
4,5,2389638,593017


## Random data generation using NetworkX

In [25]:
import networkx as nx

### Undirected Graph - G

In [26]:
# Undirected graph over all training edges; each (source, sink, 1) record in
# `items` becomes an edge with weight 1.
G = nx.Graph()
G.add_weighted_edges_from(items)

### Directed Graph - DG

In [None]:
# Directed graph over all training edges. nx.DiGraph keeps the source -> sink
# orientation of each record; nx.Graph would build a second undirected graph.
DG = nx.DiGraph()
DG.add_weighted_edges_from(items)

The following computes the similarity coefficient for the 100,000 generated fake edges (`dfFakeEdge`), for comparison against the test dataset's values.

In [27]:
# Fake edges without their label column: just the (Source, Sink) pairs.
fkdf = dfFakeEdge.drop(columns='Value')
fkdf.head()

Unnamed: 0,Source,Sink
0,1403191,3847720
1,888595,128444
2,1881939,760085
3,3676504,3898585
4,1712959,3152720


In [28]:
# Convert the pair table into a plain list of (source, sink) tuples.
testList = fkdf.to_records(index=False).tolist()

Making predictions accordingly

In [29]:
# Adamic-Adar scores for the fake-edge pairs on the undirected graph; `preds`
# is consumed by the loop in the next cell, which fills `adamic`.
preds = nx.adamic_adar_index(G, testList)
adamic = []

In [30]:
# Collect the (source, sink, score) triples produced by `preds` into `adamic`.
# The progress total follows the number of candidate pairs instead of a
# hard-coded 100000, and the triples are appended as yielded rather than being
# unpacked and re-packed with a hand-maintained counter.
adamic.extend(tqdm(preds, total=len(fkdf)))

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [31]:
# Tabulate the collected triples; 'Predictions' holds the Adamic-Adar score.
adamdf = pd.DataFrame.from_records(adamic, columns=['Source','Sink','Predictions'])

In [32]:
# NOTE(review): this binds a second name to the same frame (no copy is made),
# so later in-place edits to `newdfs` also change `adamdf`.
newdfs = adamdf
newdfs.head()

Unnamed: 0,Source,Sink,Predictions
0,1403191,3847720,0.077968
1,888595,128444,0.0
2,1881939,760085,0.0
3,3676504,3898585,0.0
4,1712959,3152720,0.0


In [33]:
def pred(x, threshold=0.095):
    """Turn a similarity score into a binary edge prediction.

    Parameters
    ----------
    x : number
        Similarity score to classify.
    threshold : number, optional
        Tuning parameter; scores strictly greater than it are predicted as
        edges. Defaults to 0.095.

    Returns
    -------
    int
        1 if ``x > threshold``, otherwise 0.
    """
    return 1 if x > threshold else 0

In [34]:
# Threshold every score with `pred` to obtain a 0/1 label per pair.
newdfs['value'] = newdfs['Predictions'].map(pred)
newdfs.head()

Unnamed: 0,Source,Sink,Predictions,value
0,1403191,3847720,0.077968,0
1,888595,128444,0.0,0
2,1881939,760085,0.0,0
3,3676504,3898585,0.0,0
4,1712959,3152720,0.0,0


In [35]:
# Keep whatever remains after removing the pair and score columns.
prediction = newdfs.drop(columns=['Source','Sink','Predictions'])

In [36]:
# Add the labels under the column name 'Prediction'.
# NOTE(review): `prediction` still carries 'value' too, so both columns hold
# the same data (see the displayed head).
prediction['Prediction'] = newdfs['value']
prediction.head()

Unnamed: 0,value,Prediction
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


prediction.to_csv('test_predictions.csv',index=False)

In [37]:
# Remove the raw score column in place, leaving Source, Sink and the 0/1 label.
newdfs.drop(columns='Predictions', inplace=True)

Considering only the fake values

In [39]:
# Keep only the pairs that the threshold also labels as non-edges (value == 0).
newdfs2 = newdfs.loc[newdfs['value'].eq(0)]

In [40]:
# Preview the retained fake edges.
newdfs2.head()

Unnamed: 0,Source,Sink,value
0,1403191,3847720,0
1,888595,128444,0
2,1881939,760085,0
3,3676504,3898585,0
4,1712959,3152720,0


Taking a random sample of true edges from the training data, equal in size to the retained fake-edge set

In [45]:
# Sample as many rows from the edge table as there are retained fake edges.
# NOTE(review): no random_state is given, so the sample differs on every run.
true_edges_from_training_data = df.sample(len(newdfs2))

In [46]:
# Remove the helper 'string' column in place; only the edge endpoints remain.
true_edges_from_training_data.drop(columns='string', inplace=True)

In [47]:
# Replace the sampled rows' original index labels with 0..n-1, in place.
true_edges_from_training_data.reset_index(drop=True, inplace=True)

In [48]:
# Label every sampled row as a true edge.
true_edges_from_training_data['value'] = 1

In [49]:
# All labels are 1, so this sum equals the number of sampled true edges.
print(true_edges_from_training_data['value'].sum())
true_edges_from_training_data.head()

95819


Unnamed: 0,Source,Sink,value
0,796367,4712010,1
1,145219,3319040,1
2,2939315,1703302,1
3,2805661,176194,1
4,2763196,2646822,1


Combining False and True edges

In [50]:
# Stack the fake edges on top of the true edges (row-wise concatenation).
train_dataset_TrueAndFalse = pd.concat([newdfs2, true_edges_from_training_data])

In [51]:
# Both inputs carried their own index labels; renumber the stacked rows 0..n-1 in place.
train_dataset_TrueAndFalse.reset_index(inplace=True, drop=True)

In [52]:
# Persist the combined labelled edge list without the index column.
# NOTE(review): absolute, machine-specific output path.
train_dataset_TrueAndFalse.to_csv('/Users/k/Desktop/data/train_dataset_TrueAndFalse.csv', index=False)
train_dataset_TrueAndFalse.head()

Unnamed: 0,Source,Sink,value
0,1403191,3847720,0
1,888595,128444,0
2,1881939,760085,0
3,3676504,3898585,0
4,1712959,3152720,0


In [3]:
# Reload the labelled edge list and the saved per-edge scores from disk, both
# written by other cells of this notebook.
# NOTE(review): absolute, machine-specific paths.
train_dataset_TrueAndFalse = pd.read_csv('/Users/k/Desktop/data/train_dataset_TrueAndFalse.csv')
adamicdf = pd.read_csv('/Users/k/Desktop/data/190ktrainsample.csv')

In [4]:
# Preview the reloaded labelled edge list.
train_dataset_TrueAndFalse.head()

Unnamed: 0,Source,Sink,value
0,1403191,3847720,0
1,888595,128444,0
2,1881939,760085,0
3,3676504,3898585,0
4,1712959,3152720,0


In [5]:
# Preview the reloaded per-edge scores.
adamicdf.head()

Unnamed: 0,Source,Sink,AA
0,1403191,3847720,0.077968
1,888595,128444,0.0
2,1881939,760085,0.0
3,3676504,3898585,0.0
4,1712959,3152720,0.0


Now computing scores for the whole dataset we generated (just under 200,000 edges)

In [53]:
# Plain list of (source, sink) tuples for every labelled edge, label column removed.
hlist = (
    train_dataset_TrueAndFalse
    .drop(columns='value')
    .to_records(index=False)
    .tolist()
)

In [54]:
# Number of (source, sink) pairs to score.
len(hlist)

191638

Applying predictions over the whole lot

In [98]:
# Adamic-Adar scores for every labelled pair on the undirected graph; `fpreds`
# is consumed by the loop in the next cell, which fills `fadamic`.
fpreds = nx.adamic_adar_index(G, hlist)
fadamic = []

# Run following in the end, takes a lot of time to compare stuff - 50% issue problem

In [99]:
# Collect the (source, sink, score) triples produced by `fpreds` into `fadamic`
# with a progress bar; the triples are appended as yielded rather than being
# unpacked and re-packed with a hand-maintained counter.
fadamic.extend(tqdm(fpreds, total=len(hlist)))

HBox(children=(IntProgress(value=0, max=191638), HTML(value='')))




In [None]:
# Tabulate the collected triples ('AA' holds the Adamic-Adar score) and cache
# them on disk without the index column.
# NOTE(review): absolute, machine-specific output path.
adamicdf = pd.DataFrame.from_records(fadamic,columns=['Source','Sink','AA'])
adamicdf.to_csv('/Users/k/Desktop/data/190ktrainsample.csv', index=False)

Merging to attach the label that indicates whether each generated edge is true or fake

In [15]:
# Attach each pair's true/fake label to its score by joining on the pair itself.
# NOTE(review): a (Source, Sink) pair occurring more than once in either frame
# would multiply rows in this key-based join.
adamicdfnew = pd.merge(
    adamicdf,
    train_dataset_TrueAndFalse,
    how='inner',
    on=['Source', 'Sink'],
)

In [33]:
# Compare row counts of the score table and the labelled edge list.
print(len(adamicdf))
print(len(train_dataset_TrueAndFalse))

191638
191638


In [22]:
# Spot-check: look up one specific (Source, Sink) pair in the edge table.
df[(df['Source']==145219)&(df['Sink']==3319040)]

Unnamed: 0,Source,Sink,string
11125198,145219,3319040,1452193319040


In [25]:
# Preview the merged rows labelled as true edges (value == 1).
adamicdfnew[adamicdfnew['value']==1].head()

Unnamed: 0,Source,Sink,AA,value
95819,796367,4712010,0.337515,1
95820,145219,3319040,2.456273,1
95821,2939315,1703302,0.084494,1
95822,2805661,176194,0.478324,1
95823,2763196,2646822,0.718578,1


# Trying out different similarity index for the test dataset

In [59]:
# Work on an independent copy so that edits to `testdfcopy` cannot alter
# `testdf`; plain assignment would only bind a second name to the same frame.
testdfcopy = testdf.copy()
testdfcopy.head()

Unnamed: 0,Id,Source,Sink
0,1,2184483,1300190
1,2,3151356,1452193
2,3,1579396,193159
3,4,1406432,2481036
4,5,2389638,593017


In [66]:
# Test pairs without the 'Id' column, as a plain list of (source, sink) tuples.
testDF_withoutID = testdfcopy.drop(columns='Id')
test_records = testDF_withoutID.to_records(index=False)
test_data_list = test_records.tolist()
len(test_data_list)

2000

### Adamic Adar

In [67]:
# Adamic-Adar scores for the test pairs on the undirected graph; `AA` is
# consumed by the loop in the next cell, which fills `AAlist`.
AA = nx.adamic_adar_index(G, test_data_list)
AAlist = []

In [70]:
# Collect the (source, sink, score) triples produced by `AA` into `AAlist` with
# a progress bar; the triples are appended as yielded rather than being
# unpacked and re-packed with a hand-maintained counter.
AAlist.extend(tqdm(AA, total=len(test_data_list)))

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [73]:
# Tabulate the Adamic-Adar triples collected in AAlist.
AAdf = pd.DataFrame.from_records(AAlist,columns=['Source','Sink','AA'])
AAdf.head()

Unnamed: 0,Source,Sink,AA
0,2184483,1300190,0.0
1,3151356,1452193,0.407705
2,1579396,193159,0.0
3,1406432,2481036,1.238898
4,2389638,593017,0.802812


### Jaccard Coefficient

In [68]:
# Jaccard coefficients for the test pairs on the undirected graph; `JA` is
# consumed by the loop in the next cell, which fills `JAlist`.
JA = nx.jaccard_coefficient(G, test_data_list)
JAlist = []

In [71]:
# Collect the (source, sink, score) triples produced by `JA` into `JAlist` with
# a progress bar; the triples are appended as yielded rather than being
# unpacked and re-packed with a hand-maintained counter.
JAlist.extend(tqdm(JA, total=len(test_data_list)))

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [74]:
# Tabulate the Jaccard triples collected in JAlist.
JAdf = pd.DataFrame.from_records(JAlist,columns=['Source','Sink','JA'])
JAdf.head()

Unnamed: 0,Source,Sink,JA
0,2184483,1300190,0.0
1,3151356,1452193,0.00626
2,1579396,193159,0.0
3,1406432,2481036,0.0625
4,2389638,593017,0.012072


### Preferential Attachment

In [69]:
# Preferential-attachment scores for the test pairs on the undirected graph;
# `PA` is consumed by the loop in the next cell, which fills `PAlist`.
PA = nx.preferential_attachment(G, test_data_list)
PAlist = []

In [72]:
# Collect the (source, sink, score) triples produced by `PA` into `PAlist` with
# a progress bar; the triples are appended as yielded rather than being
# unpacked and re-packed with a hand-maintained counter.
PAlist.extend(tqdm(PA, total=len(test_data_list)))

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [75]:
# Tabulate the preferential-attachment triples collected in PAlist.
PAdf = pd.DataFrame.from_records(PAlist,columns=['Source','Sink','PA'])
PAdf.head()

Unnamed: 0,Source,Sink,PA
0,2184483,1300190,435
1,3151356,1452193,102306
2,1579396,193159,418
3,1406432,2481036,2838
4,2389638,593017,62196


### Integrating

In [76]:
# Put the Adamic-Adar and Jaccard scores side by side, matched on the pair.
AA_JA = pd.merge(AAdf, JAdf, how='inner', on=['Source', 'Sink'])

In [79]:
# Add the preferential-attachment score, matched on the pair.
PA_AA_JA = pd.merge(AA_JA, PAdf, how='inner', on=['Source', 'Sink'])

In [88]:
# Bring back the test-set 'Id' for each pair.
PA_AA_JA = pd.merge(PA_AA_JA, testdfcopy, how='inner', on=['Source', 'Sink'])

In [96]:
# Persist the combined coefficient table without the index column.
# NOTE(review): absolute, machine-specific output path.
PA_AA_JA.to_csv('/Users/k/Desktop/data/coef_comparision_TestDataSet.csv', index=False)
PA_AA_JA.head()

Unnamed: 0,Source,Sink,AA,JA,PA,Id
0,2184483,1300190,0.0,0.0,435,1
1,3151356,1452193,0.407705,0.00626,102306,2
2,1579396,193159,0.0,0.0,418,3
3,1406432,2481036,1.238898,0.0625,2838,4
4,2389638,593017,0.802812,0.012072,62196,5


In [84]:
import seaborn as sns
%matplotlib inline