In [83]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import random

In [72]:
%matplotlib inline

In [3]:
# Parse the tab-separated adjacency list in train.txt. Each line has the form
# "<source>\t<sink>\t<sink>...". For every source this records the number of
# listed children, the set of child tokens, and one (source, sink) tuple per
# listed child in the flat edge list `items`.
node_child_counts = {}
node_child_sets = {}
items = []
with open('./train.txt', 'rt') as f:
    for line in f:
        # Strip the line terminator first; otherwise the last sink of every
        # line is stored in node_child_sets with a trailing newline ('123\n').
        line = line.rstrip('\r\n')
        if not line:
            # Tolerate blank lines instead of crashing on int('').
            continue
        numbers = line.split('\t')
        source = int(numbers[0])
        node_child_counts[source] = len(numbers)-1
        # Children are kept as the string tokens read from the file.
        node_child_sets[source] = set(numbers[1:])
        for sink in numbers[1:]:
            items.append((source, int(sink)))
len(items)

24004361

In [4]:
# Flat edge table: one row per (Source, Sink) tuple collected in `items`.
df = pd.DataFrame(data=items, columns=['Source', 'Sink'])

Unnamed: 0,Source,Sink
0,4066935,1272125
1,4066935,3105725
2,4066935,2828522
3,4066935,4394015
4,4066935,2367409


#### String keys for brute-force comparison while generating fake edges

In [5]:
# Build a "Source,Sink" text key per edge, used later for set-membership tests.
# Vectorised string concatenation replaces the row-wise apply(), which ran a
# Python lambda once per row and relied on positional `x[0]` / `x[1]` lookups
# on a label-indexed row (deprecated Series behaviour in recent pandas).
df['string'] = df['Source'].astype(str) + ',' + df['Sink'].astype(str)

In [None]:
# Preview the first rows of the edge table.
df.head()

Create a set of the edge keys so that membership checks during comparison are fast.

In [6]:
# Hash set of every distinct "Source,Sink" key, used to reject sampled pairs
# that are real edges. `unique()` already returns a 1-D result, so the former
# `.flatten()` only made a redundant copy of the whole array (and `.flatten()`
# does not exist on the extension arrays pandas can return for string columns).
combinedStringValues = set(df['string'].unique())
len(combinedStringValues)

23946602

In [13]:
# Distinct node ids seen on each side of an edge, in order of first appearance,
# each as a single-column frame with a fresh 0..n-1 index.
unique_parent = (
    df['Source'].drop_duplicates().reset_index(drop=True).to_frame(name='Unique_Node')
)
unique_children = (
    df['Sink'].drop_duplicates().reset_index(drop=True).to_frame(name='Unique_Node')
)

In [14]:
def isParent(x):
    """Return 1 if node id ``x`` occurs in ``unique_parent['Unique_Node']``, else 0.

    Membership is tested against the ``Unique_Node`` column only. Scanning
    ``unique_parent.values`` searches every column of the frame, so once 0/1
    flag columns are added to it, ids such as 0 or 1 would match spuriously.
    """
    parent_ids = unique_parent['Unique_Node'].values
    return 1 if x in parent_ids else 0
    
def isLeafNode(x):
    """Return 1 if node id ``x`` does NOT occur in ``unique_parent['Unique_Node']``, else 0.

    Only the ``Unique_Node`` column is searched. Scanning ``unique_parent.values``
    would also search any flag columns on the frame, so ids such as 0 or 1 could
    be wrongly reported as non-leaf.
    """
    parent_ids = unique_parent['Unique_Node'].values
    return 0 if x in parent_ids else 1
    
def hasParent(x):
    """Return 1 if node id ``x`` occurs in ``unique_children['Unique_Node']``, else 0.

    The comparison is made against the column's values. Applying ``in`` directly
    to a pandas Series tests the index labels (here the row numbers), not the
    node ids stored in the column.
    """
    child_ids = unique_children['Unique_Node'].values
    return 1 if x in child_ids else 0

## Feature check

In [15]:
# Flag every sink node. hasChildren is 1 when the node id also occurs in
# unique_parent's Unique_Node column. One hashed isin() on that column replaces
# a full array scan per row through apply(isParent), and cannot be confused by
# other columns of unique_parent.
unique_children['hasChildren'] = (
    unique_children['Unique_Node'].isin(unique_parent['Unique_Node']).astype('int64')
)
# Every node in this frame was taken from the Sink column, so it has a parent.
unique_children['hasParent'] = 1
unique_children.head()

Unnamed: 0,Unique_Node,hasChildren,hasParent
0,1272125,1,1
1,3105725,1,1
2,2828522,1,1
3,4394015,1,1
4,2367409,1,1


In [77]:
# Number of nodes that have a parent but no children of their own.
len(unique_children.query('hasChildren == 0 and hasParent == 1'))

4847566

In [79]:
# Flag every source node. hasParent is 1 when the node id also occurs among the
# sink ids. isin() compares against the column values; the earlier per-row
# `x in Series` test looked at index labels (row numbers) instead of node ids.
unique_parent['hasParent'] = (
    unique_parent['Unique_Node'].isin(unique_children['Unique_Node']).astype('int64')
)
# Every node in this frame was taken from the Source column, so it has children.
unique_parent['hasChildren'] = 1
unique_parent.head()

Unnamed: 0,Unique_Node,hasParent,hasChildren
0,4066935,1,1
1,1940058,1,1
2,20388,1,1
3,212805,1,1
4,850459,1,1


How many nodes have parents:
Total number of nodes = <span style="color:red">4867136</span>

#### Following command is for renaming columns of a dataframe:
<br>unique_children.rename(columns={'isParent':'hasChildren'}, inplace=True)

## Generating fake edges according to the logic shown below:
<img src='https://github.com/Vitaly-Yakutenko/sml_project1/blob/master/docs/FakeDataGeneration.jpg?raw=true' height=960 width =720 align=left>

In [80]:
# Pool of source candidates for fake edges: nodes that have children AND a parent.
# The filter previously tested hasChildren == 1 twice, so hasParent was never
# applied. NOTE(review): hasParent is assumed to be the intended second
# condition, mirroring the set2 filter — confirm against the generation diagram.
set1 = unique_parent[(unique_parent['hasParent']==1) & (unique_parent['hasChildren']==1)]
set1arr = set1['Unique_Node'].values.flatten()
len(set1arr)

19570

In [81]:
# Pool of sink candidates for fake edges: nodes with a parent but no children.
set2 = unique_children.query('hasChildren == 0 and hasParent == 1')
set2arr = set2['Unique_Node'].values.copy()
len(set2arr)

4847566

### Generating FAKE EDGES which meet our selection criteria:
<br>< Source, Sink, Label = 0 >

In [84]:
# Rejection-sample fake edges: draw a random source from set1arr and a random
# sink from set2arr, and keep the pair (with label 0) only if its "Source,Sink"
# key is not among the real edge keys. Stops after `counter` accepted pairs.
counter = 100000
FElist = []
with tqdm(total=counter) as pbar:
    # Compare by value. `is not 0` is an identity test against an int literal:
    # it relies on interpreter int caching and emits a SyntaxWarning on
    # Python 3.8+.
    while counter > 0:
        source = set1arr[random.randrange(len(set1arr))]
        sink = set2arr[random.randrange(len(set2arr))]
        string4Compare = str(source) + ',' + str(sink)
        if string4Compare not in combinedStringValues:
            FElist.append((source,sink,0))
            counter -= 1
            pbar.update(1)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [61]:
# Tabulate the sampled (source, sink, label) tuples; 'Value' holds the label.
dfFakeEdge = pd.DataFrame(FElist, columns=['Source', 'Sink', 'Value'])
dfFakeEdge.head()

Unnamed: 0,Source,Sink,Value
0,33435,1204880,0
1,1249781,1909884,0
2,3511971,174670,0
3,4583130,3252471,0
4,735485,43986,0


In [60]:
# For checking purposes only: no edge in FElist should exist in the training
# dataframe, so this lookup (presumably of one pair taken from FElist) should
# display an empty frame. The second condition has to test 'Sink'; comparing
# 'Source' against two different ids can never match, which made the check
# pass for any pair.
df[(df['Source']==4861782) & (df['Sink']==4567431)]

Unnamed: 0,Source,Sink,string


In [62]:
# Write the fake-edge frame to CSV without the row index.
# NOTE(review): absolute, user-specific path — the target directory must exist
# on the machine running this cell.
dfFakeEdge.to_csv('/Users/k/Desktop/data/fake0.csv', index = False)

### Generating TRUE EDGES which meet our selection criteria:
<br>< Source, Sink, Label = 1 >

### For efficient hashing and indexing during JOIN Operations

Sample generation from the previously generated sets

We want these edges to be <span style="color:green">TRUE</span> so we add 1 from our end.<br>Then joining dataframes to get intersection.

Checking how many edges have been generated and whether those are correct or not, saving to CSV

### Alternative approach to generate TRUE EDGE data which meets test sink set conditions

In [86]:
# Right-join the edges onto the per-node flag table, matching each edge's Sink
# to the node's Unique_Node id.
newdf = df.merge(unique_children, how='right', left_on='Sink', right_on='Unique_Node')

In [87]:
# Keep only the rows whose sink node has a parent but no children.
newdf = newdf.loc[newdf['hasChildren'].eq(0) & newdf['hasParent'].eq(1)]

In [88]:
# Just for checking purposes: look up one specific (Source, Sink) pair in df.
df.loc[df['Source'].eq(72839) & df['Sink'].eq(3851527)]

Unnamed: 0,Source,Sink,string
14117071,72839,3851527,728393851527


In [89]:
# Take a real copy so that later edits to dk cannot change newdf. A plain
# assignment only binds a second name to the same DataFrame object.
dk = newdf.copy()

In [90]:
# Reduce to the edge columns and add the label column value = 1.
# drop() and assign() return a new frame, so this neither mutates the frame dk
# was derived from nor triggers SettingWithCopyWarning on a filtered slice.
dk = dk.drop(columns=['Unique_Node','hasChildren','hasParent','string']).assign(value=1)
dk.head()

Unnamed: 0,Source,Sink,value
20427,4066935,349769,1
20428,2549365,349769,1
20429,4066452,349769,1
20430,2553285,349769,1
20431,428997,349769,1


Rebuilding index after the join operation, we'll be picking data from this dataframe at random

In [91]:
# Replace the index left over from merge/filter with a fresh 0..n-1 index and
# discard the old labels (drop=True), so rows can be looked up by 0..len(dk)-1.
dk = dk.reset_index(drop=True)

In [92]:
# Draw `tcounter` rows from dk uniformly at random (with replacement) and
# collect each row's values as a list in trueSet.
tcounter = 100000
dataSetSize = len(dk)
trueSet = []
with tqdm(total=tcounter) as pbar:
    # Compare by value. `is not 0` is an identity test against an int literal:
    # it relies on interpreter int caching and emits a SyntaxWarning on
    # Python 3.8+.
    while tcounter > 0:
        # Label-based lookup: expects dk to carry index labels 0..dataSetSize-1.
        tup = list(dk.loc[random.randrange(dataSetSize)])
        trueSet.append(tup)
        tcounter -= 1
        pbar.update(1)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [93]:
# Tabulate the sampled rows; each entry of trueSet is [Source, Sink, value].
dfTrueEdge = pd.DataFrame(trueSet, columns=['Source', 'Sink', 'value'])
dfTrueEdge.head()

Unnamed: 0,Source,Sink,value
0,567886,1380860,1
1,4149575,3398706,1
2,700385,374721,1
3,4579095,1184153,1
4,1892700,2991974,1


In [94]:
# Write the true-edge frame to CSV without the row index.
# NOTE(review): absolute, user-specific path — the target directory must exist
# on the machine running this cell.
dfTrueEdge.to_csv('/Users/k/Desktop/data/true1.csv', index = False)