In [1]:
import pandas as pd
import numpy as np

In [2]:
from log_progress import log_progress

In [3]:
import matplotlib
%matplotlib inline

In [4]:
import seaborn as sns

# Test Public

In [5]:
# Load the tab-separated public test pairs and preview the first rows.
public_test_df = pd.read_csv('data/test-public.txt', delimiter='\t')
public_test_df.head()

Unnamed: 0,Id,Source,Sink
0,1,2184483,1300190
1,2,3151356,1452193
2,3,1579396,193159
3,4,1406432,2481036
4,5,2389638,593017


In [6]:
# Distinct source / sink node ids in the public test set, and their overlap.
source_set = set(public_test_df['Source'].values)
sink_set = set(public_test_df['Sink'].values)
len(sink_set), len(source_set), len(source_set & sink_set)

(1978, 2000, 30)

In [7]:
# Number of node ids that appear both as a test source and as a test sink.
len(source_set & sink_set)

30

In [8]:
# Every node id referenced by the public test set, as source or sink.
test_set = source_set | sink_set

# Train

In [9]:
# Parse the training adjacency list: each line holds a source id followed by
# the ids of its sinks.
node_children_counts = {}  # source id -> number of distinct sinks
node_children_sets = {}    # source id -> set of sink ids
items = []                 # flat list of (source id, sink id) edges
with open('data/train.txt', 'rt') as f:
    # Iterate the file object directly instead of a manual readline() loop.
    for line in f:
        # split() with no argument tolerates the trailing newline, trailing
        # tabs and blank lines, all of which made int() raise ValueError when
        # the line was split strictly on '\t'.
        numbers = line.split()
        if not numbers:
            continue
        source_id = int(numbers[0])
        children_nodes = {int(item) for item in numbers[1:]}
        node_children_counts[source_id] = len(children_nodes)
        node_children_sets[source_id] = children_nodes
        items.extend((source_id, sink_id) for sink_id in children_nodes)
len(items)

23946602

In [10]:
# Edge table plus the node id sets derived from it.
edges_df = pd.DataFrame(data=items, columns=['Source', 'Sink'])
sink_nodes = {sink_id for _, sink_id in items}
source_nodes = set(node_children_counts)
# Keep sink_nodes as the left operand: the resulting set's iteration order
# later determines the row order of the nodes dataframe.
all_nodes = sink_nodes | source_nodes
# The raw edge list is no longer needed once edges_df exists; free the memory.
del items

In [11]:
# Sizes of the full node set, the sink set and the source set, in that order.
tuple(map(len, (all_nodes, sink_nodes, source_nodes)))

(4867136, 4867136, 20000)

## Nodes Dataframe

In [12]:
# One row per node; the node id serves both as the index and as a column.
node_ids = list(all_nodes)
nodes_df = pd.DataFrame({'node_id': node_ids}, index=node_ids)

In [13]:
# For quick experiments, the line below (disabled) truncates the frame to its first 20,000 rows:
#nodes_df = nodes_df.iloc[:int(2e4)].copy()
# Display (rows, columns) of the nodes dataframe.
nodes_df.shape

(4867136, 1)

In [14]:
# Flag nodes that have outgoing edges (appear as a source in the train file).
# Both flags come from one vectorised membership test instead of a Python-level
# map/lambda over every row, so they are complementary by construction.
is_source = nodes_df.node_id.isin(source_nodes)
nodes_df['is_parent_node'] = is_source.astype(int)
nodes_df['is_leaf_node'] = (~is_source).astype(int)

In [15]:
# In-degree: number of train edges pointing at each sink, joined on the node id index.
parent_counts = edges_df.groupby('Sink')['Source'].count().rename('parents_count')
nodes_df = nodes_df.join(parent_counts)

In [16]:
# Out-degree: number of train edges leaving each source node.
children_count = edges_df['Source'].value_counts().rename('children_count')
nodes_df = nodes_df.join(children_count)
# Nodes that never appear as a source get NaN from the join; treat them as 0.
nodes_df['children_count'] = nodes_df['children_count'].fillna(0).astype(int)

In [17]:
# Shape of the subset of nodes that have both incoming and outgoing edges.
nodes_df.query('parents_count > 0 and children_count > 0').shape

(19570, 5)

In [18]:
def node_children_links_count(node, children_sets=None):
    """Count redundant links among the grandchildren of ``node``.

    Sums the number of children of every child of ``node`` and subtracts the
    number of distinct grandchildren, i.e. returns how many child->grandchild
    edges point at a grandchild already reached through another child.

    Parameters
    ----------
    node : hashable
        Node id to evaluate.
    children_sets : dict, optional
        Mapping ``node id -> set of child ids``. Defaults to the notebook-level
        ``node_children_sets`` so existing ``Series.apply`` calls keep working.

    Returns
    -------
    int
        0 when ``node`` has no entry in the mapping, otherwise
        ``total grandchild edges - distinct grandchildren``.
    """
    if children_sets is None:
        children_sets = node_children_sets
    children = children_sets.get(node)
    if children is None:
        return 0
    edges_count = 0
    all_children = set()
    for child in children:
        grandchildren = children_sets.get(child)
        if grandchildren is None:
            # The child has no entry in the mapping, so it contributes nothing.
            continue
        edges_count += len(grandchildren)
        all_children.update(grandchildren)
    return edges_count - len(all_children)

In [19]:
%%time
children_links_counts = nodes_df.node_id.apply(node_children_links_count)

CPU times: user 46min 31s, sys: 2min 28s, total: 48min 59s
Wall time: 49min 27s


In [20]:
# Name the feature series, then attach it to the nodes dataframe by index.
children_links_counts = children_links_counts.rename('children_links_counts')
nodes_df = nodes_df.join(children_links_counts)

### Node parents

In [21]:
# Map every sink id to the set of source ids that point at it.
sources_by_sink = edges_df.groupby('Sink')['Source']
sink_parents = sources_by_sink.agg(lambda sources: set(sources.values)).to_dict()
len(sink_parents)

4867136

In [25]:
def node_parents_children_links_count(node_id, parents_map=None, children_map=None):
    """Count redundant links among the children of the parents of ``node_id``.

    Sums the number of children of every parent of ``node_id`` and subtracts
    the number of distinct children reached that way.

    Parameters
    ----------
    node_id : hashable
        Node id to evaluate.
    parents_map : dict, optional
        Mapping ``node id -> set of parent ids``. Defaults to the
        notebook-level ``sink_parents``.
    children_map : dict, optional
        Mapping ``node id -> set of child ids``. Defaults to the
        notebook-level ``node_children_sets``.

    Returns
    -------
    int
        0 when ``node_id`` has no entry in ``parents_map`` (a direct
        ``parents_map[node_id]`` lookup would raise ``KeyError`` there),
        otherwise ``total parent->child edges - distinct children``.
    """
    if parents_map is None:
        parents_map = sink_parents
    if children_map is None:
        children_map = node_children_sets
    # .get with an empty default mirrors node_children_links_count, which
    # returns 0 for unknown nodes instead of failing.
    parents = parents_map.get(node_id, ())
    edges_count = 0
    all_children = set()
    for parent in parents:
        siblings = children_map.get(parent)
        if siblings is None:
            continue
        edges_count += len(siblings)
        all_children.update(siblings)
    return edges_count - len(all_children)

In [26]:
%%time
parents_links_counts = nodes_df.node_id.apply(node_parents_children_links_count)

CPU times: user 50min 6s, sys: 2min 24s, total: 52min 30s
Wall time: 53min 22s


In [27]:
# Name the feature series, then attach it to the nodes dataframe by index.
parents_links_counts = parents_links_counts.rename('parents_links_counts')
nodes_df = nodes_df.join(parents_links_counts)

In [28]:
# Preview nodes whose parents share at least one redundant child link.
nodes_df.loc[nodes_df['parents_links_counts'] > 0].head()

Unnamed: 0,node_id,is_parent_node,is_leaf_node,parents_count,children_count,children_links_counts,parents_links_counts
1097,1097,1,0,21,176,25197,34232
1122,1122,1,0,109,1012,91833,50033
1152,1152,1,0,94,2777,538818,877504
1170,1170,1,0,138,452,70953,360778
1174,1174,1,0,187,542,1066453,1097483


In [29]:
# Persist the node features. The index is not written: nodes_df was built with
# the node id as both index and `node_id` column, so the id is still saved.
nodes_df.to_csv('data/node_features.csv', index=False)

In [None]:
#edges_df.to_csv('data/edges.csv', index=False)

In [None]:
# Spot-check: all train edges whose sink is node 1122.
edges_df.query('Sink == 1122')

In [None]:
# Keep only the train edges that touch at least one node of the public test set.
# The cell used to read from and overwrite `data`, which is not defined anywhere
# above (NameError on a fresh kernel) and made the cell non-idempotent.
# NOTE(review): edges_df is assumed to be the intended input, since it is the
# only frame in this notebook with Source/Sink columns. Confirm.
mask = edges_df.Source.isin(test_set) | edges_df.Sink.isin(test_set)
# .copy() so later column assignments do not write into a view of edges_df.
data = edges_df[mask].copy()
mask.sum()

In [None]:
# Preview the first rows of `data`.
data.head()

In [None]:
# Mark these rows as training edges and give them ids starting at 2001, so the
# rows with Id <= 2000 can later be selected as the prediction rows.
# assign() builds a new frame; item assignment on `data` (a boolean-mask slice)
# triggers SettingWithCopyWarning and may write into a temporary copy.
data = data.assign(
    is_train_set=1,
    Id=range(2001, 2001 + data.shape[0], 1),
)

In [None]:
# Mark the public test pairs as not belonging to the training edges.
public_test_df = public_test_df.assign(is_train_set=0)

In [None]:
# Stack the test pairs on top of the selected train edges.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own row labels, so the combined index can contain
# duplicates and label-based row selection becomes ambiguous.
df = pd.concat([public_test_df, data], axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
def child_count(x):
    """Return the number of children recorded for node ``x`` (0 if none).

    Reads ``node_children_counts``, the dict filled while parsing the train
    file. The previous version referenced ``node_child_counts``, a name that
    is never defined in this notebook.
    """
    return node_children_counts.get(x, 0)

# Out-degree of each endpoint, capped at 1000.
df['source_childs_count'] = df.Source.apply(child_count).clip(upper=1000)
df['sink_childs_count'] = df.Sink.apply(child_count).clip(upper=1000)

In [None]:
def intersect(vector, children_sets=None):
    """Return how many children the Source and Sink of a row have in common.

    Parameters
    ----------
    vector : mapping
        Row-like object indexable by ``'Source'`` and ``'Sink'``.
    children_sets : dict, optional
        Mapping ``node id -> set of child ids``. Defaults to the notebook-level
        ``node_children_sets``; the previous version referenced the undefined
        name ``node_child_sets``.

    Returns
    -------
    int
        Size of the intersection of the two child sets, or 0 when either
        endpoint has no entry in the mapping. Only the Sink used to be
        checked, so a Source without an entry raised ``KeyError``.
    """
    if children_sets is None:
        children_sets = node_children_sets
    source_children = children_sets.get(vector['Source'])
    sink_children = children_sets.get(vector['Sink'])
    if source_children is None or sink_children is None:
        return 0
    return len(source_children.intersection(sink_children))
# Apply `intersect` to every row (axis=1) and store the result as a new feature column.
df['source_sink_child_intersect_count'] = df.apply(intersect, axis=1)

In [None]:
# Rows whose source node has at least one recorded child.
df.loc[df['source_childs_count'] > 0]

In [None]:
# Pairwise feature plot for the first 4000 rows, coloured by train/test flag.
# `.ix` was removed from pandas; `.iloc` selects rows and columns by position
# (columns from position 3 onward, as before).
# NOTE(review): recent seaborn releases renamed pairplot's `size` argument to
# `height`. Switch it if the installed seaborn rejects `size`.
sns.pairplot(df.iloc[:4000, 3:], hue='is_train_set', size=4)

In [None]:
# Most frequent source ids in the combined frame.
source_frequencies = df['Source'].value_counts()
source_frequencies.head()

Intersection of the Kaggle test-set source nodes with the train-data source node set

In [None]:
# Number of test source ids that also occur as a source in `data`.
# The column is named 'Source' (capitalised) everywhere else in this notebook;
# the lowercase attribute `data.source` does not exist on such a frame.
len(source_set.intersection(set(data['Source'].values)))

In [None]:
# Number of test sink ids that also occur as a sink in `data`.
# The column is named 'Sink' (capitalised) everywhere else in this notebook;
# the lowercase attribute `data.sink` does not exist on such a frame.
len(sink_set.intersection(set(data['Sink'].values)))

In [None]:
# Preview the first rows of the public test pairs.
public_test_df.head()

In [None]:
# Source column of `data`, kept under its own name. Binding it to `df` would
# overwrite the combined frame that the prediction cell at the end still needs.
# The column is 'Source' (capitalised), as in the rest of the notebook.
source_ids = data['Source']

In [None]:
# Display `df` with missing values dropped; the result is not assigned, so `df` itself is unchanged.
df.dropna()

In [None]:
# Number of node ids that occur in `data` both as a source and as a sink.
# Uses the capitalised 'Source'/'Sink' column names found elsewhere in this
# notebook; the lowercase attributes do not exist on such a frame.
len(set(data['Source'].values).intersection(set(data['Sink'].values)))

In [None]:
# Number of sinks per source in `data`.
# Grouping by the lowercase 'source' raises KeyError on a frame whose columns
# are 'Source'/'Sink', as they are elsewhere in this notebook.
data.groupby('Source')['Sink'].count()

# Predictions

In [None]:
# Baseline submission: load the sample file and set every Prediction to 0.
predictions = pd.read_csv('data/sample.csv').assign(Prediction=0)

In [None]:
# Write the all-zero baseline without the index column.
# NOTE: the final cell of this notebook writes to the same 'prediction.csv' path.
predictions.to_csv('prediction.csv', index=False)

In [None]:
# Threshold rule: predict 1 when the source node has more than 200 children.
has_many_children = df['source_childs_count'] > 200
df['prediction'] = has_many_children.astype(int)
# Rows with Id <= 2000 are written out as the submission.
# NOTE(review): the column written here is lowercase 'prediction', while the
# baseline cell above sets 'Prediction'. Confirm which header is expected.
df[df.Id <= 2000][['Id', 'prediction']].to_csv('prediction.csv', index=False)