In [4]:
import pandas as pd
import numpy as np

import networkx as nx

import os
# Environment variables naming the Python interpreters for PySpark.
# NOTE(review): both paths are absolute and machine-specific.
os.environ.update({
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/home/arthur/.local/bin/ipython3",
})

# Input: the Answers.csv file from the 10% Stack Overflow data.
answer_file = "data/Answers.csv"
# Intermediate edge list, written during pre-processing and read back for graph building.
edges_list_file = "processed_data/answer_edges.txt"

## Pre-processing

In [5]:
# Load the answers CSV with pandas. This holds the whole file in memory, but
# parsing it with PySpark would be much more work.
# The path comes from the `answer_file` constant so the input location is
# defined in exactly one place.
df = pd.read_csv(answer_file, encoding="ISO-8859-1")
df.head(5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26.0,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49.0,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."


In [6]:
# (rows, columns) of the loaded answers frame.
df.shape

(2014516, 6)

In [14]:
# Every answer row whose ParentId is 90 (the ParentId of the first row in the preview above).
df.loc[df['ParentId'] == 90]


Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
10748,202317,20709.0,2008-10-14T18:41:45Z,90,2,"<p>You can also try <em><a href=""http://www.co..."
85572,1466832,16012.0,2009-09-23T15:40:46Z,90,19,<p>My easy click-by-click instructions (<stron...


In [7]:
# Question ids (ParentId) and user ids (OwnerUserId) may share numeric values,
# but that does not mean a question is a user.
# Solution: shift every question id by the largest user id.
# Selecting with a list (`[['OwnerUserId']]`) yields a one-column frame, so
# .max() returns a one-element Series rather than a scalar; the edge-building
# cell unwraps it with max(max_user_id).
max_user_id = df[['OwnerUserId']].max()
max_user_id

OwnerUserId    7045028.0
dtype: float64

In [23]:
# Build the (user, question, weight) edge table in one pass:
#   1. keep only the two id columns and drop rows with a missing value
#   2. shift ParentId by the largest user id so question ids cannot collide
#      with user ids, then discard the original ParentId column
#   3. give every edge a weight of 1
#   4. cast the whole dataframe to int32
edge_df = (
    df[['OwnerUserId', 'ParentId']]
    .dropna()
    .assign(newParentId=lambda frame: frame.ParentId + max(max_user_id))
    .drop(columns=['ParentId'])
    .assign(EdgeWeight=1)
    .astype('int32')
)
edge_df.head(30)

Unnamed: 0,OwnerUserId,newParentId,EdgeWeight
0,61,7045118,1
1,26,7045108,1
2,50,7045208,1
3,91,7045288,1
4,49,7045288,1
5,59,7045358,1
6,100,7045288,1
7,119,7045288,1
8,49,7045498,1
9,86,7045208,1


In [24]:
# Sanity check: 7045118 = 90 + 7045028, i.e. ParentId 90 shifted by the maximum OwnerUserId.
edge_df.loc[edge_df['newParentId'] == 7045118]

Unnamed: 0,OwnerUserId,newParentId,EdgeWeight
0,61,7045118,1
10748,20709,7045118,1
85572,16012,7045118,1


In [25]:
# Persist the edge table as space-separated rows with no header and no index.
# The destination is the `edges_list_file` constant (not a repeated literal),
# so this writer and the graph-building reader always use the same file.
_edges_dir = os.path.dirname(edges_list_file)
if _edges_dir:
    # to_csv does not create missing parent directories, so create it first.
    os.makedirs(_edges_dir, exist_ok=True)
edge_df.to_csv(edges_list_file, sep=' ', header=False, index=False)


## Build Graph

In [26]:
# by default, nx creates undirected edges, exactly what we want
# Each line of the edge list is parsed as two int node ids followed by a float
# stored under the 'weight' edge attribute.
# NOTE(review): nx.info() is, I believe, deprecated and removed in NetworkX 3.0 —
# confirm the installed version before re-running.
G = nx.read_edgelist(edges_list_file, nodetype=int, data=(('weight',float),))
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 1568735
Number of edges: 1993272
Average degree:   2.5412


In [27]:
# Re-read the edge list and collect the distinct ids found in the first
# (user) and second (question) column of every row.
all_user_ids = set()
all_question_ids = set()
with open(edges_list_file, 'r') as read_file:
    # Iterate over the file object directly so rows are streamed one at a
    # time instead of loading every line into a list first.
    for line in read_file:
        # The third field (edge weight) is not needed for this check.
        user_id, question_id, weight = line.strip().split(' ')
        all_user_ids.add(int(user_id))
        all_question_ids.add(int(question_id))
print(list(all_user_ids)[:10])
print(list(all_question_ids)[:10])
# should be no intersection between user_ids and question_ids
print(len(all_user_ids.intersection(all_question_ids)))


[1, 3, 4, 5, 1048579, 2097159, 5242883, 9, 3145739, 1048588]
[18874368, 39845888, 37748738, 25165828, 35651588, 12582918, 46137348, 41943048, 8388618, 14680078]
0


In [13]:
# General data analysis: connected-component sizes, largest first.
islands = sorted(
    (len(component) for component in nx.connected_components(G)),
    reverse=True,
)
print("connected components", islands[:10])

connected components [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [14]:
# Analyze how connected the graph is, using NetworkX's approximation module.
# The recorded result is 0 (oh well) — presumably because the graph is not
# connected; verify against the connected-components cell.
# NOTE(review): this import sits mid-notebook; consider moving it to the top import cell.
from networkx.algorithms import approximation as approx
approx.node_connectivity(G)

0

## Test Performance

In [15]:
import random
# parameters
n_test_edge = 1000
n_steps = 1000

all_user_ids = set()

def test_algorithm(nx_g, all_edges):
    for i in tqdm(range(n_test_edge)):
        x, y = random.choice(all_edges)
        for s in range(n_steps):



In [19]:
# Materialize the graph's edges as a list of (node, node) tuples and preview the first ten.
all_edges = list(G.edges())
all_edges[:10]

[(61, 90),
 (61, 24270),
 (61, 47980),
 (61, 51390),
 (61, 142340),
 (61, 526660),
 (61, 1581560),
 (61, 2520220),
 (61, 6242540),
 (61, 6553950)]

In [20]:
# Adjacency view of node 61: neighbor id -> edge attribute dict.
G[61]

AtlasView({90: {'weight': 1.0}, 24270: {'weight': 1.0}, 47980: {'weight': 1.0}, 51390: {'weight': 1.0}, 142340: {'weight': 1.0}, 526660: {'weight': 1.0}, 1581560: {'weight': 1.0}, 2520220: {'weight': 1.0}, 6242540: {'weight': 1.0}, 6553950: {'weight': 1.0}})

In [21]:
# Neighbor ids of node 61 as a plain list.
list(G[61])


[90, 24270, 47980, 51390, 142340, 526660, 1581560, 2520220, 6242540, 6553950]