In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pickle

# This uses the Answers.csv file from the 10% Stack Overflow data
answer_file = "data/Answers.csv"
tag_file = "data/Tags.csv"
# This edge list is the intermediate file used for graph building
edges_list_file = "output/tripartite_edges.txt"
# tag list contains tag names in a list. tag_id from edge_list_file is the index of the list
tag_list_file = 'output/tag_list.pkl'

# Question_ids and user_ids may overlap, but that does not mean questions are users!!!
# Diff Soln: append a header in ids. This is important as we will need to keep the original questino_header
# to associate with tags
question_header = 'q_'
answer_header = 'u_'
tag_header = 't_'

## Pre-processing

In [2]:
df = pd.read_csv(answer_file, encoding="ISO-8859-1")
print(df.shape)
print(df.dtypes)
df.head(5)


(2014516, 6)
Id                int64
OwnerUserId     float64
CreationDate     object
ParentId          int64
Score             int64
Body             object
dtype: object


Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26.0,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49.0,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."


In [3]:
# check that one question can have multiple answers
df.loc[df['ParentId'] == 90]


Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
10748,202317,20709.0,2008-10-14T18:41:45Z,90,2,"<p>You can also try <em><a href=""http://www.co..."
85572,1466832,16012.0,2009-09-23T15:40:46Z,90,19,<p>My easy click-by-click instructions (<stron...


In [4]:
edge_df = df[['OwnerUserId', 'ParentId']]
# 1. drop null values
edge_df = edge_df.dropna()
# 2. make parentIds unique
edge_df = edge_df.assign(question_id=lambda x: [question_header + str(i) for i in x.ParentId])
edge_df = edge_df.assign(answer_id=lambda x: [answer_header + str(int(i))for i in x.OwnerUserId])

# 2.a) just a small sanity check
tmp_a = [(i, j ) for i, j in zip(edge_df['ParentId'], edge_df['question_id']) if question_header + str(i) != j]
tmp_b = [(i, j ) for i, j in zip(edge_df['OwnerUserId'], edge_df['answer_id']) if answer_header + str(int(i)) != j]
assert len(tmp_a) == 0 and len(tmp_b) == 0

# 2.b) drop the old columns
edge_df = edge_df.drop(['ParentId'], axis=1)
edge_df = edge_df.drop(['OwnerUserId'], axis=1)
edge_df.head(30)

Unnamed: 0,question_id,answer_id
0,q_90,u_61
1,q_80,u_26
2,q_180,u_50
3,q_260,u_91
4,q_260,u_49
5,q_330,u_59
6,q_260,u_100
7,q_260,u_119
8,q_470,u_49
9,q_180,u_86


In [5]:
# Add tags
df_tags = pd.read_csv(tag_file, encoding="ISO-8859-1", dtype={'Tag': str})
print(df_tags.shape)
print(df_tags.dtypes)
df_tags.head(5)

(3750994, 2)
Id      int64
Tag    object
dtype: object


Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [6]:
# create id to the tags so we don't have to use the raw tags
tag_list = list(set(df_tags['Tag']))
name_to_tag = {curr_tag: index for index, curr_tag in enumerate(tag_list)}

# note: use answer id so append would work nicely in the next step
df_tags = df_tags.assign(answer_id=lambda x: [tag_header + str(name_to_tag[i]) for i in df_tags.Tag])
df_tags = df_tags.assign(question_id=lambda x: [question_header + str(i) for i in x.Id])

df_tags = df_tags.drop(['Id'], axis=1)
df_tags = df_tags.drop(['Tag'], axis=1)
df_tags.head(5)

Unnamed: 0,answer_id,question_id
0,t_9662,q_80
1,t_13492,q_80
2,t_14279,q_80
3,t_11367,q_90
4,t_27737,q_90


In [7]:
tripartite_edges = edge_df.append(df_tags, ignore_index=True)
# lastly add weights to edges
tripartite_edges['EdgeWeight'] = 1
tripartite_edges.head(5)

Unnamed: 0,answer_id,question_id,EdgeWeight
0,u_61,q_90,1
1,u_26,q_80,1
2,u_50,q_180,1
3,u_91,q_260,1
4,u_49,q_260,1


In [8]:
tripartite_edges.to_csv(edges_list_file, sep=' ', header=False, index=False)

with open(tag_list_file, 'wb') as handle:
    pickle.dump(tag_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

# uncomment below for reading in tag list
# with open(tag_list_file, 'rb') as handle:
#     tag_list  = pickle.load(handle)

In [9]:
tripartite_edges.tail(5)

Unnamed: 0,answer_id,question_id,EdgeWeight
5752305,t_33184,q_40143360,1
5752306,t_16221,q_40143360,1
5752307,t_4218,q_40143380,1
5752308,t_31377,q_40143380,1
5752309,t_4064,q_40143380,1


## recreate orignal random walk (i.e. no tags)

In [10]:
edge_df.tail(5)

Unnamed: 0,question_id,answer_id
2014511,q_40143190,u_333403
2014512,q_40137110,u_642706
2014513,q_40141860,u_2239781
2014514,q_40077010,u_6934347
2014515,q_40142910,u_4464432


In [11]:
edge_df['EdgeWeight'] = 1
edge_df.head(5)

Unnamed: 0,question_id,answer_id,EdgeWeight
0,q_90,u_61,1
1,q_80,u_26,1
2,q_180,u_50,1
3,q_260,u_91,1
4,q_260,u_49,1


In [13]:
edge_df.to_csv("output/original_edges.txt", sep=' ', header=False, index=False)