# Community Finding - Algorithm Comparison
## Jake Carlson

## Load Data

In [1]:
import numpy as np
import pandas as pd
import igraph

# Load the three 2008 CSV extracts: questions, answers, and question tags.
questions = pd.read_csv("./data/2008-questions.csv")
answers = pd.read_csv("./data/2008-answers.csv")
tags = pd.read_csv("./data/2008-tags.csv")

# Keep only rows whose OwnerUserId is a finite number (this drops NaN owner ids).
# .copy() gives each filtered frame its own data, so the column assignments
# below do not raise pandas' chained-assignment (SettingWithCopy) warning.
questions = questions[np.isfinite(questions.OwnerUserId.values)].copy()
answers = answers[np.isfinite(answers.OwnerUserId.values)].copy()

# Owner ids are read as floats because of the missing values; convert them to
# integers. The old `np.int` alias was removed in NumPy 1.24, so an explicit
# 64-bit integer dtype is used instead.
questions["OwnerUserId"] = questions["OwnerUserId"].astype(np.int64)
answers["OwnerUserId"] = answers["OwnerUserId"].astype(np.int64)

In [2]:
# Preview the first five question rows (question Id, author OwnerUserId, date, score, title).
questions.head(n=5)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title
0,80,26,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...
1,90,58,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...
2,120,83,2008-08-01T15:50:08Z,21,ASP.NET Site Maps
3,180,2089740,2008-08-01T18:42:19Z,53,Function for creating color wheels
4,260,91,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...


In [3]:
# Preview the first five answer rows; ParentId holds the Id of the question being answered.
answers.head(n=5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score
0,92,61,2008-08-01T14:45:37Z,90,13
1,124,26,2008-08-01T16:09:47Z,80,12
2,199,50,2008-08-01T19:36:46Z,180,1
3,269,91,2008-08-01T23:49:57Z,260,4
4,307,49,2008-08-02T01:49:46Z,260,28


In [4]:
# Preview the first five tag rows: one (question Id, Tag) pair per row.
tags.head(n=5)

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [5]:
# Map each user id to the set of question ids the user took part in, either by
# asking the question (questions.Id) or by answering it (answers.ParentId).
users_to_questions = {}

# Group by the column name itself rather than a one-element list: with a list
# grouper, recent pandas versions yield 1-tuple keys such as (26,) instead of
# the scalar user id, which would silently change the dictionary keys.
for user_id, asked in questions.groupby('OwnerUserId'):
    users_to_questions[user_id] = set(asked.Id.values)

for user_id, answered in answers.groupby('OwnerUserId'):
    # setdefault covers users who only answered and never asked a question.
    users_to_questions.setdefault(user_id, set()).update(answered.ParentId.values)

In [26]:
# Fix an ordering for both vertex sets: every user id, then every distinct
# question id that at least one user asked or answered.
# NOTE: this rebinds `questions` from the DataFrame loaded earlier to a plain
# list of question ids; the cells below rely on the list form.
users = [*users_to_questions]
questions = list({
    question_id
    for touched in users_to_questions.values()
    for question_id in touched
})

In [28]:
# Translate every user's question ids into positions within the `questions` list.
# A one-off id -> position table replaces a list.index() scan per edge, which
# was quadratic overall. `questions` was built from a set, so its ids are
# unique and the table yields exactly the positions list.index() would return.
question_to_idx = {question_id: pos for pos, question_id in enumerate(questions)}

users_to_idx = {
    user_id: [question_to_idx[question_id] for question_id in question_ids]
    for user_id, question_ids in users_to_questions.items()
}

In [34]:
# Vertex layout: users take ids 0 .. len(users) - 1 and questions follow, so a
# question at list position j becomes vertex offset + j.
offset = len(users)

# One type flag per vertex: 1 marks a user, 0 marks a question.
types = [1] * offset + [0] * len(questions)

# One edge per (user, question) pair, linking the user vertex to the question vertex.
edges = [
    (user_pos, offset + question_pos)
    for user_pos, user_id in enumerate(users)
    for question_pos in users_to_idx[user_id]
]

In [35]:
# Build the user-question graph from the per-vertex type flags (1 = user,
# 0 = question) and the list of (user vertex, question vertex) edges.
graph = igraph.Graph.Bipartite(types, edges)

In [36]:
# Print igraph's text summary of the graph: vertex and edge counts, attributes, and the edge list.
print(graph)

IGRAPH U--T 14231 24983 --
+ attr: type (v)
+ edges:
0--14094 0--13003 0--12803 0--8604 0--10389 0--9621 0--10871 1--10884 1--9202
1--13203 1--11368 1--12136 1--10027 1--10765 1--9916 1--11725 1--9041 1--10322
1--11351 1--8849 2--9221 2--11346 2--9363 2--9051 2--9159 2--10185 2--8926
2--14198 2--9378 2--13071 2--11347 2--9506 2--10073 2--11208 2--11612 2--10223
2--8667 2--11744 2--14202 2--13851 3--10625 4--12710 5--10500 5--9445 5--8532
5--12460 5--8832 5--11910 5--10609 5--9065 5--10354 5--8447 5--14215 5--12685
6--13892 7--12203 7--12535 7--9601 7--9794 7--12990 7--8562 7--13037 7--11718
7--10947 7--13050 7--12791 7--13057 8--13434 8--11331 8--9751 8--8700 9--11381
9--13883 9--9054 9--10623 9--13109 9--9124 9--11317 9--8667 9--13406 9--10699
10--13475 10--11285 10--13234 10--11568 10--9965 11--10928 11--12907 11--9055
11--8835 11--11049 12--9306 12--10138 13--12652 13--9695 13--10241 13--11896
13--13023 13--13905 13--11835 13--11812 13--8957 14--8590 14--12788 14--8476
14--10677 14-