In [64]:
import praw
import csv
from pyprind import ProgPercent

In [2]:
class CommentNode(object):
    def __init__(self, comment):
        self.comment = comment

    def __repr__(self):
        return self.id

    def __hash__(self):
        return hash(self.id)

In [84]:
class CommentGatherer(object):
    def __init__(self):
        with open('login.csv') as f:
            reader = csv.reader(f, delimiter=':')
            login = {k.upper(): v.strip() for k, v in reader}
        self.r = praw.Reddit(user_agent='LDA comment collector by /u/{}'\
            .format(login['USERNAME']))
        self.r.login(login['USERNAME'], login['PASSWORD'])
        
    def gather_replies(self, comment, depth):
        replies = self.filter_comments(comment.replies)
        self.comments.extend(replies)
        self.transitions.extend((comment, r) for r in replies)
        if depth > 1:
            for r in replies:
                self.gather_replies(r, depth - 1)
    
    def filter_comments(self, comments):
        if len(comments) > 0:
            return comments[:-1]
        else:
            return comments
        
    def traverse(self, sub_name, nroots=100, max_depth=float('inf')):
        self.comments = []
        self.transitions = []
        stream = self.r.get_subreddit(sub_name).get_hot(limit=nroots)
        pbar = ProgPercent(nroots)
        for submission in stream:
            for comment in self.filter_comments(submission.comments):
                self.comments.append(comment)
                self.gather_replies(comment, max_depth)
                pbar.update()
        self.id2index = {id: index for index, id in enumerate(c.id for c in self.comments)}

In [85]:
cg = CommentGatherer()
cg.traverse('all', nroots=5)

[100 %] elapsed[sec]: 5.396 | ETA[sec]: 0.000 
Total time elapsed: 5.396 sec
  for arg in args[1:]:
  for arg in args[1:]:
  for arg in args[1:]:
  for arg in args[1:]:
  for arg in args[1:]:
  for arg in args[1:]:


In [86]:
len(cg.transitions)

409

In [45]:
import numpy as np
import lda_gibbs

In [133]:
x = np.array([1, 2, 3, 4])
y = np.array([5, 6, 7, 8])
M = x[np.newaxis].T*y; M

array([[ 5,  6,  7,  8],
       [10, 12, 14, 16],
       [15, 18, 21, 24],
       [20, 24, 28, 32]])

In [153]:
(M.T/M.sum(axis=0)).T

array([[ 0.1       ,  0.12      ,  0.14      ,  0.16      ],
       [ 0.16666667,  0.2       ,  0.23333333,  0.26666667],
       [ 0.21428571,  0.25714286,  0.3       ,  0.34285714],
       [ 0.25      ,  0.3       ,  0.35      ,  0.4       ]])

In [47]:
def tokenize(comment):
    return comment.lower().strip().split()

In [90]:
model = LDA([tokenize(c.body) for c in cg.comments])
theta, beta = model.train(ntopics=10, niter=200, seed=42)

In [91]:
theta.shape

(675, 10)

In [171]:
def transition_matrix(theta, id2index, transitions):
    ntopics = theta.shape[1]
    M = np.zeros((ntopics, ntopics))
    for x, y in transitions:
        pi_x = theta[id2index[x.id], :]
        pi_y = theta[id2index[y.id], :]
        M += pi_x[np.newaxis].T * pi_y
    for i in range(ntopics):
        M[i, :] /= M[i, :].sum()
    return M

In [176]:
M = transition_matrix(theta, cg.id2index, cg.transitions)

In [177]:
M

array([[ 0.15488975,  0.07956753,  0.10233533,  0.08027625,  0.10434367,
         0.07914406,  0.11590143,  0.09840701,  0.08636246,  0.09877251],
       [ 0.1072669 ,  0.11611744,  0.07841264,  0.05971687,  0.12470298,
         0.11042655,  0.11837047,  0.09982086,  0.11650131,  0.06866399],
       [ 0.14964303,  0.08756545,  0.07721038,  0.08939235,  0.10363121,
         0.09041514,  0.17440531,  0.08521565,  0.06642272,  0.07609874],
       [ 0.12798721,  0.08016937,  0.07921507,  0.11546337,  0.11301409,
         0.0819874 ,  0.07943038,  0.12176846,  0.10521848,  0.09574617],
       [ 0.12714166,  0.1081655 ,  0.09234769,  0.0729589 ,  0.14109567,
         0.10125124,  0.08929223,  0.09239422,  0.08047531,  0.09487758],
       [ 0.14275692,  0.08715716,  0.05984145,  0.07617228,  0.07625432,
         0.16992231,  0.0872106 ,  0.11396594,  0.12601106,  0.06070797],
       [ 0.12211422,  0.08951637,  0.09147158,  0.07141356,  0.09911231,
         0.11095336,  0.17075788,  0.08957727