In [60]:
from nltk.corpus import stopwords as stp_wrd
from tqdm import tqdm

In [9]:
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime
import pandas as pd
import collections
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.special import gammaln
import math
from math import sqrt
from scipy.stats import norm
from scipy.stats import invgamma
import time as tm

In [19]:
# FIXME(review): this cell originally began with a dangling `stops2 = ` (no
# right-hand side), which is a SyntaxError and prevented the whole cell from
# running. The intended value is not recoverable from the notebook, so the
# assignment is disabled until it can be restored.
# stops2 = ...

# Load Data
mainData = pd.read_csv('mainData.csv')
all_videos = pd.read_csv('all_videos.csv')
# Load the transcripts for each DF
Transcripts_m = mainData.transcript
Transcripts_a = all_videos.transcript

In [54]:
# Read the stop-word list: the file is split on any whitespace, so `stops`
# is a list with one entry per whitespace-separated token in stop_words.txt.
with open('stop_words.txt', 'r') as stopfile:
    stops = stopfile.read().split()

In [47]:
# NOTE(review): `Transcripts` is not defined in any cell visible in this
# notebook (only `Transcripts_m` and `Transcripts_a` are) -- presumably it is
# one of those two series; confirm, otherwise this cell fails on a fresh kernel.
Output = list(Transcripts)

In [52]:
# Persist the stop-word list, one word per line. The `with` block guarantees
# the handle is flushed and closed; the original left the file open, so the
# data was only written out whenever the handle happened to be collected.
with open('stop_words.txt', 'w') as stoppers:
    stoppers.write("\n".join(stops))

3807

In [53]:
# Write the transcripts to disk, one per line (this is the corpus format
# LDACGS.buildCorpus reads). The `with` block guarantees the handle is flushed
# and closed; the original never closed it.
with open('Transcripts.txt', 'w') as stoppers:
    stoppers.write("\n".join(Output))

25909750

## LDA with Gibbs Sampling

In [63]:
class LDACGS:
    """Latent Dirichlet Allocation fitted with a collapsed Gibbs sampler.

    Typical use::

        lda = LDACGS(n_topics=20)
        lda.sample('Transcripts.txt', stopwords_file='stop_words.txt')
        lda.topterms()   # most probable words per topic
        lda.toplines()   # most representative documents per topic

    Attributes set along the way:
        vocab      sorted list of vocabulary terms (buildCorpus)
        documents  list of dicts {token position: vocab index} (buildCorpus)
        nmz        (n_docs, n_topics) document/topic counts (initialize)
        nzw        (n_topics, n_words) topic/term counts (initialize)
        nz         (n_topics,) total tokens per topic (initialize)
        topics     dict {(doc, position): topic} (initialize)
        total_nmz, total_nzw, logprobs  accumulated by sample()
    """

    def __init__(self, n_topics, alpha=0.1, beta=0.1):
        """Store the model hyper-parameters.

        Args:
            n_topics: number of latent topics.
            alpha: symmetric Dirichlet prior on the per-document topic mixture.
            beta: symmetric Dirichlet prior on the per-topic word distribution.
        """
        self.n_topics = n_topics
        self.alpha = alpha
        self.beta = beta

    def buildCorpus(self, filename, stopwords_file=None):
        """Read a corpus (one document per line) and build the vocabulary.

        Each line is right-stripped, lower-cased and split on single spaces.
        Sets ``self.vocab`` and ``self.documents``.

        Args:
            filename: path of the corpus file.
            stopwords_file: optional path of a whitespace-separated list of
                words to exclude from the vocabulary.
        """
        with open(filename, 'r') as infile:
            doclines = [line.rstrip().lower().split(' ') for line in infile]

        # Blank lines and runs of spaces produce empty-string tokens; those
        # are not words and must not become vocabulary terms.
        vocab = {token for doc in doclines for token in doc if token}
        if stopwords_file:
            with open(stopwords_file, 'r') as stopfile:
                vocab -= set(stopfile.read().split())

        # Always sort so term ids are reproducible between runs (set iteration
        # order is not), whether or not stop words were supplied.
        self.vocab = sorted(vocab)

        # Dictionary lookup instead of list.index()/`in list`, which cost
        # O(len(vocab)) per token.
        word_to_id = {word: idx for idx, word in enumerate(self.vocab)}

        self.documents = []
        for tokens in tqdm(doclines):
            # Keys are the original token positions, so positions of removed
            # tokens (stop words, empty strings) are simply absent.
            self.documents.append({
                pos: word_to_id[token]
                for pos, token in enumerate(tokens)
                if token in word_to_id
            })

    def initialize(self):
        """Create the count matrices and give every token a random topic."""
        self.n_words = len(self.vocab)
        self.n_docs = len(self.documents)

        # The (i,j) entry of self.nmz is the number of words in document i assigned to topic j.
        self.nmz = np.zeros((self.n_docs, self.n_topics))
        # The (i,j) entry of self.nzw is the number of times term j is assigned to topic i.
        self.nzw = np.zeros((self.n_topics, self.n_words))
        # The (i)-th entry is the number of times topic i is assigned in the corpus.
        self.nz = np.zeros(self.n_topics)
        # Topic assignment dictionary: key-value pairs of form (m,i):z
        self.topics = {}

        for m in range(self.n_docs):
            for i, w in self.documents[m].items():
                # Draw a scalar topic (not a 1-element array) so the stored
                # assignments are plain ints.
                z = int(np.random.randint(0, self.n_topics))
                self.nmz[m, z] += 1
                self.nzw[z, w] += 1
                self.nz[z] += 1
                self.topics[(m, i)] = z

    def sample(self, filename, burnin=100, sample_rate=10, n_samples=10, stopwords_file='stop_words.txt'):
        """Build the corpus and run the Gibbs sampler.

        Runs ``burnin`` sweeps followed by ``sample_rate * n_samples`` sweeps.
        After burn-in, the counts of every ``sample_rate``-th sweep (starting
        with the first) are added to ``self.total_nzw`` / ``self.total_nmz``.
        The log-likelihood after every sweep is stored in ``self.logprobs``.

        Args:
            filename: corpus file, one document per line.
            burnin: number of initial sweeps whose counts are discarded.
            sample_rate: number of sweeps between retained samples.
            n_samples: number of retained samples.
            stopwords_file: stop-word file forwarded to buildCorpus.
        """
        self.buildCorpus(filename, stopwords_file)
        self.initialize()
        self.total_nzw = np.zeros((self.n_topics, self.n_words))
        self.total_nmz = np.zeros((self.n_docs, self.n_topics))

        total_sweeps = burnin + sample_rate * n_samples
        self.logprobs = np.zeros(total_sweeps)
        for step in tqdm(range(total_sweeps)):
            self._sweep()
            self.logprobs[step] = self._loglikelihood()
            # Sweeps past burn-in are numbered from 0; keep every
            # sample_rate-th one.
            kept = step - burnin
            if kept >= 0 and kept % sample_rate == 0:
                self.total_nzw += self.nzw
                self.total_nmz += self.nmz

    def phi(self):
        """Compute the topic/word distributions into ``self._phi``.

        Each row of the (n_topics, n_words) result sums to one. The array is
        also returned for convenience.
        """
        phi = self.total_nzw + self.beta
        self._phi = phi / np.sum(phi, axis=1)[:, np.newaxis]
        return self._phi

    def theta(self):
        """Compute the document/topic distributions into ``self._theta``.

        Each row of the (n_docs, n_topics) result sums to one. The array is
        also returned for convenience.
        """
        theta = self.total_nmz + self.alpha
        self._theta = theta / np.sum(theta, axis=1)[:, np.newaxis]
        return self._theta

    def topterms(self, n_terms=10):
        """Return the most probable terms of every topic.

        Args:
            n_terms: number of terms per topic; capped at the vocabulary size.

        Returns:
            A list with one list of terms per topic, most probable first.
        """
        self.phi()
        self.theta()
        # Asking for more terms than exist used to index out of range.
        n_terms = min(n_terms, self.n_words)
        topics = []
        for k in range(self.n_topics):
            ranked = np.argsort(self._phi[k, :])[::-1]
            topics.append([self.vocab[int(idx)] for idx in ranked[:n_terms]])
        return topics

    def toplines(self, n_lines=5):
        """Return the documents most strongly associated with every topic.

        Args:
            n_lines: number of documents per topic; capped at the corpus size.

        Returns:
            Float array of shape (n_topics, n_lines) holding 1-based document
            (line) numbers, strongest first.
        """
        # Recompute theta here so the method works without a prior call to
        # topterms().
        self.theta()
        n_lines = min(n_lines, self.n_docs)
        lines = np.zeros((self.n_topics, n_lines))
        for k in range(self.n_topics):
            ranked = np.argsort(self._theta[:, k])[::-1]
            lines[k, :] = ranked[:n_lines] + 1
        return lines

    def _removeStopwords(self, stopwords):
        """Return the vocabulary terms that are not in ``stopwords``."""
        return [x for x in self.vocab if x not in stopwords]

    def _conditional(self, m, w):
        """Return the normalised topic distribution for term ``w`` in document ``m``.

        Computed from the current counts, which the caller is expected to
        have decremented for the token being resampled.
        """
        dist = (self.nmz[m, :] + self.alpha) * (self.nzw[:, w] + self.beta) / (self.nz + self.beta * self.n_words)
        return dist / np.sum(dist)

    def _sweep(self):
        """Resample the topic of every token in the corpus once."""
        for m in range(self.n_docs):
            for i, w in self.documents[m].items():
                # Take the token out of the counts under its current topic.
                z = self.topics[(m, i)]
                self.nmz[m, z] -= 1
                self.nzw[z, w] -= 1
                self.nz[z] -= 1
                # Draw a new topic from the conditional given all other tokens.
                probs = self._conditional(m, w)
                z_new = int(np.argmax(np.random.multinomial(1, probs)))
                # Put the token back under its new topic.
                self.nmz[m, z_new] += 1
                self.nzw[z_new, w] += 1
                self.nz[z_new] += 1
                self.topics[(m, i)] = z_new

    def _loglikelihood(self):
        """Return the log joint likelihood of the words and topic assignments.

        This is the sum, over topics, of the log Dirichlet-multinomial term
        built from ``nzw`` and ``beta``, plus the sum, over documents, of the
        corresponding term built from ``nmz`` and ``alpha``.
        """
        smoothed_nzw = self.nzw + self.beta
        lik = np.sum(gammaln(smoothed_nzw)) - np.sum(gammaln(np.sum(smoothed_nzw, axis=1)))
        # Normalising constant of the Dirichlet prior, once per topic.
        lik -= self.n_topics * (self.n_words * gammaln(self.beta) - gammaln(self.n_words * self.beta))

        smoothed_nmz = self.nmz + self.alpha
        lik += np.sum(gammaln(smoothed_nmz)) - np.sum(gammaln(np.sum(smoothed_nmz, axis=1)))
        # Normalising constant of the Dirichlet prior, once per document.
        lik -= self.n_docs * (self.n_topics * gammaln(self.alpha) - gammaln(self.n_topics * self.alpha))
        return lik