In [1]:
import codecs
import io

from gensim import corpora
from gensim import models
from gensim import utils

In [2]:
class AmazonCorpus(corpora.TextCorpus):
    """Streamed corpus over a UTF-8 text file that holds one document per line.

    Every line of `question_file` is one document. It is turned into tokens by
    the caller-supplied `tokenizer`. The token dictionary is built while the
    corpus is constructed and is then pruned of very rare and very common
    tokens.
    """

    def __init__(self, question_file, tokenizer, no_below=3, no_above=0.2):
        """Build the corpus and its pruned dictionary.

        Args:
            question_file: Path of a UTF-8 encoded text file, one document per
                line.
            tokenizer: Callable that takes the text of one document (possibly
                several sentences) and returns an iterable of token strings.
                A token is the unit the topic model is trained on: a single
                word, a word stem or a word phrase.
            no_below: Passed to `Dictionary.filter_extremes`; tokens that occur
                in fewer than this many documents are dropped.
            no_above: Passed to `Dictionary.filter_extremes`; tokens that occur
                in more than this fraction of all documents are dropped.
        """
        self.question_file = question_file
        self.tokenizer = tokenizer

        # Both attributes must be set before the base initializer runs: it
        # builds `self.dictionary` from everything `get_texts` yields, and
        # `get_texts` reads them. `input=True` is not a real input source, it
        # only has to be non-None so the dictionary is built right away; the
        # file that is actually read is `self.question_file`.
        # NOTE(review): newer gensim releases let TextCorpus.__init__ take its
        # own `tokenizer` argument and may reset `self.tokenizer` - confirm
        # against the installed gensim version before upgrading.
        super(AmazonCorpus, self).__init__(input=True)

        # Prune the vocabulary from both ends: rare tokens carry too little
        # signal, and tokens present in a large share of the documents behave
        # like stop words ('the', 'is', ...).
        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    def get_texts(self):
        """Yield the token list of each document, in file order.

        The file is re-read on every call, so the corpus can be iterated any
        number of times without holding all documents in memory.

        Example:
            For a file with the two lines `Hello world` and `Nice code` and
            `tokenizer=str.split`, this yields `["Hello", "world"]` and then
            `["Nice", "code"]`.
        """
        # io.open in text mode ends a line only at '\n', '\r' or '\r\n'.
        # Iterating a codecs.open stream also breaks lines at characters such
        # as form feed, U+0085 or U+2028, which would split one document into
        # several whenever the text contains them.
        with io.open(self.question_file, 'r', encoding='utf-8') as questions:
            for question in questions:
                yield list(self.tokenizer(question))

In [3]:
# Use gensim's stock `simple_preprocess` as the tokenizer. `AmazonCorpus` calls it once per line of
# the input file, so any other callable that maps a text to a sequence of tokens can be swapped in here.
tokenizer = utils.simple_preprocess

In [4]:
# Build the corpus; the path is relative to the notebook's working directory. Constructing the object
# already reads the file to build and prune `corpus.dictionary`, which the model cell uses as `id2word`.
corpus = AmazonCorpus('data/headphone_sents.txt', tokenizer)

In [16]:
# Train the LDA topic model on the corpus: 10 topics, 3 passes over the data, chunks of 100 documents,
# 3 worker processes. `iterations` is an upper bound for inference (see the gensim LdaMulticore docs).
# An earlier, heavier configuration was tried before this one: iterations=3000, chunksize=5000,
# num_topics=100, eval_every=3, workers=5.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, and its execution count (16) is
# higher than that of the topic-printing cell (15), so the topics shown below do not come from this run.
# NOTE(review): no random seed is passed, so topics will differ between runs - confirm whether the
# installed gensim version supports `random_state` and set it if reproducibility matters.
model = models.LdaMulticore(corpus=corpus, iterations=5000, chunksize=100, passes=3, num_topics=10, id2word=corpus.dictionary, workers=3)

KeyboardInterrupt: 

In [15]:
# Display up to 20 topics, each rendered as a "weight*token + weight*token + ..." string.
# NOTE(review): this cell's execution count (15) precedes the training cell's (16), so the output
# recorded below was produced by whichever model was in memory before that last (interrupted) run.
model.print_topics(20)

['0.132*hearing + 0.076*anyone + 0.074*who + 0.068*computer + 0.050*affordable + 0.046*recommend + 0.029*listen + 0.028*ideal + 0.027*short + 0.024*podcasts',
 '0.175*we + 0.066*our + 0.027*movies + 0.025*movie + 0.022*watch + 0.020*system + 0.018*late + 0.017*watching + 0.015*colored + 0.014*dvd',
 '0.037*nc + 0.028*thank + 0.023*users + 0.019*situation + 0.017*prime + 0.016*silicon + 0.016*replaceable + 0.016*orange + 0.013*popped + 0.012*passive',
 '0.112*sweat + 0.044*runs + 0.020*rubbing + 0.019*per + 0.014*surroundings + 0.013*bang + 0.013*gonna + 0.013*somewhere + 0.013*thrilled + 0.012*advise',
 '0.050*case + 0.049*inexpensive + 0.047*stop + 0.041*gets + 0.040*job + 0.031*pulling + 0.030*excellent + 0.028*especially + 0.027*does + 0.021*running',
 '0.067*item + 0.062*company + 0.048*amazon + 0.029*by + 0.026*email + 0.026*order + 0.021*sent + 0.020*whenever + 0.018*review + 0.018*made',
 '0.071*tangled + 0.048*seal + 0.045*falling + 0.038*keep + 0.034*stay + 0.031*throw + 0.027