In [1]:
from gensim import utils
from gensim import corpora
from gensim import models
import codecs

In [2]:
class AmazonCorpus(corpora.TextCorpus):
    """Streamed gensim corpus over a text file that holds one document per line.

    Each line of `question_file` is treated as one document and is split into
    tokens by the supplied `tokenizer`. The token dictionary is built while the
    base class initialises and is then pruned with `Dictionary.filter_extremes`.
    """

    def __init__(self, question_file, tokenizer, no_below=3, no_above=0.2):
        """Build the corpus and its (pruned) dictionary.

        Args:
            question_file: Path to a UTF-8 text file with one document per line.
            tokenizer: Callable that takes the text of one document (possibly
                several sentences) and returns an iterable of string tokens. A
                token is the unit the LDA is trained on: a single word, a word
                stem or a word phrase.
            no_below: Passed to `filter_extremes`; tokens that occur in fewer
                than this many documents are dropped. Defaults to 3.
            no_above: Passed to `filter_extremes`; tokens that occur in more
                than this fraction of all documents are dropped. Defaults to 0.2.
        """
        # Documents are stored in a file, one document per line.
        self.question_file = question_file

        # `get_texts` reads the tokenizer from a private attribute. In gensim
        # releases where `TextCorpus.__init__` accepts its own `tokenizer`
        # argument, the base initialiser assigns `self.tokenizer` itself and
        # would replace ours with gensim's default *before* it builds the
        # dictionary, silently ignoring the tokenizer passed in here.
        self._question_tokenizer = tokenizer
        self.tokenizer = tokenizer

        # The `TextCorpus` base class builds a dictionary over all tokens of all
        # documents by iterating `get_texts`. Any non-None `input` triggers that
        # build, so `True` is only a placeholder; the real source is
        # `question_file`. Both attributes above must be set before this call.
        super(AmazonCorpus, self).__init__(input=True)

        # Restore the public attribute in case the base initialiser reset it.
        self.tokenizer = tokenizer

        # Prune the vocabulary: very frequent tokens (stop-word-like terms such
        # as 'the' or 'is' that carry little meaning) and very rare tokens.
        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    def get_texts(self):
        """Yield the token list of every document, one list per line of the file.

        Example: for the two lines "Hello world. I am doc1." and
        "Nice code! I like it." this yields two lists, one per line. The exact
        tokens (casing, punctuation, minimum length) depend entirely on the
        tokenizer passed to the constructor.
        """
        with codecs.open(self.question_file, 'r', 'utf-8') as questions:
            for question in questions:
                yield list(self._question_tokenizer(question))

In [3]:
# Use gensim's `simple_preprocess` as the tokenizer that is handed to the corpus below.
tokenizer = utils.simple_preprocess

In [4]:
# Build the corpus over the headphone sentence file (one document per line). Constructing it
# also builds the token dictionary and prunes it via `filter_extremes`.
corpus = AmazonCorpus('data/headphone_sents.txt', tokenizer)

In [17]:
# Train a 10-topic LDA model on the corpus with three worker processes.
# An earlier experiment used: iterations=3000, chunksize=5000, num_topics=100,
# eval_every=3, workers=5.
model = models.LdaMulticore(
    corpus=corpus,
    id2word=corpus.dictionary,
    num_topics=10,
    passes=3,
    iterations=5000,
    chunksize=100,
    workers=3,
)

In [18]:
# Show the learned topics as weighted word lists. Up to 20 topics are requested although the
# model above was trained with num_topics=10 — presumably gensim caps the request at the
# model's topic count (TODO confirm against the installed gensim version).
model.print_topics(20)

['0.021*months + 0.018*working + 0.015*product + 0.013*now + 0.012*first + 0.012*broke + 0.011*bought + 0.011*back + 0.010*warranty + 0.009*new',
 '0.016*head + 0.010*re + 0.009*he + 0.009*wear + 0.009*little + 0.009*around + 0.007*off + 0.007*nice + 0.007*cord + 0.007*over',
 '0.020*beats + 0.011*love + 0.010*by + 0.010*she + 0.010*best + 0.009*am + 0.009*amazing + 0.008*got + 0.008*look + 0.007*buy',
 '0.018*phone + 0.017*headset + 0.014*bluetooth + 0.013*mic + 0.013*volume + 0.012*button + 0.009*cable + 0.008*microphone + 0.008*iphone + 0.008*control',
 '0.008*by + 0.007*set + 0.007*off + 0.007*been + 0.006*first + 0.006*into + 0.006*through + 0.006*work + 0.005*then + 0.005*using',
 '0.027*earphones + 0.021*tips + 0.011*remote + 0.011*their + 0.009*colors + 0.009*foam + 0.009*company + 0.008*we + 0.008*klipsch + 0.008*set',
 '0.029*volume + 0.027*cord + 0.016*control + 0.010*clip + 0.009*wire + 0.009*into + 0.009*down + 0.008*cable + 0.007*pull + 0.007*rain',
 '0.050*noise + 0.019*