First, we specify which directory we're working with. If you only have a tokens file and no `documents` directory, run `tokens-to-documents.sh`.

In [7]:
from os.path import join
# Root working directory; every input and output below lives inside it.
path = 'cocktails'
# Derive all locations from the root in one go: the input documents directory,
# the serialized corpus, the saved dictionary, the exported LDA vectors and
# the per-vector labels file.
documents_path, corpus_path, dictionary_path, vectors_path, words_path = (
    join(path, leaf)
    for leaf in ('documents', 'corpus.mm', 'dictionary.dict', 'vectors', 'words')
)

In [8]:
import os
from os.path import isfile, join
from progressbar import ProgressBar
def listdir(path):
    """Yield the path of every visible regular file directly inside *path*.

    Progress is reported on a ProgressBar while the directory listing is
    walked. Entries whose name starts with a dot (hidden files) and entries
    that are not regular files (e.g. sub-directories) are skipped.

    Args:
        path: Directory to list.

    Yields:
        ``join(path, filename)`` for each accepted entry, in the order
        returned by ``os.listdir``.
    """
    pbar = ProgressBar()
    for filename in pbar(os.listdir(path)):
        # Check the bare filename, not the joined path: the joined path starts
        # with the directory name, so testing it never matched hidden files.
        if filename.startswith('.'):
            continue
        filepath = join(path, filename)
        if isfile(filepath):
            yield filepath

import nltk
def tokenize(document):
    """Split a document string into word tokens with NLTK's word tokenizer."""
    tokens = nltk.word_tokenize(document)
    return tokens

def load_tokens(path):
    """Lazily yield the tokenized contents of each document file under *path*.

    Files are decoded as ASCII with undecodable bytes dropped, and newlines
    are turned into spaces so each file is tokenized as one line of text.
    """
    for document_file in listdir(path):
        with open(document_file, encoding='ascii', errors='ignore') as handle:
            single_line = handle.read().replace('\n', ' ')
            yield tokenize(single_line)

import gensim
from gensim import corpora, models, similarities
def load_dictionary(path):
    """Build a gensim Dictionary from the tokens of every document under *path*."""
    token_stream = load_tokens(path)
    return corpora.Dictionary(token_stream)

from nltk.corpus import stopwords
def clean_dictionary(dictionary):
    """Remove English stopwords and single-document tokens from *dictionary*.

    The dictionary is modified in place, compacted, and returned.
    """
    known_tokens = dictionary.token2id

    # Ids of NLTK's English stopwords that actually occur in the dictionary.
    stop_ids = []
    for stopword in stopwords.words('english'):
        if stopword in known_tokens:
            stop_ids.append(known_tokens[stopword])

    # Ids of tokens whose document frequency is exactly one.
    once_ids = []
    for tokenid, docfreq in dictionary.dfs.items():
        if docfreq == 1:
            once_ids.append(tokenid)

    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()
    return dictionary

In [9]:
class Corpus:
    """Streaming bag-of-words corpus backed by a directory of document files.

    The cleaned dictionary is built once at construction time; each iteration
    re-reads and re-tokenizes the documents from disk.
    """

    def __init__(self, path):
        """Remember the documents directory and build its cleaned dictionary."""
        self.path = path
        raw_dictionary = load_dictionary(path)
        self.dictionary = clean_dictionary(raw_dictionary)

    def __iter__(self):
        """Yield one bag-of-words vector per document, in listing order."""
        yield from (
            self.dictionary.doc2bow(tokens)
            for tokens in load_tokens(self.path)
        )

This next line will create the dictionary, which can take a little while.

In [11]:
# Constructing the Corpus reads every document once to build and clean the dictionary.
corpus = Corpus(documents_path)

100% (665 of 665) |#######################| Elapsed Time: 0:00:00 Time: 0:00:00


See how long the first 10 documents are (the number of distinct dictionary terms in each).

In [12]:
from itertools import islice
# Each item is a bag-of-words list, so its length is the number of distinct
# dictionary terms that survived cleaning in that document.
for document in islice(corpus, 10):
    print(len(document))

3
4
2
4
7
2
2
4
3
4


  0% (0 of 665) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--  1% (7 of 665) |                         | Elapsed Time: 0:00:00 ETA:  0:00:00

See what the first few documents look like.

In [13]:
# Read straight from load_tokens to show the raw token lists, before any
# dictionary filtering or bag-of-words conversion.
for document in islice(load_tokens(documents_path), 10):
    print(document)

['whiskey', 'cherry_brandy', 'lemon']
['gin', 'lemon_juice', 'sugar', 'clear_soda']
['gin', 'root_beer']
['mandarin_vodka', 'clear_soda', 'sprite', 'cranberry_juice']
['orange_liqueur', 'peach_liqueur', 'almond_liqueur', 'vodka', 'pineapple_juice', 'cranberry_juice', 'grenadine']
['tequila', 'water']
['dark_rum', 'coffee']
['gin', 'dry_vermouth', 'sweet_vermouth', 'orange_liqueur']
['chocolate_liqueur', 'apricot_brandy', 'heavy_cream']
['spiced_rum', 'orange_liqueur', 'raspberry_liqueur', 'ice']


  0% (0 of 665) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--  1% (7 of 665) |                         | Elapsed Time: 0:00:00 ETA:  0:00:00

Save corpus and dictionary to disk. This may also take a while because it means tokenizing the whole corpus again.

In [14]:
# Iterating `corpus` here streams every document from disk again; the result is
# written in Matrix Market format, and the dictionary is saved next to it.
corpora.MmCorpus.serialize(corpus_path, corpus)
corpus.dictionary.save(dictionary_path)

100% (665 of 665) |#######################| Elapsed Time: 0:00:00 Time: 0:00:00


Load corpus and dictionary from disk.

In [15]:
# Rebind `corpus` to the on-disk version and attach the saved dictionary as an
# attribute so later cells can keep using `corpus.dictionary`.
corpus = corpora.MmCorpus(corpus_path)
corpus.dictionary = corpora.Dictionary.load(dictionary_path)

Build a TF-IDF model from the corpus and create the transformed corpus. This will discount common terms so they don't affect the LDA as much.

In [17]:
# Fit TF-IDF weights on the bag-of-words corpus (vectors normalized), then
# wrap the corpus so its documents are re-weighted lazily on iteration.
tfidf = models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]

Build an LDA model from the TF-IDF vectors.

In [23]:
# Train LDA on the TF-IDF-weighted corpus, then wrap that corpus so each
# document is mapped to its topic distribution on iteration.
lda = models.LdaModel(
    corpus_tfidf,
    id2word=corpus.dictionary,
    num_topics=10,  # default 100
    passes=10,  # default 1
    iterations=50,  # default 50
)
corpus_lda = lda[corpus_tfidf]

This is what the data looks like as it's being transformed.

In [24]:
# Show the same data at each stage: raw bag-of-words (term id, count) pairs,
# the TF-IDF re-weighted (term id, weight) pairs, and finally the LDA
# (topic id, probability) pairs. Only the LDA stage prints ten documents.
for document in islice(corpus, 1):
    print('corpus:', document)
for document in islice(corpus_tfidf, 1):
    print('corpus_tfidf:', document)
for document in islice(corpus_lda, 10):
    print('corpus_lda:', document)

corpus: [(24, 1.0), (97, 1.0), (99, 1.0)]
corpus_tfidf: [(24, 0.6819840625308929), (97, 0.4953692698520294), (99, 0.5380585701762647)]
corpus_lda: [(0, 0.036839071098256938), (1, 0.036827188300655796), (2, 0.036832275441643261), (3, 0.036827904548863269), (4, 0.036827910485565288), (5, 0.66853834793986533), (6, 0.036826825691749443), (7, 0.036826825507673189), (8, 0.036826825466830061), (9, 0.036826825518897301)]
corpus_lda: [(0, 0.48357878072298816), (1, 0.033776611025191881), (2, 0.033781978411744927), (3, 0.033777386019390752), (4, 0.033776375900691304), (5, 0.033777784853131813), (6, 0.03378063618172384), (7, 0.24618999659378574), (8, 0.033775334200211213), (9, 0.033785116091140252)]
corpus_lda: [(0, 0.044659340119688501), (1, 0.044640112406898795), (2, 0.044639699471707102), (3, 0.044638364089912431), (4, 0.59822126693192257), (5, 0.044638209711270055), (6, 0.044641970577112368), (7, 0.044641936756351322), (8, 0.044638207867809865), (9, 0.044640892067326958)]
corpus_lda: [(0, 0.03

Then we serialize all LDA vectors to disk as a TSV file.

In [25]:
import numpy

# Take the row count from the trained model: `num_topics` is not defined as a
# variable in the cells above, so referencing it raised NameError on a fresh
# run. corpus2dense returns one column per document, hence the transpose to
# get one row (one output line) per document.
numpy_matrix = gensim.matutils.corpus2dense(corpus_lda, lda.num_topics)
vectors = numpy_matrix.transpose()
# Tab-separated, five significant digits per value.
numpy.savetxt(vectors_path, vectors, fmt='%.5g', delimiter='\t')

If you want a label for each line of the vectors file, this piece of code will output a `words` file that has a label for each line (based on the filename of the document that generated that line).

In [26]:
# Write one label per document, in the same listing order used to build the
# corpus, so line N of the words file labels line N of the vectors file.
with open(words_path, 'w') as words_file:
    # Use a dedicated loop variable: naming it `path` overwrote the
    # module-level `path` working-directory setting.
    for document_path in listdir(documents_path):
        # basename works for any directory depth and path separator, unlike a
        # hard-coded split('/')[2]; the label is the filename up to its first dot.
        document_id = os.path.basename(document_path).split('.')[0]
        words_file.write(document_id + '\n')

  0% (0 of 665) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--  1% (7 of 665) |                         | Elapsed Time: 0:00:00 ETA:  0:00:00  2% (14 of 665) |                        | Elapsed Time: 0:00:00 ETA:  0:00:00  3% (21 of 665) |                        | Elapsed Time: 0:00:00 ETA:  0:00:00  4% (28 of 665) |#                       | Elapsed Time: 0:00:00 ETA:  0:00:00  5% (35 of 665) |#                       | Elapsed Time: 0:00:00 ETA:  0:00:00  6% (42 of 665) |#                       | Elapsed Time: 0:00:00 ETA:  0:00:00  7% (49 of 665) |#                       | Elapsed Time: 0:00:00 ETA:  0:00:00  8% (56 of 665) |##                      | Elapsed Time: 0:00:00 ETA:  0:00:00  9% (63 of 665) |##                      | Elapsed Time: 0:00:00 ETA:  0:00:00 10% (70 of 665) |##                      | Elapsed Time: 0:00:00 ETA:  0:00:00 11% (77 of 665) |##                      | Elapsed Time: 0:00:00 ETA:  0:00:00 12% (84 of 665) |###                  