In [1]:
from pathlib import Path
from time import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import LdaMulticore
from gensim.matutils import Sparse2Corpus
from spacy.lang.en.stop_words import STOP_WORDS

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
PROCESSED_DATA_DIR = Path("../data/processed/amazon")
MAX_VOCAB_SIZE = 10000

In [2]:
tokenized_descriptions = pd.read_pickle(
    PROCESSED_DATA_DIR / "tokenized_descriptions.p"
)

In [3]:
tokenized_descriptions.head()

Unnamed: 0,item,description,description_length
0,143588,"[barefoot, contessa, volume, this, three, disc...",67
1,143561,"[giada, de, laurentis, everyday, italian, dvds...",49
2,1499572,"[like, new]",2
3,1526863,"[steve, green, hide, -pron-, your, heart, 13, ...",14
4,1421409,"[angel, show, dumitru, california, las, vegas,...",128


In [4]:
train = tokenized_descriptions.sample(int(0.8 * tokenized_descriptions.shape[0]))
test = tokenized_descriptions.drop(train.index)

In [5]:
vectorizer = CountVectorizer(
    max_features=MAX_VOCAB_SIZE, stop_words=STOP_WORDS
)

In [6]:
tr_desc = [" ".join(d) for d in train.description.tolist()]
te_desc = [" ".join(d) for d in test.description.tolist()]

In [7]:
X_tr_sk = vectorizer.fit_transform(tr_desc)
X_te_sk = vectorizer.transform(te_desc)



In [8]:
id2word = dict()
for k, v in vectorizer.vocabulary_.items():
    id2word[v] = k
X_tr_gen = Sparse2Corpus(X_tr_sk, documents_columns=False)
X_te_gen = Sparse2Corpus(X_te_sk, documents_columns=False)

In [9]:
# MODEL PARAMETERS
NB_TOPICS = 10
decay = 0.7
offset = 1.
max_iterations = 10
batch_size = 200
max_e_steps = 100
eval_every = 1
mode = "online"

In [10]:
#SKLEARN
lda_sklearn = LatentDirichletAllocation(
    n_components=NB_TOPICS,
    batch_size=batch_size,
    learning_decay=decay,
    learning_offset=offset,
    n_jobs=-1,
    random_state=0,
    max_iter=max_iterations,
    learning_method=mode,
    max_doc_update_iter=max_e_steps,
    evaluate_every=eval_every)

start = time()
lda_sklearn.fit(X_tr_sk)
sk_time = time() - start

sklearn_perplexity = lda_sklearn.perplexity(X_te_sk)

In [11]:
# GENSIM
start = time()
lda_gensim_mc = LdaMulticore(
    X_tr_gen,
    id2word=id2word,
    decay=decay,
    offset=offset,
    num_topics=NB_TOPICS,
    passes=max_iterations,
    batch=False, #for online training
    chunksize=batch_size,
    iterations=max_e_steps,
    eval_every=eval_every)
gn_time = time() - start

log_prep_gensim_mc   = lda_gensim_mc.log_perplexity(X_te_gen)
preplexity_gensim_mc = np.exp(-1.*log_prep_gensim_mc)

In [12]:
print("gensim run time and perplexity: {}, {}".format(gn_time, preplexity_gensim_mc))
print("sklearn run time and perplexity: {}, {}".format(sk_time, sklearn_perplexity))

gensim run time and perplexity: 348.2337691783905, 2617.3677838127815
sklearn run time and perplexity: 235.26577377319336, 2761.7179135533775


In [13]:
topic_words = dict()
gensim_topics = lda_gensim_mc.show_topics(formatted=False)
def sklearn_show_topics(model, feature_names, n_top_words):
    sk_topics = []
    for topic_idx, topic in enumerate(model.components_):
        tot_score = np.sum(topic)
        top_words = [(feature_names[i],topic[i]/tot_score)
            for i in topic.argsort()[:-n_top_words - 1:-1]]
        sk_topics.append([topic_idx,top_words])
    return sk_topics
feature_names = vectorizer.get_feature_names()
sklearn_topics = sklearn_show_topics(lda_sklearn, feature_names,10)
topic_words['gensim']  = gensim_topics
topic_words['sklearn'] = sklearn_topics

# or in data frame formta
topic_words_df = dict()
for model, result in topic_words.items():
    df = pd.DataFrame()
    for topic in result:
        cols =  [[word[0] for word in topic[1]] for topic in result]
        for i,c in enumerate(cols):
            df["topic_"+str(i)] = c
    topic_words_df[model] = df

print('Sklearn \n')
print(topic_words_df['sklearn'])
print('\n')
print('Gensim \n')
print(topic_words_df['gensim'])

Sklearn 

    topic_0     topic_1         topic_2 topic_3 topic_4  topic_5 topic_6  \
0      film      strong           false    life   class       br     man   
1     story          li          locked    love  images  episode    find   
2     world         dvd        priority  dempty  amazon   season   woman   
3       war        disc              de  family    href   series    turn   
4  director          vs      semihidden    find     com     star    come   
5  american  collection  unhidewhenused    live   https   comedy  murder   
6       new   christmas    lsdexception   world     div       tv   young   
7   history     feature              la    year      na  include   movie   
8        em         new          accent   young     ssl     love    star   
9   include       video            true   child    spin     john    girl   

   topic_7   topic_8     topic_9  
0    music    amazon         dvd  
1     live       com      player  
2  include       dvd     english  
3     song   

In [5]:
train = tokenized_descriptions.sample(5000)
test = tokenized_descriptions.drop(train.index).sample(1000)

In [6]:
vectorizer = CountVectorizer(
    max_features=MAX_VOCAB_SIZE, stop_words=STOP_WORDS
)

In [7]:
tr_desc = [" ".join(d) for d in train.description.tolist()]
te_desc = [" ".join(d) for d in test.description.tolist()]

In [8]:
X_tr_sk = vectorizer.fit_transform(tr_desc)
X_te_sk = vectorizer.transform(te_desc)



In [9]:
id2word = dict()
for k, v in vectorizer.vocabulary_.items():
    id2word[v] = k
X_tr_gen = Sparse2Corpus(X_tr_sk, documents_columns=False)
X_te_gen = Sparse2Corpus(X_te_sk, documents_columns=False)

In [10]:
# MODEL PARAMETERS
decay = 0.7
offset = 1.
max_iterations = 10
batch_size = 200
max_e_steps = 100
eval_every = 1
mode = "online"

In [11]:
for n_topics in [5, 10, 20, 30]:
    lda_sklearn = LatentDirichletAllocation(
        n_components=n_topics,
        batch_size=batch_size,
        learning_decay=decay,
        learning_offset=offset,
        n_jobs=-1,
        random_state=0,
        max_iter=max_iterations,
        learning_method=mode,
        max_doc_update_iter=max_e_steps,
        evaluate_every=eval_every)
    lda_sklearn.fit(X_tr_sk)  
    sklearn_perplexity = lda_sklearn.perplexity(X_te_sk)   
    print(n_topics, sklearn_perplexity)

5 6322.687323963289
10 8164.051691175491
20 10627.883780705944
30 12783.048442798514


In [14]:
for n_topics in [5, 10, 20, 30]:
    lda_gensim_mc = LdaMulticore(
        X_tr_gen,
        id2word=id2word,
        decay=decay,
        offset=offset,
        num_topics=n_topics,
        passes=max_iterations,
        batch=False, #for online training
        chunksize=batch_size,
        iterations=max_e_steps,
        eval_every=eval_every)
    log_prep_gensim_mc   = lda_gensim_mc.log_perplexity(X_te_gen)
    preplexity_gensim_mc = np.exp(-1.*log_prep_gensim_mc)
    print(n_topics, preplexity_gensim_mc)

5 7797.699262434815
10 15881.674141667338
20 61983.10738771032
30 192539.38416285504
