# Initial configuration

In [7]:
import datetime as dt
import os

import numpy as np
import pandas as pd

# Root folder of the dataset, relative to the notebook's working directory.
base_path = "../assets/data/all-news"

lemmatized_path = os.path.abspath(os.path.join(base_path, "lemmatized"))
transaction_path = os.path.abspath(os.path.join(base_path, "transaction"))

# List files. os.listdir returns entries in arbitrary order, so sort them to
# make positional access (e.g. transaction_files[0]) select the same file on
# every machine and every run.
lemmatized_files = sorted(os.listdir(lemmatized_path))
transaction_files = sorted(os.listdir(transaction_path))

In [41]:
# Read only the `id` and `text` columns of the first listed transaction file
first_transaction_file = os.path.join(transaction_path, transaction_files[0])
df = pd.read_csv(first_transaction_file, usecols=["id", "text"])
df.head()

Unnamed: 0,id,text
0,2016-01/0,paris hilton woman black uncle montys funeral
1,2016-01/0,paris hilton arrive lax wednesday dress pay la...
2,2016-01/0,paris fly switzerland especially funeral brins...
3,2016-01/0,monty die sunday long battle cancer
4,2016-01/0,loss obviously hit paris hard


## Frequency-based stop words

In [42]:
def find_stop_words(data: pd.Series, percentile_top: int = 95, percentile_bottom: int = 5, quiet=False):
    """Split a corpus vocabulary into frequency-based stop words and kept words.

    The document frequency (df) of a word is the fraction of documents that
    contain it at least once. A word is reported as a stop word when its df is
    at or above the ``percentile_top`` percentile, or at or below the
    ``percentile_bottom`` percentile, of the df values of all words.

    Args:
        data: One whitespace-separated string per document. Missing values
            (NaN/None) are dropped and are not counted as documents.
        percentile_top: Percentile (0-100) of the df distribution from which
            words are considered too frequent.
        percentile_bottom: Percentile (0-100) of the df distribution up to
            which words are considered too rare.
        quiet: When False, print a summary of the split and the elapsed time.

    Returns:
        A ``(freq_stop_words, vocabulary)`` tuple of word lists. Both lists are
        ordered by descending df, ties broken alphabetically, so the result is
        identical across interpreter runs. Both are empty when ``data``
        contains no words.
    """
    t0 = dt.datetime.now()

    # Missing texts (e.g. empty CSV cells read back as NaN) have no .split()
    documents = data.dropna()

    # Get unique words for each document
    unique = documents.apply(lambda x: list(set(x.split())))

    # Number of documents each word appears in
    document_counts = unique.explode().value_counts(sort=False)
    if document_counts.empty:
        # No words at all: there is nothing to rank and no percentile to compute
        return [], []

    # Set iteration order depends on per-process string hash randomisation, so
    # the order of equally frequent words would otherwise change between runs
    # (and with it any id mapping built from the returned lists). Sorting by
    # word first and then stably by count makes the order reproducible.
    document_counts = document_counts.sort_index().sort_values(ascending=False, kind="mergesort")

    # Calculate the words document frequency
    words_document_frequency = document_counts / len(unique)

    # Get percentiles values
    top = np.percentile(words_document_frequency, percentile_top)
    bottom = np.percentile(words_document_frequency, percentile_bottom)

    # Find words
    mask_top = words_document_frequency >= top
    mask_bottom = bottom >= words_document_frequency
    mask_wdf = mask_top | mask_bottom
    freq_stop_words = list(words_document_frequency[mask_wdf].index)
    vocabulary = list(words_document_frequency[~mask_wdf].index)

    if not quiet:
        # Width used to right-align the counts in the report
        length = len(str(len(words_document_frequency))) + 1
        print(f"{len(words_document_frequency)}".rjust(length), "- raw vocabulary length")
        print(f"{len(vocabulary)}".rjust(length), "- new vocabulary length\n")
        print(f"{mask_wdf.sum()}".rjust(length), "- new stop words founded")
        print(f"{mask_top.sum()}".rjust(length), f"- df above  {top:.8f}")
        print(f"{mask_bottom.sum()}".rjust(length), f"- df bellow {bottom:.8f}\n")
        print(f"Max df: {words_document_frequency.max():.8f}")
        print(f"Min df: {words_document_frequency.min():.8f}\n")
        print(f"Execution in {dt.datetime.now() - t0}")

    return freq_stop_words, vocabulary

In [43]:
# Find stop words
freq_stop_words, vocabulary = find_stop_words(df.text)

 243635 - raw vocabulary length
 110819 - new vocabulary length

 132816 - new stop words founded
  12235 - df above  0.00006010
 120581 - df bellow 0.00000077

Max df: 0.13734729
Min df: 0.00000077

Execution in 0:00:11.227079


# Data transformation: Corpus and Dictionary

In [44]:
import gensim.corpora as corpora

# Build the word <-> id mapping, feeding each retained vocabulary word to the
# dictionary as its own single-word document
id2word = corpora.Dictionary([word] for word in vocabulary)

# Encode every document as a bag of words over the dictionary
df["corpus"] = df["text"].str.split().map(id2word.doc2bow)

# Keep only the documents whose bag of words is not empty
has_tokens = df["corpus"].map(len) > 0
filterred_df = df[has_tokens]

# Base Model Performance

In [51]:
import gensim

# Hyper-parameters of the baseline LDA model
lda_params = dict(
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

# Train on the non-empty bag-of-words documents
lda_model = gensim.models.LdaMulticore(
    corpus=filterred_df["corpus"].to_list(),
    id2word=id2word,
    **lda_params,
)

## View the topics in the LDA model

In [54]:
from pprint import pprint
import pickle

# Saving model in a pickle file
models_path = "../assets/models"
lda_model_path = os.path.abspath(os.path.join(models_path, "2020-09-21_10-topics.p"))

# The context manager closes the file even if pickling fails, so the handle is
# not leaked and the data is flushed to disk before the cell finishes.
with open(lda_model_path, "wb") as model_file:
    pickle.dump(lda_model, model_file)

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())

[(0,
  '0.008*"qian" + 0.008*"huxtable" + 0.005*"vilsack" + 0.005*"nassau" + '
  '0.005*"anthropologist" + 0.004*"suspense" + 0.004*"casa" + 0.004*"mota" + '
  '0.003*"gran" + 0.003*"austen"'),
 (1,
  '0.010*"yin" + 0.010*"fa" + 0.006*"kynect" + 0.005*"quicksand" + '
  '0.005*"steinberg" + 0.005*"staging" + 0.004*"baxter" + 0.004*"goodell" + '
  '0.004*"overstay" + 0.004*"twentyonestcentury"'),
 (2,
  '0.007*"qing" + 0.006*"konnikova" + 0.005*"vinegar" + 0.005*"condominium" + '
  '0.004*"tomlin" + 0.004*"rhoade" + 0.004*"snowflake" + 0.004*"vividly" + '
  '0.003*"eliot" + 0.003*"doubly"'),
 (3,
  '0.009*"mckinley" + 0.008*"chi" + 0.008*"ni" + 0.008*"edu" + 0.007*"napoli" '
  '+ 0.005*"lai" + 0.005*"nast" + 0.005*"conde" + 0.005*"serenity" + '
  '0.005*"barrow"'),
 (4,
  '0.008*"tai" + 0.008*"shu" + 0.006*"fourteenseven" + 0.006*"hao" + '
  '0.005*"barrack" + 0.004*"sundays" + 0.004*"fortyfiveth" + 0.004*"yong" + '
  '0.004*"fortysixth" + 0.004*"paulson"'),
 (5,
  '0.012*"ye" + 0.010*"c

## Compute Model Perplexity and Coherence Score

In [64]:
from gensim.models import CoherenceModel

# Evaluate on the same non-empty documents the model was trained on.
bow_corpus = filterred_df.corpus.to_list()
# CoherenceModel needs tokenized documents (lists of words) for 'c_v';
# passing the raw strings would make it iterate over single characters.
tokenized_texts = filterred_df.text.str.split().to_list()

# Compute Perplexity
# log_perplexity returns the per-word likelihood bound (perplexity is
# 2 ** -bound), so a value closer to zero indicates a better fit.
print('Perplexity: ', lda_model.log_perplexity(bow_corpus))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('Coherence Score: ', coherence_lda)

  and should_run_async(code)


KeyboardInterrupt: 

## Label documents

In [56]:
# Apply the trained model to every non-empty bag-of-words document
doc_lda = lda_model[filterred_df["corpus"].tolist()]

# Visualize Topics

In [63]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

# Use the bag-of-words documents the model was trained on; no `corpus` variable
# is defined by the cells above.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, filterred_df.corpus.to_list(), id2word)

LDAvis_prepared

  and should_run_async(code)
