# Feature Engineering: Latent Dirichlet Allocation

## imports

In [4]:
import pickle
import numpy as np
import pandas as pd
from datetime import date
import json
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import re
from collections import Counter, defaultdict
import itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
import gensim
from gensim.corpora.dictionary import Dictionary
import spacy

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import warnings
warnings.filterwarnings('ignore')

In [6]:
sw = stopwords.words("english")

In [7]:
with open(f'../data/metascripts_df_profanity.pickle', 'rb') as file:
    metascripts = pickle.load(file)

## prepare the data

In [8]:
descriptions = list(metascripts['description'].values)
scripts = list(metascripts['transcript'].values)
scripts_dict = dict(zip(descriptions, scripts))

## tokenize and lemmatize

In [25]:
nlp = spacy.load('en_core_web_md')

def chunker(iterable, chunksize):
    for i in range(0, len(list(iterable)), chunksize):
        yield iterable[i:i+chunksize]

chunksize = 2
junk =  ['\n', '♪']
corpus = []
for scripts_subset in tqdm(chunker(scripts, 2), total = np.ceil(len(scripts)/chunksize)):
    corpus.extend([[token.lemma_ for token in script if token.text not in junk and not token.is_punct and not token.is_stop] for script in nlp.pipe(scripts_subset)])

  0%|          | 0/155.0 [00:00<?, ?it/s]

## CountVectorizer and TfidfVectorizer

In [29]:
def dummy(doc):
    return doc

ct_vectorizer = CountVectorizer(lowercase = True, 
                             tokenizer = dummy,
                             preprocessor = dummy
                            )
scripts_tf = ct_vectorizer.fit_transform(corpus)

tfidf_vectorizer = TfidfVectorizer(**ct_vectorizer.get_params())
scripts_tfidf = tfidf_vectorizer.fit_transform(corpus)

In [30]:
tfidf_vectorizer.get_feature_names_out()[8000:8015]

array(['Kaufman', 'Kavanaugh', 'Kavi', 'Kavin', 'Kawasaki', 'Kay', 'Kaye',
       'Kaz', 'Kazan', 'Ke$ha', 'Keanu', 'Keats', 'Keegan', 'Keema',
       'Keeno'], dtype=object)

In [33]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=50, random_state=0)
lda_tf.fit(scripts_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=50, random_state=0)
lda_tfidf.fit(scripts_tfidf)

LatentDirichletAllocation(n_components=50, random_state=0)

In [36]:
pyLDAvis.sklearn.prepare(lda_tf, scripts_tf, ct_vectorizer, mds='mmds')

In [34]:
pyLDAvis.sklearn.prepare(lda_tfidf, scripts_tfidf, tfidf_vectorizer)