# Topic Modeling for Exploration and Feature Creation

## Imports and Global Settings

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
# Connecting to Postgres RDS on AWS
from sqlalchemy import create_engine
from sqlalchemy.dialects import postgresql
# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# visuals
import pyLDAvis
import pyLDAvis.gensim
# gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel
# nltk
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
# spacy
import spacy

pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading Data

In [2]:
import os
from urllib.parse import quote_plus

# Connection settings are read from the environment so that the RDS endpoint
# and password are never stored in the notebook itself.
db_endpoint = os.environ.get("DB_ENDPOINT")
db_password = os.environ.get("DB_PASSWORD")
if not db_endpoint or not db_password:
    # Fail here with a clear message instead of building a URL that contains
    # the literal text "None" and failing later on the first query.
    raise RuntimeError(
        "Set the DB_ENDPOINT and DB_PASSWORD environment variables before creating the engine."
    )

# The password is URL-escaped because characters such as '@' or '/' would
# otherwise be parsed as part of the connection URL structure.
engine = create_engine(
    f"postgresql+psycopg2://postgres:{quote_plus(db_password)}@{db_endpoint}/yelp_2021_db"
)

In [3]:
# Pull capped samples of review ids and text for the train and test splits.
train_query = "SELECT review_id, review_text FROM text_data_train LIMIT 10000"
test_query = "SELECT review_id, review_text FROM text_data_test LIMIT 1000"

train = pd.read_sql(sql=train_query, con=engine)
test = pd.read_sql(sql=test_query, con=engine)

In [4]:
train.shape

(10000, 2)

In [5]:
test.shape

(1000, 2)

## LDA via Gensim

### Preprocess

In [6]:
# Stopwords: NLTK's English list plus any project-specific additions (none yet)
custom_stop_words = []
stop_words = [*stopwords.words('english'), *custom_stop_words]

In [7]:
# Spacy Prep Model
# Loads the small English pipeline with the 'parser' and 'ner' components
# disabled; preprocess_text below only reads token.lemma_ and token.pos_.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [8]:
def preprocess_text(df, stopwords):
    """Turn the ``review_text`` column of *df* into lemmatized token lists.

    Each review is tokenized with ``gensim.utils.simple_preprocess``
    (``deacc=True``), filtered against *stopwords*, passed through a bigram
    model fitted on the reviews in *df* itself, and finally lemmatized with
    the module-level spaCy pipeline ``nlp``, keeping only tokens tagged as
    noun, adjective, verb or adverb.

    Args:
        df: DataFrame with a ``review_text`` column; each value is passed
            through ``str()`` before tokenizing.
        stopwords: Iterable of words to drop after tokenizing. Inside this
            function the name shadows the ``nltk.corpus.stopwords`` import.

    Returns:
        A list with one list of lemma strings per row of *df*, in row order.
        An empty *df* yields an empty list.
    """
    reviews = df['review_text'].values.tolist()
    if not reviews:
        # Nothing to tokenize; skip fitting a phrase model on an empty corpus.
        return []

    # Set membership is O(1); testing against the list was O(len(stopwords))
    # for every single token.
    stopword_set = set(stopwords)
    allowed_postags = {'NOUN', 'ADJ', 'VERB', 'ADV'}

    # Create tokens and remove stopwords
    tokenized = [
        [word for word in gensim.utils.simple_preprocess(str(review), deacc=True)
         if word not in stopword_set]
        for review in reviews
    ]

    # Add bigrams (the phrase model is fitted on this call's own documents)
    bigram = gensim.models.Phrases(tokenized, min_count=5, threshold=50)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # Lemmatize, keeping only the allowed part-of-speech tags
    return [
        [token.lemma_ for token in nlp(" ".join(bigram_mod[words])) if token.pos_ in allowed_postags]
        for words in tokenized
    ]

In [9]:
# Run the same preprocessing over both splits (train first, then test).
processed_train, processed_test = (
    preprocess_text(frame, stop_words) for frame in (train, test)
)

In [10]:
# Prepare Data for Model
# The dictionary is built from the training tokens only; both splits are then
# converted to bag-of-words with that same dictionary.
id2word = corpora.Dictionary(processed_train)
train_corpus = list(map(id2word.doc2bow, processed_train))
test_corpus = list(map(id2word.doc2bow, processed_test))

### Determine Number of Topics

In [12]:
def eval_lda_models(bow_corpus, id2word, processed_texts, topic_counts_to_test):
    """Fit one LDA model per candidate topic count and score each of them.

    Args:
        bow_corpus: Bag-of-words corpus used both to fit each model and to
            compute its ``log_perplexity`` value.
        id2word: Dictionary passed to the LDA model and the coherence model.
        processed_texts: Tokenized documents passed to ``CoherenceModel``.
        topic_counts_to_test: Iterable of topic counts to try.

    Returns:
        Dict keyed by ``'<n>_topics'``; each value is a dict with the fitted
        ``'model'``, its ``'perplexity'`` (the value returned by
        ``log_perplexity``) and its ``'coherence'`` (``c_v``). Each entry is
        also printed as soon as it is computed.
    """
    results = {}
    for num_topics in topic_counts_to_test:
        key = f'{num_topics}_topics'
        fitted = gensim.models.LdaMulticore(bow_corpus, num_topics=num_topics, id2word=id2word, passes=5)
        # Dict values are evaluated top to bottom, so the perplexity call
        # still runs before the coherence computation.
        results[key] = {
            'model': fitted,
            'perplexity': fitted.log_perplexity(bow_corpus),
            'coherence': CoherenceModel(
                model=fitted, texts=processed_texts, dictionary=id2word, coherence='c_v'
            ).get_coherence(),
        }
        print(f'{key}: {results[key]}')
    return results

In [None]:
# Sweep a range of topic counts on the training corpus.
candidate_topic_counts = [3, 5, 7, 10, 20, 50, 100]
lda_eval_results = eval_lda_models(train_corpus, id2word, processed_train, candidate_topic_counts)

### Model Creation

In [14]:
# Fit the final 10-topic model on the training corpus.
final_lda_model = gensim.models.LdaMulticore(
    train_corpus,
    num_topics=10,
    id2word=id2word,
    passes=10,
    alpha='asymmetric',
    eta='auto',
)

### Model Scoring

In [16]:
# Score the final model on the training data: the value returned by
# log_perplexity, followed by c_v coherence.
train_log_perplexity = final_lda_model.log_perplexity(train_corpus)
print('\nPerplexity: ', train_log_perplexity)

coherence_model_lda = CoherenceModel(
    model=final_lda_model,
    texts=processed_train,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.900115522932563

Coherence Score:  0.4074330545876653


### Visualization

In [None]:
# Inspect the 3-topic model produced by the evaluation sweep.
model = lda_eval_results['3_topics']['model']

# Visualize the topics
pyLDAvis.enable_notebook()
# `corpus` is never defined in this notebook; the sweep models were fitted on
# `train_corpus`, so that is the corpus to visualize against.
vis = pyLDAvis.gensim.prepare(model, train_corpus, id2word)
vis

### Save Model

In [19]:
# Persist the fitted model to 'lda_100k.model' in the working directory.
# NOTE(review): the Load Model cell reads 'lda_model_1M', not this file —
# confirm which artifact is intended.
final_lda_model.save('lda_100k.model')

### Load Model

In [11]:
# Replace final_lda_model with a previously saved model read from 'lda_model_1M'.
# NOTE(review): this is a different file from the 'lda_100k.model' written in
# the Save Model cell; presumably a model trained on a larger sample — confirm.
final_lda_model = gensim.models.LdaMulticore.load('lda_model_1M')

### Create Feature Vectors

In [12]:
# Number of topic-probability columns to emit per document (topics 0..4).
# NOTE(review): the model fitted in this notebook uses num_topics=10;
# presumably the loaded model has 5 topics — confirm they match.
topic_count = 5

In [13]:
# Build one {topic column -> probability} dict per training document.
train_dicts = []
for bow in train_corpus:
    # get_document_topics returns (topic_id, probability) pairs and can omit
    # topics with negligible probability even when minimum_probability=0.0,
    # so look values up by topic id instead of by list position.
    topic_probs = dict(final_lda_model.get_document_topics(bow, minimum_probability=0.0))
    train_dicts.append(
        {f'topic_{j}_lda': topic_probs.get(j, 0.0) for j in range(topic_count)}
    )

In [14]:
# Put the topic columns next to the review ids, then drop the raw text.
train_topic_features = pd.DataFrame(train_dicts)
finished_train = pd.concat([train, train_topic_features], axis=1).drop(columns=['review_text'])

In [15]:
finished_train.head()

Unnamed: 0,review_id,topic_0_lda,topic_1_lda,topic_2_lda,topic_3_lda,topic_4_lda
0,3zgiXC-s7v1AeFmxSU-1nA,0.00518,0.00512,0.63333,0.00512,0.35125
1,3zr5Izh5zdjffqzRgDU5WQ,0.68811,0.1396,0.15149,0.01035,0.01045
2,3zueavCiQ3SUinF0_l_uIA,0.00504,0.00502,0.23784,0.74705,0.00505
3,4-F08YvsIFeZ7Rto7KFNnw,0.02377,0.40712,0.02385,0.52139,0.02388
4,4-MsrSjKfpfE3BSdvM1-MA,0.22151,0.07559,0.09533,0.22634,0.38124


In [16]:
# Build one {topic column -> probability} dict per test document.
test_dicts = []
for bow in test_corpus:
    # get_document_topics returns (topic_id, probability) pairs and can omit
    # topics with negligible probability even when minimum_probability=0.0,
    # so look values up by topic id instead of by list position.
    topic_probs = dict(final_lda_model.get_document_topics(bow, minimum_probability=0.0))
    test_dicts.append(
        {f'topic_{j}_lda': topic_probs.get(j, 0.0) for j in range(topic_count)}
    )

In [17]:
# Put the topic columns next to the review ids, then drop the raw text.
test_topic_features = pd.DataFrame(test_dicts)
finished_test = pd.concat([test, test_topic_features], axis=1).drop(columns=['review_text'])

In [18]:
finished_test.head()

Unnamed: 0,review_id,topic_0_lda,topic_1_lda,topic_2_lda,topic_3_lda,topic_4_lda
0,--p3d1axlnA7ka_p6hO-QQ,0.31891,0.2816,0.25317,0.14141,0.00491
1,-1v3W4XqQcIe44_I1lZYyA,0.17574,0.43749,0.20904,0.17425,0.00347
2,-21y2QEKfhjxh2algH_0nQ,0.0172,0.15948,0.38075,0.42536,0.01721
3,-358vecdAUh6ECkNfawvHw,0.0108,0.69864,0.01098,0.26876,0.01081
4,-3_NmlYMibrapNEnS_gfcg,0.0815,0.00452,0.54247,0.36702,0.00449


### Save Data

In [32]:
# finished_train.to_sql(
#         "text_topic_model_train",
#         con=engine,
#         index=False,
#         if_exists="replace",
#     )

In [None]:
# finished_test.to_sql(
#         "text_topic_model_test",
#         con=engine,
#         index=False,
#         if_exists="replace",
#     )