# Topic Modeling

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit Learn
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import LsiModel
from gensim.models import CoherenceModel

# spaCy
import spacy

# nltk
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer

## pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()


# Silence Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load Text Libraries


### Stop Words

#### Functions

In [3]:
def add_stop_words(word_list, list_stop_words):
    """Add every word in ``word_list`` to the stop-word set, in place.

    Parameters
    ----------
    word_list : iterable of str
        Words to add.
    list_stop_words : set
        Stop-word collection to mutate; despite the name it is used as a set.

    Returns
    -------
    None
    """
    # Bulk insert; words that are already present are left untouched.
    list_stop_words.update(word_list)
        
def remove_stop_words(word_list, list_stop_words):
    """Remove every word in ``word_list`` from the stop-word set, in place.

    Words that are not in the set are ignored rather than raising
    ``KeyError``, so the calling cell can be re-run safely without first
    rebuilding the stop-word set.

    Parameters
    ----------
    word_list : iterable of str
        Words to remove.
    list_stop_words : set
        Stop-word collection to mutate; despite the name it is used as a set.

    Returns
    -------
    None
    """
    # Discard semantics: a missing word is a no-op, which keeps the call idempotent.
    list_stop_words.difference_update(word_list)

#### Load Stop Words from NLTK and spaCy

In [4]:
# spaCy: load the small English pipeline and copy its default stop words into a set
nlp = spacy.load('en_core_web_sm')
stop_words_spacy = set(nlp.Defaults.stop_words)

# nltk: English stop-word list, converted to a set
stop_words_nltk = set(stopwords.words('english'))

# combined stop words: everything that appears in either library's list
full_stop_words = stop_words_spacy | stop_words_nltk

#### Modify Stop Words Based on EDA

In [5]:
# Extra words to treat as stop words (chosen during EDA, per the section heading)
words_to_add = [
    'like', 'know', 'want', 'feel', 'going', 'think',
    'reddit', 'imgur', 'pron', 'officially', 'story',
    'month', 'week', 'time', 'day', 'year', 'delete',
]

add_stop_words(words_to_add, full_stop_words)

# Take 'not' back out of the stop-word set so it survives vectorization
remove_stop_words(['not'], full_stop_words)

## Read-In Data

In [6]:
# Locations of the three preprocessed datasets
HEALTH_CSV = '../data/womens_health_preprocessed.csv'
OBSGYN_CSV = '../data/fertility_and_pregnancy_preprocessed.csv'
POSPAR_CSV = '../data/postpartum_preprocessed.csv'

# NOTE(review): only the women's health file is read with an explicit
# line terminator — confirm whether the other two files need it as well.
health = pd.read_csv(HEALTH_CSV, lineterminator='\n')
print(f'Health: {health.shape}')

obsgyn = pd.read_csv(OBSGYN_CSV)
print(f'ObsGyn: {obsgyn.shape}')

pospar = pd.read_csv(POSPAR_CSV)
print(f'Pospar: {pospar.shape}')

Health: (30616, 8)
ObsGyn: (92943, 8)
Pospar: (49094, 8)


## Light Preprocessing

In [7]:
def remove_digits(df, column1, column2):
    """Strip every run of ASCII digits from two string columns of ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to mutate.
    column1, column2 : str
        Names of the string columns to clean.

    Returns
    -------
    None
    """
    for name in (column1, column2):
        # Regex replacement: each sequence of one or more digits becomes ''.
        df[name] = df[name].str.replace('[0-9]+', '', regex = True)
    
# https://stackoverflow.com/questions/47010044/how-to-remove-numeric-characters-present-in-countvectorizer

# Strip digits from the raw text and the lemmatized tokens of every dataset.
# remove_digits mutates each frame in place, so nothing is reassigned here.
dfs = [health, obsgyn, pospar]

for df in dfs:
    remove_digits(df, 'total_text', 'lemma_tokens')

In [8]:
def remove_phrase(df, column, phrase = 'delete'):
    """Remove every occurrence of ``phrase`` from a string column, in place.

    The match is a plain substring match, so the phrase is also removed when
    it appears inside a longer word (e.g. 'deleted' becomes 'd').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to mutate.
    column : str
        Name of the string column to clean.
    phrase : str, default 'delete'
        Literal text to remove. The default keeps the original behaviour.

    Returns
    -------
    None
    """
    # regex=False makes the literal match explicit, so the result does not
    # depend on the pandas version's default for `regex`.
    df[column] = df[column].str.replace(phrase, '', regex = False)

# Only the fertility/pregnancy tokens are cleaned of the 'delete' substring here.
remove_phrase(obsgyn, 'lemma_tokens')

## Model Preprocessing

In [33]:
def build_tvec(df, column, stop_words = full_stop_words):
    """Fit a TF-IDF vectorizer on one text column and return the document-term matrix.

    The vectorizer uses unigrams and bigrams, drops terms that appear in more
    than 98% or fewer than 0.5% of the documents, and filters ``stop_words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the text column to vectorize.
    stop_words : iterable of str, str or None, default ``full_stop_words``
        Stop words handed to ``TfidfVectorizer``.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform``. The fitted
    vectorizer itself is not returned.
    """
    # scikit-learn documents `stop_words` as 'english', a list, or None, and
    # newer releases reject other containers such as a set, so normalise here.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    tvec = TfidfVectorizer(max_df = 0.98, min_df = 0.005, ngram_range = (1,2), stop_words = stop_words)
    return tvec.fit_transform(df[column])

In [34]:
# TF-IDF document-term matrix of the lemmatized tokens, one per dataset.
# build_kmeans and build_dbscan call build_tvec themselves rather than taking these.
health_tvec = build_tvec(health, 'lemma_tokens')
obsgyn_tvec = build_tvec(obsgyn, 'lemma_tokens')
pospar_tvec = build_tvec(pospar, 'lemma_tokens')

## Modeling

### Model 1: TF-IDF Vectorizer with K-Means Clustering

#### Function

In [31]:
def build_kmeans(df, column, k, n_terms = 15, stop_words = full_stop_words):
    """Cluster one text column with TF-IDF + K-Means and print a summary.

    Prints the silhouette score and, for every cluster, the ``n_terms`` terms
    with the largest centroid weights (strongest first).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the text column to cluster.
    k : int
        Number of clusters.
    n_terms : int, default 15
        Number of top terms to print per cluster.
    stop_words : iterable of str, str or None, default ``full_stop_words``
        Stop words handed to ``TfidfVectorizer``.

    Returns
    -------
    sorted_centroids : numpy.ndarray
        One row per cluster; term indices ordered by descending centroid weight.
    terms : list
        Vocabulary of the fitted vectorizer; ``terms[i]`` names column ``i``.
    """
    # Build Vectorizer
    # The vectorizer is fitted here (same settings as build_tvec) because the
    # fitted object is needed below to map centroid columns back to terms;
    # build_tvec returns only the matrix.
    if stop_words is not None and not isinstance(stop_words, str):
        # scikit-learn documents `stop_words` as 'english', a list, or None.
        stop_words = list(stop_words)
    tvec = TfidfVectorizer(max_df = 0.98, min_df = 0.005, ngram_range = (1,2), stop_words = stop_words)
    dtm = tvec.fit_transform(df[column])

    # Build KMeans Model
    # A single fit_predict both fits the model and yields the labels.
    # `n_jobs` is not passed: it was deprecated and later removed from KMeans.
    kmeans = KMeans(n_clusters = k, random_state = 42)
    preds = kmeans.fit_predict(dtm)

    # Calculate Silhouette Score
    sil_score = silhouette_score(dtm, preds)

    # Show Cluster Descriptions
    # Code modified from # https://pythonprogramminglanguage.com/kmeans-text-clustering/
    print(f'Number of Clusters = {k}')
    print(f'Silhouette Score: {round(sil_score,3)}')
    print()
    print(f'Top {n_terms} terms per cluster:')
    print()

    # argsort is ascending, so reverse each row to put the heaviest terms first.
    sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

    # Newer scikit-learn exposes get_feature_names_out; fall back for older releases.
    if hasattr(tvec, 'get_feature_names_out'):
        terms = list(tvec.get_feature_names_out())
    else:
        terms = tvec.get_feature_names()

    for i in range(k):
        print(f'Cluster {i}:')
        for index in sorted_centroids[i, :n_terms]:
            print(f'{terms[index]}')
        print()

    return sorted_centroids, terms

#### Women's Health Data

In [10]:
# Cluster the women's health posts into 17 groups; keeps the sorted centroid indices and the vocabulary.
health_centroids, health_vocab = build_kmeans(health, 'lemma_tokens', k = 17)

Number of Clusters = 17
Silhouette Score: 0.019

Top 15 terms per cluster:

Cluster 0:
cyst
ovarian
ovarian cyst
pain
ovary
ultrasound
cm
left
surgery
doctor
not
right
experience
period
remove

Cluster 1:
hair
look
grow
dye
shampoo
wash
cut
long
not
shave
thick
product
try
dry
color

Cluster 2:
test
pregnancy
period
pregnancy test
negative
pregnant
late
not
sex
come
th
symptom
period late
test negative
come negative

Cluster 3:
pain
period
sex
not
doctor
cramp
hurt
bad
experience
pelvic
painful
low
start
sharp
help

Cluster 4:
smell
discharge
not
vagina
normal
white
vaginal
odor
sex
notice
infection
bv
brown
period
help

Cluster 5:
not
guy
work
people
thing
job
date
talk
relationship
life
tell
look
ask
try
good

Cluster 6:
period
start
bleed
not
normal
blood
cramp
cycle
heavy
spot
light
come
bleeding
late
sex

Cluster 7:
uti
pap
pee
smear
pap smear
antibiotic
doctor
test
not
sex
hpv
symptom
come
result
pain

Cluster 8:
iud
period
copper
copper iud
mirena
cramp
experience
string
inserti

#### Fertility and Pregnancy Data

In [11]:
# Cluster the fertility/pregnancy posts into 20 groups; keeps the sorted centroid indices and the vocabulary.
obsgyn_centroids, obsgyn_vocab = build_kmeans(obsgyn, 'lemma_tokens', k = 20)

Number of Clusters = 20
Silhouette Score: 0.012

Top 15 terms per cluster:

Cluster 0:
boy
girl
gender
baby
find
reveal
baby girl
baby boy
little
excited
not
husband
tell
scan
ultrasound

Cluster 1:
cycle
cd
opk
ovulation
ovulate
temp
positive
start
test
period
try
not
positive opk
lh
track

Cluster 2:
test
positive
pregnancy test
negative
pregnancy
line
period
result
blood
not
pregnant
faint
dpo
blood test
ovulation

Cluster 3:
birth
labor
hospital
section
induce
induction
baby
not
experience
delivery
plan
epidural
doctor
need
schedule

Cluster 4:
pregnant
not
find
pregnancy
find pregnant
try
baby
woman
tell
help
look
pregnant woman
advice
experience
start

Cluster 5:
pain
bad
not
hurt
low
experience
hip
cramp
help
right
pregnancy
sharp
walk
normal
pelvic

Cluster 6:
not
help
try
look
start
good
today
pregnancy
need
find
thank
experience
thing
little
baby

Cluster 7:
sex
pregnant
not
husband
pregnancy
try
period
unprotected
baby
condom
ovulation
chance
drive
start
tell

Cluster 8:
sho

#### Postpartum Data

In [12]:
# Cluster the postpartum posts into 20 groups; keeps the sorted centroid indices and the vocabulary.
# NOTE(review): `pospar_covab` looks like a typo for `pospar_vocab` (cf. health_vocab / obsgyn_vocab) —
# rename once it is confirmed that no other cell relies on the current spelling.
pospar_centroids, pospar_covab = build_kmeans(pospar, 'lemma_tokens', k = 20)

Number of Clusters = 20
Silhouette Score: 0.012

Top 15 terms per cluster:

Cluster 0:
seat
stroller
car
car seat
baby
buy
infant
travel
need
system
registry
use
bassinet
double
look

Cluster 1:
breastfeed
nurse
nursing
wean
old
boob
not
feed
baby
stop
bra
start
try
bite
breast

Cluster 2:
test
period
positive
pregnancy
pregnancy test
negative
pregnant
blood
result
not
glucose
doctor
come
late
blood test

Cluster 3:
milk
supply
breast
pump
feed
formula
baby
breast milk
not
oz
breastfeed
supplement
old
try
feeding

Cluster 4:
baby
not
tell
husband
family
mom
work
friend
thing
people
child
come
need
parent
pregnant

Cluster 5:
nipple
latch
shield
nipple shield
feed
breast
pump
try
breastfeed
old
tie
not
pain
baby
milk

Cluster 6:
bottle
feed
pump
milk
breast
baby
try
work
oz
old
feeding
breastfeed
not
formula
nurse

Cluster 7:
weight
gain
weight gain
eat
gain weight
lose
lbs
pound
lose weight
not
pregnancy
baby
lb
weigh
healthy

Cluster 8:
boy
gender
girl
baby
find
reveal
not
baby boy
hu

### Model 2: TF-IDF Vectorizer with DBSCAN

#### Function

In [35]:
def build_dbscan(df, column, epsilon, samples):
    """Run DBSCAN on the TF-IDF matrix of one text column and count the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the text column to cluster.
    epsilon : float
        Passed to ``DBSCAN`` as ``eps``.
    samples : int
        Passed to ``DBSCAN`` as ``min_samples``.

    Returns
    -------
    int
        Number of distinct values in ``labels_``. Per scikit-learn's DBSCAN
        documentation noise points are labelled -1, so when noise is present
        that label is included in the count.
    """
    # Vectorize the text column
    features = build_tvec(df, column)

    # Fit DBSCAN; fit() returns the estimator itself
    clusterer = DBSCAN(eps = epsilon, min_samples = samples).fit(features)

    distinct_labels = set(clusterer.labels_)
    return len(distinct_labels)

#### Women's Health Data

In [None]:
# DBSCAN on the women's health posts with eps = 0.5 and min_samples = 10; the cell shows the label count.
build_dbscan(health, 'lemma_tokens', 0.5, 10)

#### Fertility and Pregnancy Data

#### Postpartum Data

### Model 3: Latent Semantic Analysis (LSA) with SciKit Learn

#### Women's Health Data

#### Fertility and Pregnancy Data

#### Postpartum Data

### Model 4: Latent Dirichlet Allocation (LDA) with SciKit Learn

#### Function to Build LDA Model

In [13]:
def build_lda(df, column, n_topics, stop_words = full_stop_words, n_terms = 15):
    """Fit an LDA topic model on one text column and print a summary.

    Prints the log-likelihood and perplexity computed on the training matrix,
    then the ``n_terms`` highest-weighted terms of every topic. Terms are
    listed in ascending weight, so the strongest term comes last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the text column to model.
    n_topics : int
        Number of LDA components.
    stop_words : iterable of str, str or None, default ``full_stop_words``
        Stop words handed to ``TfidfVectorizer``.
    n_terms : int, default 15
        Number of top terms printed per topic.

    Returns
    -------
    LDA : LatentDirichletAllocation
        The fitted model.
    dtm
        The TF-IDF document-term matrix the model was fitted on.
    tvec : TfidfVectorizer
        The fitted vectorizer.

    The three return values are what ``pyLDAvis.sklearn.prepare`` takes, so
    the visualization can be produced by the caller in its own cell.
    """
    # scikit-learn documents `stop_words` as 'english', a list, or None.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    # NOTE(review): the model is fitted on TF-IDF weights rather than raw
    # counts — confirm this is intended (CountVectorizer is imported but unused here).
    tvec = TfidfVectorizer(max_df = 0.98, min_df = 0.005, ngram_range = (1,2), stop_words = stop_words)
    dtm = tvec.fit_transform(df[column])

    LDA = LatentDirichletAllocation(n_components = n_topics, random_state = 42, n_jobs = -1)
    LDA.fit(dtm)

    # Newer scikit-learn exposes get_feature_names_out; fall back for older releases.
    if hasattr(tvec, 'get_feature_names_out'):
        vocab = list(tvec.get_feature_names_out())
    else:
        vocab = tvec.get_feature_names()
    topics = LDA.components_

    print(f'The number of topics is {n_topics}.')
    print(f'Log-Likelihood Score: {round(LDA.score(dtm), 3)}')
    print(f'Perplexity Score: {round(LDA.perplexity(dtm), 3)}')
    print('\n')

    for index, topic in enumerate(topics):
        print(f'The top {n_terms} words for topic {index}')
        # argsort is ascending: the last n_terms positions hold the heaviest terms.
        print([vocab[term_index] for term_index in topic.argsort()[-n_terms:]])
        print('\n')

    # The visualization is not prepared here: its result was previously
    # discarded, so the call only cost time. Callers render it themselves.
    return LDA, dtm, tvec

In [14]:
def grid_search_sk_lda(df, column, n_topics, step_size, stop_words = full_stop_words):
    """Fit and report LDA models for a range of topic counts.

    Calls ``build_lda`` once for every topic count in
    ``step_size, 2 * step_size, ...`` up to and including ``n_topics``,
    printing a separator line after each run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the text column to model.
    n_topics : int
        Largest topic count to try.
    step_size : int
        Increment between successive topic counts; also the first count tried.
    stop_words : iterable of str, default ``full_stop_words``
        Stop words forwarded to ``build_lda``.

    Returns
    -------
    None
    """
    # Start at step_size rather than 0: a model cannot have zero topics.
    for topic_count in range(step_size, n_topics + 1, step_size):
        # Use the loop value and the caller's stop words, not n_topics or the global set.
        build_lda(df, column, topic_count, stop_words = stop_words)
        print('- - - - - - - - - - - - - - - - - -')

#### Grid Search Num Topics for General Health Data

In [15]:
# Fit a 17-topic LDA model on the women's health posts.
# `model`, `dtm` and `vectorizer` are reused by the fertility/pregnancy cell further down.
model, dtm, vectorizer = build_lda(health, 'lemma_tokens', 17)

The number of topics is 17.
Log-Likelihood Score: -1142357.126
Perplexity Score: 2350.653


The top 15 words for topic 0
['wet', 'enjoy', 'ob', 'not', 'sexual', 'bf', 'cum', 'girl', 'partner', 'advice', 'relationship', 'masturbate', 'boyfriend', 'sex', 'orgasm']


The top 15 words for topic 1
['try', 'notice', 'look', 'lump', 'inside', 'finger', 'painful', 'bump', 'cyst', 'pain', 'hurt', 'not', 'sex', 'labia', 'vagina']


The top 15 words for topic 2
['burn', 'vagina', 'smell', 'doctor', 'symptom', 'pee', 'sex', 'not', 'antibiotic', 'bv', 'discharge', 'uti', 'yeast infection', 'yeast', 'infection']


The top 15 words for topic 3
['start', 'dark', 'not', 'ovulation', 'heavy', 'irregular', 'brown discharge', 'spot', 'light', 'cycle', 'blood', 'normal', 'brown', 'discharge', 'period']


The top 15 words for topic 4
['area', 'ingrown', 'bikini', 'not', 'clothe', 'jean', 'razor', 'look', 'leg', 'buy', 'wax', 'hair', 'dress', 'shave', 'wear']


The top 15 words for topic 5
['wear', 'heavy fl

In [16]:
# Build the pyLDAvis view from the most recently fitted model; as the cell's last expression it is displayed.
pyLDAvis.sklearn.prepare(model, dtm, vectorizer)

#### Grid Search Num Topics for Fertility and Pregnancy

In [17]:
# Fit a 15-topic LDA model on the fertility/pregnancy posts.
# This overwrites the `model`, `dtm` and `vectorizer` assigned for the women's health data.
model, dtm, vectorizer = build_lda(obsgyn, 'lemma_tokens', 15)

The number of topics is 15.
Log-Likelihood Score: -3402096.962
Perplexity Score: 2007.747


The top 15 words for topic 0
['thing', 'find', 'happy', 'cry', 'excited', 'family', 'people', 'pregnancy', 'try', 'friend', 'husband', 'baby', 'not', 'tell', 'pregnant']


The top 15 words for topic 1
['ask', 'anatomy scan', 'anatomy', 'tell', 'risk', 'measure', 'result', 'not', 'test', 'ob', 'baby', 'doctor', 'appointment', 'ultrasound', 'scan']


The top 15 words for topic 2
['look', 'husband', 'group', 'not', 'pregnant', 'baby girl', 'excited', 'tic', 'little', 'love', 'find', 'gender', 'baby', 'boy', 'girl']


The top 15 words for topic 3
['delivery', 'nurse', 'not', 'come', 'cm', 'push', 'pain', 'hour', 'baby', 'section', 'epidural', 'hospital', 'labor', 'contraction', 'birth']


The top 15 words for topic 4
['control', 'pill', 'husband', 'birth control', 'sperm', 'sex', 'period', 'not', 'conceive', 'start', 'pregnant', 'fertility', 'cycle', 'ttc', 'try']


The top 15 words for topic 5
['gl

In [18]:
# Build the pyLDAvis view from the most recently fitted model; as the cell's last expression it is displayed.
pyLDAvis.sklearn.prepare(model, dtm, vectorizer)

#### Grid Search Num Topics for Postpartum Data

In [19]:
# build_lda(pospar, 'lemma_tokens', n_topics = 30)

### LSA with Scikit Learn

In [20]:
# https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python

In [21]:
# lsa_tvec = TfidfVectorizer(min_df = 0.001, ngram_range = (1,2), stop_words = full_stop_words)
# X = lsa_tvec.fit_transform(health['lemma_tokens'])
# X.shape

In [22]:
# svd_model = TruncatedSVD(n_components = 10, random_state = 42)
# svd_model.fit(X)

In [23]:
# terms = lsa_tvec.get_feature_names()

# for i, comp in enumerate(svd_model.components_):
#     terms_comp = zip(terms, comp)
#     sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:15]
#     print()
#     print("Topic "+str(i)+": ")
#     for t in sorted_terms:
#         print(t[0])


In [24]:
# pospar['subreddit'].value_counts()

In [25]:
# len(pospar) * 0.005