# Topic Modeling

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit Learn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import LsiModel
from gensim.models import CoherenceModel

# spaCy
import spacy

# nltk
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer

## pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()


# Silence Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load Text Libraries


### Load Corpora from spaCy and gensim

In [2]:
nlp = spacy.load('en_core_web_sm')

### Stop Words

In [3]:
def add_stop_words(word_list, list_stop_words):
    for word in word_list:
        list_stop_words.add(word)
        
def remove_stop_words(word_list, list_stop_words):
    for word in word_list:
        list_stop_words.remove(word)

#### Load Stop Words from NLTK and spaCy

In [4]:
stop_words_spacy = set(nlp.Defaults.stop_words)
stop_words_nltk = set(stopwords.words('english'))
full_stop_words = stop_words_spacy.union(stop_words_nltk)

#### Modify Stop Words Based on EDA

In [5]:
words_to_add = ['like', 'know', 'want', 'feel', 'going', 'think', 'reddit', 'imgur', 'pron', 'officially', 'story', 'month', 'week', 'time', 'day', 'year']

add_stop_words(words_to_add, full_stop_words)
remove_stop_words(['not'], full_stop_words)

## Read-In Data

In [23]:
health = pd.read_csv('../data/womens_health_preprocessed.csv', lineterminator='\n')
obsgyn = pd.read_csv('../data/fertility_and_pregnancy_preprocessed.csv')
pospar = pd.read_csv('../data/postpartum_preprocessed.csv')

## Modeling

### Model 1: LDA with Scikit Learn

#### Remove Digits from Text
After initial LDA models, it became clear that, while some numbers may occur frequently in posts, they do not add a lot of value to deriving meaning from the topics. Therefore, digits will be removed from the text.

In [24]:
def remove_digits(df, column1, column2):
    df[column1] = df[column1].str.replace('[0-9]+', '', regex = True)
    df[column2] = df[column2].str.replace('[0-9]+', '', regex = True)
    
# https://stackoverflow.com/questions/47010044/how-to-remove-numeric-characters-present-in-countvectorizer

dfs = [health, obsgyn, pospar]

for df in dfs:
    remove_digits(df, 'total_text', 'lemma_tokens')

In [9]:
# def remove_phrase(df, column):
#     df[column] = df[column].str.replace([deleted]', '')

# remove_phrase(obsgyn, 'lemma_tokens')

#### Function to Build LDA Model

In [10]:
def build_lda(df, column, n_topics, stop_words = full_stop_words):
    tvec = TfidfVectorizer(max_df = 0.98, min_df = 0.005, ngram_range = (1,2), stop_words = stop_words)
    dtm = tvec.fit_transform(df[column])
    
    LDA = LatentDirichletAllocation(n_components = n_topics, random_state = 42, n_jobs = -1)
    LDA.fit(dtm)
    
    vocab = tvec.get_feature_names()
    topics = LDA.components_
    
    print(f'The number of topics is {n_topics}.')
    print(f'Log-Likelihood Score: {round(LDA.score(dtm), 3)}')
    print(f'Perplexity Score: {round(LDA.perplexity(dtm), 3)}')
    print('\n')
    
    for index, topic in enumerate(topics):
        print(f'The top 15 words for topic {index}')
        print([vocab[index] for index in topic.argsort()[-15:]])
        print('\n')
        
    pyLDAvis.sklearn.prepare(LDA, dtm, tvec)
    
    return LDA, dtm, tvec

In [11]:
def grid_search_sk_lda(df, column, n_topics, step_size, stop_words = full_stop_words):
    for i in range(0, n_topics+1, step_size):
        build_lda(df, column, n_topics, stop_words = full_stop_words)
        print('- - - - - - - - - - - - - - - - - -')

#### Grid Search Num Topics for General Health Data

In [12]:
model, dtm, vectorizer = build_lda(health, 'lemma_tokens', 17)

The number of topics is 17.
Log-Likelihood Score: -1144294.513
Perplexity Score: 2371.087


The top 15 words for topic 0
['view', 'hormone', 'surgery', 'woman', 'pcos', 'uterus', 'cm', 'estrogen', 'menstrual cycle', 'progesterone', 'cycle', 'fibroid', 'tube', 'menstrual', 'menopause']


The top 15 words for topic 1
['experience', 'surgery', 'sharp', 'right', 'doctor', 'pelvic', 'left', 'ultrasound', 'ovarian cyst', 'breast', 'ovary', 'ovarian', 'delete', 'cyst', 'pain']


The top 15 words for topic 2
['painful', 'skin', 'look', 'small', 'notice', 'lump', 'bra', 'boob', 'nipple', 'bump', 'hurt', 'not', 'labia', 'breast', 'vagina']


The top 15 words for topic 3
['travel', 'ex', 'home', 'bag', 'deal', 'love', 'work', 'ring', 'help', 'favorite', 'guy', 'sleep', 'app', 'date', 'tip']


The top 15 words for topic 4
['pound', 'weight gain', 'not', 'fat', 'gain weight', 'food', 'lose weight', 'exercise', 'healthy', 'diet', 'lose', 'body', 'gain', 'eat', 'weight']


The top 15 words for topic 

In [13]:
pyLDAvis.sklearn.prepare(model, dtm, vectorizer)

#### Grid Search Num Topics for Fertility and Pregnancy

In [25]:
model, dtm, vectorizer = build_lda(obsgyn, 'lemma_tokens', 30)

KeyboardInterrupt: 

In [15]:
pyLDAvis.sklearn.prepare(model, dtm, vectorizer)

#### Grid Search Num Topics for Pospartum Data

In [16]:
build_lda(pospar, 'lemma_tokens', n_topics = 30)

The number of topics is 30.
Log-Likelihood Score: -1989621.426
Perplexity Score: 3108.972


The top 15 words for topic 0
['normal', 'doctor', 'test', 'hcg', 'urine', 'headache', 'level', 'preeclampsia', 'symptom', 'bp', 'high', 'pregnancy', 'pressure', 'blood pressure', 'blood']


The top 15 words for topic 1
['normal', 'pelvic', 'belly', 'braxton hicks', 'hicks', 'low', 'contraction', 'bad', 'pregnancy', 'hurt', 'experience', 'braxton', 'not', 'cramp', 'pain']


The top 15 words for topic 2
['lactation', 'pump work', 'nipple shield', 'pumping', 'use', 'insurance', 'work', 'shield', 'flange', 'nipple', 'medela', 'breast', 'breast pump', 'spectra', 'pump']


The top 15 words for topic 3
['research', 'health', 'baby', 'woman', 'find', 'ob', 'book', 'experience', 'doctor', 'pregnant', 'not', 'class', 'pregnancy', 'hospital', 'birth']


The top 15 words for topic 4
['excited', 'depression', 'try', 'tell', 'find', 'thing', 'child', 'anxiety', 'husband', 'life', 'love', 'pregnancy', 'not', '

(LatentDirichletAllocation(n_components=30, n_jobs=-1, random_state=42),
 <49094x1749 sparse matrix of type '<class 'numpy.float64'>'
 	with 1856873 stored elements in Compressed Sparse Row format>,
 TfidfVectorizer(max_df=0.98, min_df=0.005, ngram_range=(1, 2),
                 stop_words={"'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about',
                             'above', 'across', 'after', 'afterwards', 'again',
                             'against', 'ain', 'all', 'almost', 'alone', 'along',
                             'already', 'also', 'although', 'always', 'am',
                             'among', 'amongst', 'amount', 'an', 'and',
                             'another', ...}))

### LSA with Gensim

In [17]:
# https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python

In [18]:
lsa_tvec = TfidfVectorizer(min_df = 0.001, ngram_range = (1,2), stop_words = full_stop_words)
X = lsa_tvec.fit_transform(health['lemma_tokens'])
X.shape

(30616, 6724)

In [19]:
svd_model = TruncatedSVD(n_components = 10, random_state = 42)
svd_model.fit(X)

TruncatedSVD(n_components=10, random_state=42)

In [20]:
terms = lsa_tvec.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:15]
    print()
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])



Topic 0: 
period
not
start
pain
pill
sex
control
help
birth
birth control
come
try
doctor
look
normal

Topic 1: 
period
pill
birth control
birth
control
bleed
cramp
pregnancy
pregnant
cycle
iud
blood
test
normal
spot

Topic 2: 
infection
yeast
yeast infection
discharge
uti
vagina
bv
smell
pain
sex
antibiotic
symptom
doctor
vaginal
test

Topic 3: 
hair
shave
yeast
infection
yeast infection
wax
wash
period
grow
dye
birth control
birth
pill
control
shampoo

Topic 4: 
birth control
control
birth
pill
yeast
yeast infection
infection
control pill
friend
pack
effect
work
good
new
start birth

Topic 5: 
pain
cyst
iud
birth
birth control
control
doctor
breast
ovarian
hurt
ovarian cyst
ovary
painful
ultrasound
experience

Topic 6: 
wear
bra
dress
look
breast
size
clothe
smell
buy
boob
makeup
jean
fit
skin
cup

Topic 7: 
iud
period
friend
pain
yeast
yeast infection
infection
cramp
cyst
work
copper
bad
heavy
copper iud
experience

Topic 8: 
iud
sex
bleed
smell
copper
copper iud
mirena
vagina
disc

In [21]:
pospar['subreddit'].value_counts()

BabyBumps               33280
breastfeeding           15593
postpartumdepression      221
Name: subreddit, dtype: int64

In [22]:
len(pospar) * 0.005

245.47