# LDA topic analysis

## Loading the data

In [6]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Read the coding table and, in the same expression, discard every row whose
# `abstract_wos` is missing; the abstract text is what the later cells tokenize.
df = pd.read_csv(r'../../gen/analysis/temp/coding.csv').dropna(subset=['abstract_wos'])

In [8]:
df.head()

Unnamed: 0,doi,paper,tc,tcperyear,ntc,year,journal,keywords,abstract_wos,nauthors,...,"author_other (e.g., teaching)",sum authors,verified,adone,special issue,marked_for_review,added date,check_hd,web,is_aggregator
0,,"ESCALAS JE, 2004, J CONSUM PSYCHOL-a",537,28.263158,2.98611,2004,JCP,,"IN THIS RESEARCH, WE INVESTIGATE CONSUMERS' MO...",,...,,,,,,,,,False,
1,,"HOMBURG C, 2005, J MARK",433,24.055556,2.899793,2004,JCP,,"IN THIS RESEARCH, WE INVESTIGATE CONSUMERS' MO...",,...,,,,,,,,,False,
2,,"READ S, 2009, J MARK",250,17.857143,2.734648,2004,JCP,,"IN THIS RESEARCH, WE INVESTIGATE CONSUMERS' MO...",,...,,,,,,,,,False,
3,,"HUMPHREYS A, 2010, J MARK",234,18.0,2.248611,2004,JCP,,"IN THIS RESEARCH, WE INVESTIGATE CONSUMERS' MO...",,...,,,,,,,,,False,
4,,"PHAM MT, 2004, J CONSUM PSYCHOL",195,10.263158,1.084342,2004,JCP,,"IN THIS RESEARCH, WE INVESTIGATE CONSUMERS' MO...",,...,,,,,,,,,False,


In [9]:
df.shape

(4707, 62)

# Data cleaning

In [11]:
# Take the abstracts as a plain Python list and scrub each one in three passes,
# applied innermost-first:
#   1. remove e-mail-like tokens (any run containing "@", plus one trailing whitespace char)
#   2. collapse every whitespace run (newlines included) into a single space
#   3. strip single-quote characters
data = [
    re.sub(r"\'", "", re.sub(r'\s+', ' ', re.sub(r'\S*@\S*\s?', '', abstract)))
    for abstract in df.abstract_wos.values.tolist()
]

In [12]:
len(data)

4707

In [13]:
type(data)

list

### Tokenization

In [14]:
def sent_to_words(sentences):
    """Lazily tokenize each document with ``gensim.utils.simple_preprocess``.

    Every item of ``sentences`` is coerced to ``str`` before tokenizing.
    ``deacc=True`` asks gensim to strip accent marks from the tokens.
    Yields one list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )


# Materialise the generator so the token lists can be indexed and re-used.
data_words = list(sent_to_words(data))

In [15]:
type(data_words)

list

In [16]:
len(data_words)

4707

In [17]:
# Words that are removed from every tokenized abstract before lemmatization.
avoid_these_words = [
    'article', 'articles',
    'author', 'authors',
    'study', 'studied', 'studies', 'studying',
]

In [18]:
# Remove the words listed in `avoid_these_words` from each tokenized abstract,
# preserving token order and keeping one (possibly empty) list per abstract.
# NOTE: the name `all` shadows Python's built-in all(); it is kept unchanged
# because later cells read this variable.
all = [
    [token for token in tokens if token not in avoid_these_words]
    for tokens in data_words
]
    

In [19]:
len(all[0])

168

In [22]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp_pipeline=None):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    texts : iterable of list of str
        One token list per document. The tokens are joined with single spaces
        and passed through the pipeline.
    allowed_postags : iterable of str
        Coarse POS tags (``token.pos_`` values) to keep. An immutable tuple is
        used as the default so the default can never be mutated between calls.
    nlp_pipeline : callable, optional
        Callable mapping a string to an iterable of tokens exposing ``pos_``
        and ``lemma_``. When omitted, the notebook-level ``nlp`` object is
        used, so ``nlp`` must exist by the time this function is called.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
    """
    # Resolve the pipeline lazily: `nlp` is only looked up when no pipeline is passed.
    pipeline = nlp if nlp_pipeline is None else nlp_pipeline
    allowed = set(allowed_postags)
    texts_out = []
    for sent in texts:
        doc = pipeline(" ".join(sent))
        # Skip the '-PRON-' placeholder lemma outright instead of emitting an
        # empty string for it, which left stray double spaces in the output.
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed and token.lemma_ != '-PRON-'
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out

In [23]:
# Load spaCy's small English pipeline with the dependency parser and NER switched
# off; `lemmatization` only reads each token's lemma and coarse POS tag.
# If the model is not installed, run in a terminal: python -m spacy download en_core_web_sm
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)
# Lemmatize every filtered abstract, keeping nouns and verbs only.
data_lemmatized = lemmatization(
    all,
    allowed_postags=['NOUN', 'VERB'],
)

### Document-Word matrix

In [24]:
# Bag-of-words settings for the lemmatized abstracts.
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=10,                         # ignore terms that appear in fewer than 10 documents
    stop_words='english',              # drop scikit-learn's built-in English stop words
    lowercase=True,                    # lower-case the text before tokenizing
    token_pattern='[a-zA-Z0-9]{3,}',   # a token is a run of 3 or more alphanumeric characters
    # max_features=50000,              # optional cap on the vocabulary size (disabled)
)
# Learn the vocabulary and build the document-term count matrix in one step.
data_vectorized = vectorizer.fit_transform(data_lemmatized)

## Build LDA model

with sklearn

In [28]:
# Baseline LDA model with a fixed number of topics.
lda_model = LatentDirichletAllocation(
    n_components=6,             # number of topics
    max_iter=10,                # maximum number of learning iterations
    learning_method='online',   # mini-batch (online) updates
    random_state=100,           # fixed seed so the fit is repeatable
    batch_size=128,             # documents used in each learning iteration
    evaluate_every=-1,          # -1: never compute perplexity while fitting
    n_jobs=-1,                  # use all available CPUs
)
# Fit the model and keep the resulting document-topic matrix.
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=6, n_jobs=-1,
                          random_state=100)


## Diagnosing model performance with perplexity and log-likelihood

In [29]:
# Log-likelihood of the document-term matrix under the baseline model: higher is better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# Pretty-print every hyper-parameter of the fitted baseline model
pprint(lda_model.get_params())

Log Likelihood:  -2001696.8245067187
Perplexity:  678.0125600111246
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 6,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


### Using GridSearch
to determine the best LDA model

In [None]:
# Hyper-parameter grid: 9 topic counts x 7 learning-decay values = 63 candidates.
search_params = {
    'n_components': list(range(4, 13)),
    'learning_decay': [.3, .4, .5, .6, .7, .8, .9],
}
# Base estimator; the grid supplies n_components and learning_decay per candidate.
lda = LatentDirichletAllocation(
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
# Cross-validated search. No `scoring` is passed, so candidates are ranked by
# the estimator's own score() method.
model = GridSearchCV(lda, param_grid=search_params)
# Fit every candidate on the document-term matrix.
model.fit(data_vectorized)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-no

In [68]:
# Best Model: the estimator the grid search selected
best_lda_model = model.best_estimator_
# Model Parameters: the winning combination from `search_params`
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score: mean cross-validated score of the best candidate. It is
# computed on held-out folds, so it is not on the same scale as score() on the full matrix.
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity of the best model on the full document-term matrix
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

  and should_run_async(code)


Best Model's Params:  {'learning_decay': 0.6, 'n_components': 4}
Best Log Likelihood Score:  -415581.8654101815
Model Perplexity:  677.2712870514948


### Extract dominant topic

In [70]:
# Document-topic matrix from the tuned model: one row per document, one column per topic.
lda_output = best_lda_model.transform(data_vectorized)
# Column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# Index names
docnames = ["Doc" + str(i) for i in range(len(data))]
# Build the table; weights are rounded to two decimals for presentation only.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Pick the dominant topic from the UNROUNDED weights. After rounding, two
# near-equal topics can tie (e.g. 0.504 vs 0.496 both become 0.5) and argmax
# would then silently return the lower topic index.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return a CSS ``color`` rule: green when ``val`` is above 0.1, black otherwise."""
    if val > .1:
        shade = 'green'
    else:
        shade = 'black'
    return f'color: {shade}'
def make_bold(val):
    """Return a CSS ``font-weight`` rule: 700 when ``val`` is above 0.1, 400 otherwise."""
    if val > .1:
        return 'font-weight: 700'
    return 'font-weight: 400'

# Style the first 15 documents. Both helpers run cell-by-cell over every column
# of the slice, including `dominant_topic`.
df_document_topics = (
    df_document_topic
    .head(15)
    .style
    .applymap(color_green)
    .applymap(make_bold)
)
df_document_topics

  and should_run_async(code)


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,dominant_topic
Doc0,0.01,0.9,0.09,0.01,1
Doc1,0.98,0.01,0.01,0.01,0
Doc2,0.13,0.85,0.01,0.01,1
Doc3,0.92,0.07,0.01,0.01,0
Doc4,0.11,0.64,0.25,0.01,1
Doc5,0.74,0.09,0.01,0.17,0
Doc6,0.47,0.53,0.0,0.0,1
Doc7,0.3,0.69,0.01,0.01,1
Doc8,0.27,0.72,0.0,0.0,1
Doc9,0.48,0.39,0.12,0.01,0


In [73]:
df_document_topic.head()

  and should_run_async(code)


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,dominant_topic
Doc0,0.01,0.9,0.09,0.01,1
Doc1,0.98,0.01,0.01,0.01,0
Doc2,0.13,0.85,0.01,0.01,1
Doc3,0.92,0.07,0.01,0.01,0
Doc4,0.11,0.64,0.25,0.01,1


In [74]:
# Persist the document-topic table. index=False leaves the "DocN" row labels out
# of the file, so rows are identifiable only by their position.
df_document_topic.to_csv(
    path_or_buf="../../gen/analysis/temp/lda_analysis.csv",
    index=False,
)

  and should_run_async(code)


In [75]:
#top_30_keywords_for_each_topic!

  and should_run_async(code)


In [76]:
# Show top n keywords for each topic
def show_topics(vectorizer=None, lda_model=None, n_words=30):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer, optional
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``. When omitted, the notebook-level
        ``vectorizer`` is looked up at call time.
    lda_model : fitted topic model, optional
        Must expose ``components_`` with one row of term weights per topic.
        When omitted, the notebook-level ``lda_model`` is looked up at call time.
    n_words : int
        Number of top terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from highest to lowest weight.
    """
    # Resolve defaults when called rather than when defined, so a re-fitted
    # global is never shadowed by a stale object captured at definition time.
    if vectorizer is None:
        vectorizer = globals()['vectorizer']
    if lda_model is None:
        lda_model = globals()['lda_model']

    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new method and fall back to the old one.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    keywords = np.asarray(feature_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so that argsort's ascending order yields the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
# Top 30 keywords per topic for the tuned model.
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=30)
# Topic-keyword table: one row per topic, one column per keyword rank.
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [f'Word {rank}' for rank in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = [f'Topic {topic}' for topic in range(df_topic_keywords.shape[0])]
df_topic_keywords


  and should_run_async(code)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,...,Word 20,Word 21,Word 22,Word 23,Word 24,Word 25,Word 26,Word 27,Word 28,Word 29
Topic 0,product,consumer,effect,choice,preference,decision,information,experiment,research,influence,...,purchase,brand,finding,experience,base,people,lead,level,perceive,propose
Topic 1,consumer,self,research,consumption,behavior,brand,effect,experience,goal,people,...,result,role,theory,time,perceive,suggest,provide,emotion,status,group
Topic 2,customer,model,firm,brand,use,marketing,market,datum,effect,advertising,...,develop,provide,impact,research,approach,manager,increase,estimate,network,method
Topic 3,price,consumer,retailer,cost,increase,store,purchase,pricing,effect,search,...,pay,quality,seller,policy,food,time,competition,decrease,incentive,sale


In [77]:
# Flip the table so topics become columns and keyword ranks become rows.
# NOTE: this rebinds the same name, so running the cell a second time flips the
# table back; run it exactly once after the table is built.
df_topic_keywords = df_topic_keywords.T

  and should_run_async(code)


In [78]:
df_topic_keywords

  and should_run_async(code)


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3
Word 0,product,consumer,customer,price
Word 1,consumer,self,model,consumer
Word 2,effect,research,firm,retailer
Word 3,choice,consumption,brand,cost
Word 4,preference,behavior,use,increase
Word 5,decision,brand,marketing,store
Word 6,information,effect,market,purchase
Word 7,experiment,experience,datum,pricing
Word 8,research,goal,effect,effect
Word 9,influence,people,advertising,search


In [79]:
# Persist the keyword table. index=False leaves the row labels out of the file,
# so only the column headers are written alongside the keywords.
df_topic_keywords.to_csv(
    path_or_buf='../../gen/analysis/temp/top_keywords.csv',
    index=False,
)

  and should_run_async(code)
