# All PPTs

In [97]:
import nltk

# Fetch the NLTK stop-word corpus (a no-op when it is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/Owner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [98]:
!pip install --upgrade gensim
!pip install -U spacy
!pip install -U pip setuptools wheel
!pip install pyldavis



In [99]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     |████████████████████████████████| 13.9 MB 745 kB/s            
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [100]:
!pip install pyLDAvis==3.2.1



In [101]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
import gensim.models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import logging
# Configure root logging: timestamped format, only ERROR and above are shown.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
# Suppress every DeprecationWarning for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [102]:
# NLTK English stop words, plus a few extra tokens that should also be dropped
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [103]:
# Load the transcript and drop every row whose 'Person' is 'Interviewer'.
# NOTE(review): `all` shadows Python's builtin all(); the name is kept because
# later cells read it.
all = pd.read_csv('User Needs Assessments Data.csv').loc[
    lambda frame: frame['Person'] != 'Interviewer'
]

In [104]:
# Tokenize each answer and clean the text
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Every item is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)  # deacc=True strips accent marks
        for text in sentences
    )

# Materialize the tokenized answers (one token list per row)
data_words = [*sent_to_words(all['Question/Answer'])]

# Show the first tokenized answer
print(data_words[:1])

[['bought', 'the', 'place', 'years', 'ago', 'but', 'it', 'still', 'under', 'renovation']]


In [105]:
# Train the bigram detector (a higher threshold yields fewer phrases) and wrap
# it in a Phraser, which is the faster way to apply a trained model.
bigram = gensim.models.phrases.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram detector is trained on the bigram-transformed token stream.
trigram = gensim.models.phrases.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: first document after bigram + trigram merging
print(trigram_mod[bigram_mod[data_words[0]]])

['bought', 'the', 'place', 'years', 'ago', 'but', 'it', 'still', 'under', 'renovation']


In [106]:
# Helper functions for stop-word removal, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document with simple_preprocess and drop tokens listed in `stop_words`."""
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the notebook-global `bigram_mod` phraser to every document in `texts`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` to every document in `texts`."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize token lists, keeping only tokens whose POS tag is allowed.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists (one list per document).
        allowed_postags: collection of coarse POS tags (token.pos_) to keep.
            The default is an immutable tuple, not a shared mutable list.

    Returns:
        A list holding one list of lemmas per input document.

    Uses the notebook-global spaCy pipeline `nlp`, which must be loaded
    before this function is called.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream (faster than one
    # nlp() call per document) and yields the results in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

In [107]:
# Load the small English spaCy pipeline without the parser and NER components
# (the model is installed with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word removal, then bigram merging
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(data_lemmatized[:1])

[['buy', 'place', 'year', 'ago', 'still', 'renovation']]


In [108]:
# The corpus texts are the lemmatized documents
texts = data_lemmatized

# Dictionary mapping each unique token to an integer id
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


In [109]:
# Look up the token that the dictionary maps to id 0
id2word[0]

'ago'

In [110]:
# Human-readable view of the first document's bag of words: (token, count) pairs
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('ago', 1),
  ('buy', 1),
  ('place', 1),
  ('renovation', 1),
  ('still', 1),
  ('year', 1)]]

In [111]:
# Train the LDA topic model
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [112]:
# Print the top keywords per topic (print_topics() defaults: up to 20 topics, 10 words each;
# the model is trained with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; doc_lda is not read by any cell visible in this section
doc_lda = lda_model[corpus]

[(0,
  '0.100*"know" + 0.072*"go" + 0.053*"think" + 0.045*"online" + '
  '0.043*"service" + 0.036*"maybe" + 0.035*"mobile" + 0.033*"mean" + '
  '0.033*"thing" + 0.026*"example"'),
 (1,
  '0.071*"result" + 0.053*"come" + 0.029*"actually" + 0.025*"inaudible" + '
  '0.021*"inform" + 0.021*"term" + 0.020*"call" + 0.016*"website" + '
  '0.015*"know" + 0.015*"today"'),
 (2,
  '0.130*"person" + 0.090*"change" + 0.087*"really" + 0.080*"apply" + '
  '0.075*"department" + 0.065*"actually" + 0.063*"go" + 0.062*"even" + '
  '0.043*"question" + 0.041*"register"'),
 (3,
  '0.152*"laugh" + 0.127*"receive" + 0.112*"much" + 0.035*"term" + 0.030*"let" '
  '+ 0.024*"site" + 0.018*"different" + 0.018*"whatnot" + 0.013*"electricity" '
  '+ 0.013*"water"'),
 (4,
  '0.084*"sale" + 0.026*"eco" + 0.026*"contribution" + 0.026*"visit" + '
  '0.023*"half" + 0.017*"ehm" + 0.017*"band" + 0.017*"unit" + '
  '0.017*"government" + 0.017*"usually"'),
 (5,
  '0.077*"arm" + 0.076*"accompany" + 0.045*"provide" + 0.031*"qu

In [113]:
# Compute Perplexity
# NOTE: log_perplexity() returns gensim's per-word likelihood bound (a negative number), not the
# perplexity itself; perplexity = 2 ** (-bound), so a bound closer to 0 means a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' measure over the lemmatized texts)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.423005380407647

Coherence Score:  0.3497337851419796


In [114]:
# Build the interactive pyLDAvis view of the topics and render it inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary=lda_model.id2word,
    mds='mmds',
)
vis

In [115]:
# Export the prepared visualization to an HTML file
pyLDAvis.save_html(vis, 'All PPTs.html')

# PPT01

In [116]:
# Reload the full transcript and keep only the rows whose 'Person' is 'PPT01'.
# The former `& (all['Person'] != 'Interviewer')` clause was redundant: a value
# equal to 'PPT01' can never equal 'Interviewer', so it never filtered anything.
# NOTE(review): `all` shadows the Python builtin; the name is kept for compatibility.
all = pd.read_csv('User Needs Assessments Data.csv')
PPT01 = all.loc[all['Person'] == 'PPT01']

In [117]:
# Tokenize each answer and clean the text
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Every item is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)  # deacc=True strips accent marks
        for text in sentences
    )

# Materialize the tokenized answers (one token list per row)
data_words = [*sent_to_words(PPT01['Question/Answer'])]

# Show the first tokenized answer
print(data_words[:1])

[['bought', 'the', 'place', 'years', 'ago', 'but', 'it', 'still', 'under', 'renovation']]


In [118]:
# Train the bigram detector (a higher threshold yields fewer phrases) and wrap
# it in a Phraser, which is the faster way to apply a trained model.
bigram = gensim.models.phrases.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram detector is trained on the bigram-transformed token stream.
trigram = gensim.models.phrases.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: first document after bigram + trigram merging
print(trigram_mod[bigram_mod[data_words[0]]])

['bought', 'the', 'place', 'years', 'ago', 'but', 'it', 'still', 'under', 'renovation']


In [119]:
# Helper functions for stop-word removal, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document with simple_preprocess and drop tokens listed in `stop_words`."""
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the notebook-global `bigram_mod` phraser to every document in `texts`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` to every document in `texts`."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize token lists, keeping only tokens whose POS tag is allowed.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists (one list per document).
        allowed_postags: collection of coarse POS tags (token.pos_) to keep.
            The default is an immutable tuple, not a shared mutable list.

    Returns:
        A list holding one list of lemmas per input document.

    Uses the notebook-global spaCy pipeline `nlp`, which must be loaded
    before this function is called.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream (faster than one
    # nlp() call per document) and yields the results in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

In [120]:
# Load the small English spaCy pipeline without the parser and NER components
# (the model is installed with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word removal, then bigram merging
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(data_lemmatized[:1])

[['buy', 'place', 'year', 'ago', 'still', 'renovation']]


In [121]:
# The corpus texts are the lemmatized documents
texts = data_lemmatized

# Dictionary mapping each unique token to an integer id
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


In [122]:
# Look up the token that the dictionary maps to id 0
id2word[0]

'ago'

In [123]:
# Human-readable view of the first document's bag of words: (token, count) pairs
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('ago', 1),
  ('buy', 1),
  ('place', 1),
  ('renovation', 1),
  ('still', 1),
  ('year', 1)]]

In [124]:
# Train the LDA topic model
# NOTE(review): the printed topics for this subset include near-uniform topics,
# which suggests 20 topics may be too many here - consider tuning num_topics.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [125]:
# Print the top keywords per topic (print_topics() defaults: up to 20 topics, 10 words each;
# the model is trained with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; doc_lda is not read by any cell visible in this section
doc_lda = lda_model[corpus]

[(0,
  '0.052*"customer" + 0.039*"website" + 0.039*"people" + 0.039*"form" + '
  '0.039*"ideally" + 0.027*"fill" + 0.026*"touchpoint" + 0.026*"available" + '
  '0.026*"need" + 0.026*"help"'),
 (1,
  '0.049*"additional" + 0.049*"document" + 0.049*"card" + 0.049*"number" + '
  '0.049*"send" + 0.049*"time" + 0.049*"d" + 0.049*"monthly" + 0.049*"thing" + '
  '0.049*"basis"'),
 (2,
  '0.089*"mobile" + 0.060*"app" + 0.060*"website" + 0.031*"operate" + '
  '0.031*"factor" + 0.031*"know" + 0.031*"convert" + 0.031*"least" + '
  '0.031*"information" + 0.031*"support"'),
 (3,
  '0.057*"form" + 0.044*"service" + 0.044*"payment" + 0.032*"pay" + '
  '0.026*"take" + 0.026*"go" + 0.026*"time" + 0.019*"tell" + 0.019*"reply" + '
  '0.019*"usually"'),
 (4,
  '0.004*"along" + 0.004*"agent" + 0.004*"bubble" + 0.004*"bot" + '
  '0.004*"available" + 0.004*"amount" + 0.004*"clearly" + 0.004*"science" + '
  '0.004*"want" + 0.004*"close"'),
 (5,
  '0.042*"pay" + 0.041*"utility" + 0.041*"service" + 0.041*"bill" 

In [126]:
# Compute Perplexity
# NOTE: log_perplexity() returns gensim's per-word likelihood bound (a negative number), not the
# perplexity itself; perplexity = 2 ** (-bound), so a bound closer to 0 means a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' measure over the lemmatized texts)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -5.9172383365722805

Coherence Score:  0.5357074207249144


In [127]:
# Build the interactive pyLDAvis view of the topics and render it inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary=lda_model.id2word,
    mds='mmds',
)
vis

In [128]:
# Export the prepared visualization to an HTML file
pyLDAvis.save_html(vis, 'PPT01.html')

# PPT02

In [129]:
# Reload the full transcript and keep only the rows whose 'Person' is 'PPT02'.
# The former `& (all['Person'] != 'Interviewer')` clause was redundant: a value
# equal to 'PPT02' can never equal 'Interviewer', so it never filtered anything.
# NOTE(review): `all` shadows the Python builtin; the name is kept for compatibility.
all = pd.read_csv('User Needs Assessments Data.csv')
PPT02 = all.loc[all['Person'] == 'PPT02']

In [130]:
# Tokenize each answer and clean the text
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Every item is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)  # deacc=True strips accent marks
        for text in sentences
    )

# Materialize the tokenized answers (one token list per row)
data_words = [*sent_to_words(PPT02['Question/Answer'])]

# Show the first tokenized answer
print(data_words[:1])

[['to', 'pay', 'my', 'bills', 'and', 'to', 'get', 'an', 'estimate', 'of', 'my', 'upcoming', 'bills', 'so', 'that', 'can', 'put', 'in', 'the', 'inaudible', 'meter', 'readings', 'and', 'get', 'an', 'estimate', 'of', 'my', 'monthly', 'bills', 'that', 'and', 'to', 'check', 'whether', 'there', 'are', 'any', 'interruptions', 'in', 'the', 'services']]


In [131]:
# Train the bigram detector (a higher threshold yields fewer phrases) and wrap
# it in a Phraser, which is the faster way to apply a trained model.
bigram = gensim.models.phrases.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram detector is trained on the bigram-transformed token stream.
trigram = gensim.models.phrases.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: first document after bigram + trigram merging
print(trigram_mod[bigram_mod[data_words[0]]])

['to', 'pay', 'my', 'bills', 'and', 'to', 'get', 'an', 'estimate', 'of', 'my', 'upcoming', 'bills', 'so', 'that', 'can', 'put', 'in', 'the', 'inaudible', 'meter', 'readings', 'and', 'get', 'an', 'estimate', 'of', 'my', 'monthly', 'bills', 'that', 'and', 'to', 'check', 'whether', 'there', 'are', 'any', 'interruptions', 'in', 'the', 'services']


In [132]:
# Helper functions for stop-word removal, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document with simple_preprocess and drop tokens listed in `stop_words`."""
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the notebook-global `bigram_mod` phraser to every document in `texts`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` to every document in `texts`."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize token lists, keeping only tokens whose POS tag is allowed.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists (one list per document).
        allowed_postags: collection of coarse POS tags (token.pos_) to keep.
            The default is an immutable tuple, not a shared mutable list.

    Returns:
        A list holding one list of lemmas per input document.

    Uses the notebook-global spaCy pipeline `nlp`, which must be loaded
    before this function is called.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream (faster than one
    # nlp() call per document) and yields the results in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

In [133]:
# Load the small English spaCy pipeline without the parser and NER components
# (the model is installed with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word removal, then bigram merging
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(data_lemmatized[:1])

[['pay', 'bill', 'get', 'estimate', 'upcoming', 'bill', 'put', 'inaudible', 'meter', 'reading', 'get', 'estimate', 'monthly', 'bill', 'check', 'interruption', 'service']]


In [134]:
# The corpus texts are the lemmatized documents
texts = data_lemmatized

# Dictionary mapping each unique token to an integer id
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 3), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]]


In [135]:
# Look up the token that the dictionary maps to id 0
id2word[0]

'bill'

In [136]:
# Human-readable view of the first document's bag of words: (token, count) pairs
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('bill', 3),
  ('check', 1),
  ('estimate', 2),
  ('get', 2),
  ('inaudible', 1),
  ('interruption', 1),
  ('meter', 1),
  ('monthly', 1),
  ('pay', 1),
  ('put', 1),
  ('reading', 1),
  ('service', 1),
  ('upcoming', 1)]]

In [137]:
# Train the LDA topic model
# NOTE(review): the printed topics for this subset include duplicated, near-uniform
# topics, which suggests 20 topics may be too many here - consider tuning num_topics.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [138]:
# Print the top keywords per topic (print_topics() defaults: up to 20 topics, 10 words each;
# the model is trained with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; doc_lda is not read by any cell visible in this section
doc_lda = lda_model[corpus]

[(0,
  '0.005*"least" + 0.005*"phonecall" + 0.005*"acknowledge" + 0.005*"come" + '
  '0.005*"expect" + 0.005*"good" + 0.005*"government" + 0.005*"happen" + '
  '0.005*"inform" + 0.005*"installation"'),
 (1,
  '0.052*"mean" + 0.052*"form" + 0.052*"fill" + 0.052*"meter" + 0.027*"call" + '
  '0.027*"anyway" + 0.027*"time" + 0.027*"inaudible" + 0.027*"install" + '
  '0.027*"expect"'),
 (2,
  '0.006*"customer" + 0.006*"account" + 0.006*"support" + 0.006*"mean" + '
  '0.006*"advise" + 0.006*"know" + 0.006*"figure" + 0.006*"quite" + '
  '0.006*"number" + 0.006*"care"'),
 (3,
  '0.005*"least" + 0.005*"phonecall" + 0.005*"acknowledge" + 0.005*"come" + '
  '0.005*"expect" + 0.005*"good" + 0.005*"government" + 0.005*"happen" + '
  '0.005*"inform" + 0.005*"installation"'),
 (4,
  '0.049*"customer" + 0.037*"account" + 0.025*"really" + 0.025*"say" + '
  '0.025*"tell" + 0.025*"call" + 0.025*"advise" + 0.025*"figure" + '
  '0.025*"care" + 0.025*"login"'),
 (5,
  '0.080*"service" + 0.048*"mobile" + 0.0

In [139]:
# Compute Perplexity
# NOTE: log_perplexity() returns gensim's per-word likelihood bound (a negative number), not the
# perplexity itself; perplexity = 2 ** (-bound), so a bound closer to 0 means a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' measure over the lemmatized texts)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -5.710982562560174

Coherence Score:  0.7217597410864645


In [140]:
# Build the interactive pyLDAvis view of the topics and render it inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary=lda_model.id2word,
    mds='mmds',
)
vis

In [141]:
# Export the prepared visualization to an HTML file
pyLDAvis.save_html(vis, 'PPT02.html')

# PPT03

In [142]:
# Reload the full transcript and keep only the rows whose 'Person' is 'PPT03'.
# The former `& (all['Person'] != 'Interviewer')` clause was redundant: a value
# equal to 'PPT03' can never equal 'Interviewer', so it never filtered anything.
# NOTE(review): `all` shadows the Python builtin; the name is kept for compatibility.
all = pd.read_csv('User Needs Assessments Data.csv')
PPT03 = all.loc[all['Person'] == 'PPT03']

In [143]:
# Tokenize each answer and clean the text
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Every item is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)  # deacc=True strips accent marks
        for text in sentences
    )

# Materialize the tokenized answers (one token list per row)
data_words = [*sent_to_words(PPT03['Question/Answer'])]

# Show the first tokenized answer
print(data_words[:1])

[['laughs', 'because', 'it', 'the', 'only', 'place', 'to', 'do', 'so', 'no', 'in', 'malta', 'no', 'there', 'no', 'other', 'form', 'of', 'receiving', 'water', 'and', 'electricity', 'through', 'different', 'ehm', 'correct', 'me', 'if', 'wrong', 'but', 'it', 'the', 'only', 'solution', 'to', 'doing', 'so']]


In [144]:
# Train the bigram detector (a higher threshold yields fewer phrases) and wrap
# it in a Phraser, which is the faster way to apply a trained model.
bigram = gensim.models.phrases.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram detector is trained on the bigram-transformed token stream.
trigram = gensim.models.phrases.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: first document after bigram + trigram merging
print(trigram_mod[bigram_mod[data_words[0]]])

['laughs', 'because', 'it', 'the', 'only', 'place', 'to', 'do', 'so', 'no', 'in', 'malta', 'no', 'there', 'no', 'other', 'form', 'of', 'receiving', 'water', 'and', 'electricity', 'through', 'different', 'ehm', 'correct', 'me', 'if', 'wrong', 'but', 'it', 'the', 'only', 'solution', 'to', 'doing', 'so']


In [145]:
# Helper functions for stop-word removal, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document with simple_preprocess and drop tokens listed in `stop_words`."""
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the notebook-global `bigram_mod` phraser to every document in `texts`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` to every document in `texts`."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize token lists, keeping only tokens whose POS tag is allowed.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists (one list per document).
        allowed_postags: collection of coarse POS tags (token.pos_) to keep.
            The default is an immutable tuple, not a shared mutable list.

    Returns:
        A list holding one list of lemmas per input document.

    Uses the notebook-global spaCy pipeline `nlp`, which must be loaded
    before this function is called.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream (faster than one
    # nlp() call per document) and yields the results in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

In [146]:
# Load the small English spaCy pipeline without the parser and NER components
# (the model is installed with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word removal, then bigram merging
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(data_lemmatized[:1])

[['laugh', 'place', 'form', 'receive', 'water', 'electricity', 'different', 'ehm', 'correct', 'wrong', 'solution']]


In [147]:
# The corpus texts are the lemmatized documents
texts = data_lemmatized

# Dictionary mapping each unique token to an integer id
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]]


In [148]:
# Look up the token that the dictionary maps to id 0
id2word[0]

'correct'

In [149]:
# Human-readable view of the first document's bag of words: (token, count) pairs
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('correct', 1),
  ('different', 1),
  ('ehm', 1),
  ('electricity', 1),
  ('form', 1),
  ('laugh', 1),
  ('place', 1),
  ('receive', 1),
  ('solution', 1),
  ('water', 1),
  ('wrong', 1)]]

In [150]:
# Train the LDA topic model
# NOTE(review): the printed topics for this subset include near-uniform topics,
# which suggests 20 topics may be too many here - consider tuning num_topics.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [151]:
# Print the top keywords per topic (print_topics() defaults: up to 20 topics, 10 words each;
# the model is trained with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; doc_lda is not read by any cell visible in this section
doc_lda = lda_model[corpus]

[(0,
  '0.006*"experience" + 0.006*"much" + 0.006*"typical" + 0.006*"ui" + '
  '0.006*"user" + 0.006*"whatnot" + 0.006*"abroad" + 0.006*"apply" + '
  '0.006*"argue" + 0.006*"site"'),
 (1,
  '0.082*"online" + 0.042*"easily" + 0.042*"attach" + 0.042*"send" + '
  '0.042*"laugh" + 0.042*"document" + 0.042*"form" + 0.042*"system" + '
  '0.021*"stupid" + 0.021*"first"'),
 (2,
  '0.073*"term" + 0.073*"form" + 0.037*"good" + 0.037*"easy" + 0.037*"come" + '
  '0.037*"example" + 0.037*"manually" + 0.037*"make" + 0.037*"experience" + '
  '0.037*"moment"'),
 (3,
  '0.007*"quite" + 0.007*"think" + 0.007*"easy" + 0.007*"straightforward" + '
  '0.006*"read" + 0.006*"make" + 0.006*"yet" + 0.006*"browse" + 0.006*"d" + '
  '0.006*"enjoyable"'),
 (4,
  '0.073*"account" + 0.055*"change" + 0.037*"proof" + 0.037*"order" + '
  '0.037*"call" + 0.019*"really" + 0.019*"laugh" + 0.019*"low" + '
  '0.019*"always" + 0.019*"expectation"'),
 (5,
  '0.087*"bill" + 0.086*"receive" + 0.058*"issue" + 0.058*"laugh" + '
 

In [152]:
# Compute Perplexity
# NOTE: log_perplexity() returns gensim's per-word likelihood bound (a negative number), not the
# perplexity itself; perplexity = 2 ** (-bound), so a bound closer to 0 means a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' measure over the lemmatized texts)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -5.730853134467278

Coherence Score:  0.6579006829412793


In [153]:
# Build the interactive pyLDAvis view of the topics and render it inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary=lda_model.id2word,
    mds='mmds',
)
vis

In [154]:
# Export the prepared visualization to an HTML file
pyLDAvis.save_html(vis, 'PPT03.html')

# PPT04

In [155]:
# Reload the full transcript and keep only the rows whose 'Person' is 'PPT04'.
# The former `& (all['Person'] != 'Interviewer')` clause was redundant: a value
# equal to 'PPT04' can never equal 'Interviewer', so it never filtered anything.
# NOTE(review): `all` shadows the Python builtin; the name is kept for compatibility.
all = pd.read_csv('User Needs Assessments Data.csv')
PPT04 = all.loc[all['Person'] == 'PPT04']

In [156]:
# Tokenize each answer and clean the text
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Every item is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)  # deacc=True strips accent marks
        for text in sentences
    )

# Materialize the tokenized answers (one token list per row)
data_words = [*sent_to_words(PPT04['Question/Answer'])]

# Show the first tokenized answer
print(data_words[:1])

[['ehm', 'bought', 'the', 'place', 'almost', 'two', 'and', 'half', 'years', 'but', 've', 'been', 'living', 'here', 'for', 'year']]


In [157]:
# Train the bigram detector (a higher threshold yields fewer phrases) and wrap
# it in a Phraser, which is the faster way to apply a trained model.
bigram = gensim.models.phrases.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram detector is trained on the bigram-transformed token stream.
trigram = gensim.models.phrases.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: first document after bigram + trigram merging
print(trigram_mod[bigram_mod[data_words[0]]])

['ehm', 'bought', 'the', 'place', 'almost', 'two', 'and', 'half', 'years', 'but', 've', 'been', 'living', 'here', 'for', 'year']


In [158]:
# Helper functions for stop-word removal, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document with simple_preprocess and drop tokens listed in `stop_words`."""
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the notebook-global `bigram_mod` phraser to every document in `texts`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` to every document in `texts`."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize token lists, keeping only tokens whose POS tag is allowed.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists (one list per document).
        allowed_postags: collection of coarse POS tags (token.pos_) to keep.
            The default is an immutable tuple, not a shared mutable list.

    Returns:
        A list holding one list of lemmas per input document.

    Uses the notebook-global spaCy pipeline `nlp`, which must be loaded
    before this function is called.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream (faster than one
    # nlp() call per document) and yields the results in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

In [159]:
# Load the small English spaCy pipeline without the parser and NER components
# (the model is installed with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word removal, then bigram merging
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(data_lemmatized[:1])

[['buy', 'place', 'almost', 'half', 'year', 'live', 'year']]


In [160]:
# The corpus texts are the lemmatized documents
texts = data_lemmatized

# Dictionary mapping each unique token to an integer id
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)]]


In [161]:
# Look up the token that the dictionary maps to id 0
id2word[0]

'almost'

In [162]:
# Human-readable view of the first document's bag of words: (token, count) pairs
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('almost', 1),
  ('buy', 1),
  ('half', 1),
  ('live', 1),
  ('place', 1),
  ('year', 2)]]

In [163]:
# Train the LDA topic model
# NOTE(review): the printed topics for this subset include near-uniform topics,
# which suggests 20 topics may be too many here - consider tuning num_topics.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [164]:
# Print the top keywords per topic (print_topics() defaults: up to 20 topics, 10 words each;
# the model is trained with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; doc_lda is not read by any cell visible in this section
doc_lda = lda_model[corpus]

[(0,
  '0.076*"website" + 0.076*"tell" + 0.076*"payment" + 0.076*"pay" + '
  '0.076*"bill" + 0.076*"truth" + 0.076*"mind" + 0.004*"band" + 0.004*"far" + '
  '0.004*"last"'),
 (1,
  '0.056*"form" + 0.056*"household" + 0.056*"download" + 0.029*"lease" + '
  '0.029*"i" + 0.029*"place" + 0.029*"course" + 0.029*"card" + 0.029*"d" + '
  '0.029*"state"'),
 (2,
  '0.118*"water" + 0.118*"electricity" + 0.079*"service" + 0.041*"bill" + '
  '0.041*"pay" + 0.041*"well" + 0.041*"somewhere" + 0.041*"put" + '
  '0.041*"money" + 0.041*"like"'),
 (3,
  '0.082*"trouble" + 0.082*"maybe" + 0.082*"bill" + 0.082*"time" + '
  '0.082*"know" + 0.082*"indeed" + 0.004*"get" + 0.004*"right" + 0.004*"far" + '
  '0.004*"last"'),
 (4,
  '0.007*"bit" + 0.007*"do" + 0.007*"right" + 0.007*"mind" + 0.007*"far" + '
  '0.007*"issue" + 0.007*"band" + 0.007*"government" + 0.007*"contribution" + '
  '0.007*"example"'),
 (5,
  '0.083*"actually" + 0.042*"arm" + 0.042*"form" + 0.042*"ok" + 0.042*"google" '
  '+ 0.042*"exactly" 

In [165]:
# Compute Perplexity
# NOTE: log_perplexity() returns gensim's per-word likelihood bound (a negative number), not the
# perplexity itself; perplexity = 2 ** (-bound), so a bound closer to 0 means a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' measure over the lemmatized texts)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -5.635885872485789

Coherence Score:  0.49915224130107017


In [166]:
# Build the interactive pyLDAvis view of the topics and render it inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary=lda_model.id2word,
    mds='mmds',
)
vis

In [167]:
# Export the prepared visualization to an HTML file
pyLDAvis.save_html(vis, 'PPT04.html')

# PPT05

In [168]:
# Reload the full transcript and keep only the rows whose 'Person' is 'PPT05'.
# The former `& (all['Person'] != 'Interviewer')` clause was redundant: a value
# equal to 'PPT05' can never equal 'Interviewer', so it never filtered anything.
# NOTE(review): `all` shadows the Python builtin; the name is kept for compatibility.
all = pd.read_csv('User Needs Assessments Data.csv')
PPT05 = all.loc[all['Person'] == 'PPT05']

In [169]:
# Tokenize each answer and clean the text
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Every item is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)  # deacc=True strips accent marks
        for text in sentences
    )

# Materialize the tokenized answers (one token list per row)
data_words = [*sent_to_words(PPT05['Question/Answer'])]

# Show the first tokenized answer
print(data_words[:1])

[['ve', 'been', 'homeowner', 'for', 'the', 'past', 'couple', 'of', 'years', 'think', 'three', 'years']]


In [170]:
# Train the bigram detector (a higher threshold yields fewer phrases) and wrap
# it in a Phraser, which is the faster way to apply a trained model.
bigram = gensim.models.phrases.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram detector is trained on the bigram-transformed token stream.
trigram = gensim.models.phrases.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: first document after bigram + trigram merging
print(trigram_mod[bigram_mod[data_words[0]]])

['ve', 'been', 'homeowner', 'for', 'the', 'past', 'couple', 'of', 'years', 'think', 'three', 'years']


In [171]:
# Helper functions for stop-word removal, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document with simple_preprocess and drop tokens listed in `stop_words`."""
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the notebook-global `bigram_mod` phraser to every document in `texts`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` to every document in `texts`."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize token lists, keeping only tokens whose POS tag is allowed.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists (one list per document).
        allowed_postags: collection of coarse POS tags (token.pos_) to keep.
            The default is an immutable tuple, not a shared mutable list.

    Returns:
        A list holding one list of lemmas per input document.

    Uses the notebook-global spaCy pipeline `nlp`, which must be loaded
    before this function is called.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream (faster than one
    # nlp() call per document) and yields the results in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

In [172]:
# Load the small English spaCy pipeline without the parser and NER components
# (the model is installed with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word removal, then bigram merging
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(data_lemmatized[:1])

[['homeowner', 'couple', 'year', 'think', 'year']]


In [173]:
# The corpus texts are the lemmatized documents
texts = data_lemmatized

# Dictionary mapping each unique token to an integer id
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 2)]]


In [174]:
# Look up the token that the dictionary maps to id 0
id2word[0]

'couple'

In [175]:
# Human-readable view of the first document's bag of words: (token, count) pairs
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('couple', 1), ('homeowner', 1), ('think', 1), ('year', 2)]]

In [176]:
# Train the LDA topic model
# NOTE(review): the printed topics for this subset include duplicated, near-uniform
# topics, which suggests 20 topics may be too many here - consider tuning num_topics.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [177]:
# Print the top keywords per topic (print_topics() defaults: up to 20 topics, 10 words each;
# the model is trained with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; doc_lda is not read by any cell visible in this section
doc_lda = lda_model[corpus]

[(0,
  '0.062*"website" + 0.050*"bill" + 0.037*"honest" + 0.037*"pay" + '
  '0.037*"know" + 0.025*"think" + 0.025*"avoid" + 0.025*"message" + '
  '0.025*"mobile" + 0.025*"start"'),
 (1,
  '0.120*"form" + 0.081*"know" + 0.081*"fill" + 0.041*"go" + 0.041*"user" + '
  '0.041*"letter" + 0.041*"sure" + 0.041*"remember" + 0.041*"new" + '
  '0.041*"person"'),
 (2,
  '0.004*"mean" + 0.004*"internet" + 0.004*"old" + 0.004*"nation" + '
  '0.004*"move" + 0.004*"mentality" + 0.004*"overseas" + 0.004*"extent" + '
  '0.004*"forward" + 0.004*"regard"'),
 (3,
  '0.091*"go" + 0.047*"website" + 0.047*"person" + 0.047*"register" + '
  '0.047*"honest" + 0.047*"department" + 0.047*"completely" + 0.047*"apply" + '
  '0.047*"even" + 0.047*"actually"'),
 (4,
  '0.083*"eventually" + 0.005*"know" + 0.005*"sort" + 0.005*"person" + '
  '0.005*"way" + 0.005*"stuff" + 0.005*"service" + 0.005*"go" + 0.004*"even" + '
  '0.004*"mean"'),
 (5,
  '0.004*"mean" + 0.004*"internet" + 0.004*"old" + 0.004*"nation" + '
  '0.00

In [178]:
# Compute Perplexity
# NOTE: log_perplexity() returns gensim's per-word likelihood bound (a negative number), not the
# perplexity itself; perplexity = 2 ** (-bound), so a bound closer to 0 means a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' measure over the lemmatized texts)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -5.950132364242719

Coherence Score:  0.5817905302483501


In [179]:
# Build the interactive pyLDAvis view of the topics and render it inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary=lda_model.id2word,
    mds='mmds',
)
vis

In [180]:
# Export the prepared visualization to an HTML file
pyLDAvis.save_html(vis, 'PPT05.html')