# All PPTs

In [2]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/Owner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
!pip install --upgrade gensim
!pip install -U spacy
!pip install -U pip setuptools wheel
!pip install pyldavis



In [4]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     |████████████████████████████████| 13.9 MB 4.6 MB/s            
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
!pip install pyLDAvis==3.2.1



In [6]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
import gensim.models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [7]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [8]:
all = pd.read_csv('Usability Test Data.csv')
all = all.loc[all['Person'] != 'Interviewer']

In [9]:
#tokenize words and clean text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(all['Question/Answer']))

print(data_words[:1])

[['well', 'mean', 'my', 'last', 'trip', 'was', 'mostly', 'because', 'already', 'had', 'flight', 'that', 'couldn', 'use', 'which', 'moved', 'to', 'next', 'year', 'but', 'then', 'was', 'like', 'well', 'it', 'doesn', 'matter', 'so', 'thought', 'll', 'just', 'come', 'to', 'berlin', 'because', 'have', 'friends', 'here', 'and', 'that', 'was', 'through', 'easyjet', 'and', 'mean', 'had', 'to', 'do', 'few', 'things', 'on', 'the', 'website', 'to', 'change', 'the', 'flights', 'and', 'was', 'charged', 'bit', 'of', 'money', 'but', 'think', 'yeah', 'there', 'wasn', 'much', 'planning', 'in', 'it', 'because', 'it', 'was', 'more', 'of', 'kind', 'of', 'like', 'improvised', 'trip', 'what', 'was', 'the', 'other', 'part', 'of', 'the', 'question']]


In [10]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['well', 'mean', 'my', 'last', 'trip', 'was', 'mostly', 'because', 'already', 'had', 'flight', 'that', 'couldn', 'use', 'which', 'moved', 'to', 'next', 'year', 'but', 'then', 'was', 'like', 'well', 'it', 'doesn', 'matter', 'so', 'thought', 'll', 'just', 'come', 'to', 'berlin', 'because', 'have', 'friends', 'here', 'and', 'that', 'was', 'through', 'easyjet', 'and', 'mean', 'had', 'to', 'do', 'few', 'things', 'on', 'the', 'website', 'to', 'change', 'the', 'flights', 'and', 'was', 'charged', 'bit', 'of', 'money', 'but', 'think', 'yeah', 'there', 'wasn', 'much', 'planning', 'in', 'it', 'because', 'it', 'was', 'more', 'of', 'kind', 'of', 'like', 'improvised', 'trip', 'what', 'was', 'the', 'other', 'part', 'of', 'the', 'question']


In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [12]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['mean', 'last', 'trip', 'mostly', 'already', 'flight', 'move', 'next', 'year', 'well', 'matter', 'thought', 'come', 'friend', 'easyjet', 'mean', 'thing', 'website', 'change', 'flight', 'charge', 'bit', 'money', 'think', 'much', 'planning', 'kind', 'improvise', 'trip', 'part', 'question']]


In [13]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1)]]


In [14]:
id2word[0]

'already'

In [15]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('already', 1),
  ('bit', 1),
  ('change', 1),
  ('charge', 1),
  ('come', 1),
  ('easyjet', 1),
  ('flight', 2),
  ('friend', 1),
  ('improvise', 1),
  ('kind', 1),
  ('last', 1),
  ('matter', 1),
  ('mean', 2),
  ('money', 1),
  ('mostly', 1),
  ('move', 1),
  ('much', 1),
  ('next', 1),
  ('part', 1),
  ('planning', 1),
  ('question', 1),
  ('thing', 1),
  ('think', 1),
  ('thought', 1),
  ('trip', 2),
  ('website', 1),
  ('well', 1),
  ('year', 1)]]

In [16]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [17]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.082*"open" + 0.049*"put" + 0.034*"pick" + 0.032*"want" + 0.029*"price" + '
  '0.027*"know" + 0.024*"give" + 0.022*"super" + 0.022*"bad" + '
  '0.022*"sometimes"'),
 (1,
  '0.087*"early" + 0.062*"late" + 0.056*"flight" + 0.055*"browse" + '
  '0.046*"usually" + 0.041*"return" + 0.037*"price" + 0.030*"thing" + '
  '0.029*"whole" + 0.026*"mean"'),
 (2,
  '0.104*"inaudible" + 0.097*"date" + 0.081*"ok" + 0.068*"departure" + '
  '0.055*"good" + 0.045*"never" + 0.043*"later" + 0.038*"think" + '
  '0.030*"refund" + 0.023*"use"'),
 (3,
  '0.074*"mess" + 0.043*"list" + 0.029*"information" + 0.015*"way" + '
  '0.015*"complex" + 0.015*"manner" + 0.015*"layout" + 0.015*"basically" + '
  '0.015*"suspicious" + 0.015*"main"'),
 (4,
  '0.109*"actual" + 0.054*"air" + 0.054*"luggage" + 0.047*"cabin" + 0.042*"go" '
  '+ 0.026*"still" + 0.023*"exist" + 0.023*"skyscanner" + 0.017*"browse" + '
  '0.012*"good"'),
 (5,
  '0.019*"piss" + 0.019*"money" + 0.019*"do" + 0.015*"alright" + 0.007*"fuck" '
  '

In [18]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.073749250432356

Coherence Score:  0.4455539033523067


In [20]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word, mds='mmds')   
vis

In [19]:
pyLDAvis.save_html(vis, 'All PPTs.html')

# PPT01

In [20]:
all = pd.read_csv('Usability Test Data.csv')
PPT01 = all.loc[(all['Person'] == 'PPT01') & (all['Person'] != 'Interviewer')]

In [21]:
#tokenize words and clean text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(PPT01['Question/Answer']))

print(data_words[:1])

[['well', 'mean', 'my', 'last', 'trip', 'was', 'mostly', 'because', 'already', 'had', 'flight', 'that', 'couldn', 'use', 'which', 'moved', 'to', 'next', 'year', 'but', 'then', 'was', 'like', 'well', 'it', 'doesn', 'matter', 'so', 'thought', 'll', 'just', 'come', 'to', 'berlin', 'because', 'have', 'friends', 'here', 'and', 'that', 'was', 'through', 'easyjet', 'and', 'mean', 'had', 'to', 'do', 'few', 'things', 'on', 'the', 'website', 'to', 'change', 'the', 'flights', 'and', 'was', 'charged', 'bit', 'of', 'money', 'but', 'think', 'yeah', 'there', 'wasn', 'much', 'planning', 'in', 'it', 'because', 'it', 'was', 'more', 'of', 'kind', 'of', 'like', 'improvised', 'trip', 'what', 'was', 'the', 'other', 'part', 'of', 'the', 'question']]


In [22]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['well', 'mean', 'my', 'last', 'trip', 'was', 'mostly', 'because', 'already', 'had', 'flight', 'that', 'couldn', 'use', 'which', 'moved', 'to', 'next', 'year', 'but', 'then', 'was', 'like', 'well', 'it', 'doesn', 'matter', 'so', 'thought', 'll', 'just', 'come', 'to', 'berlin', 'because', 'have', 'friends', 'here', 'and', 'that', 'was', 'through', 'easyjet', 'and', 'mean', 'had', 'to', 'do', 'few', 'things', 'on', 'the', 'website', 'to', 'change', 'the', 'flights', 'and', 'was', 'charged', 'bit', 'of', 'money', 'but', 'think', 'yeah', 'there', 'wasn', 'much', 'planning', 'in', 'it', 'because', 'it', 'was', 'more', 'of', 'kind', 'of', 'like', 'improvised', 'trip', 'what', 'was', 'the', 'other', 'part', 'of', 'the', 'question']


In [23]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [24]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['mean', 'last', 'trip', 'mostly', 'already', 'flight', 'move', 'next', 'year', 'well', 'matter', 'thought', 'come', 'friend', 'easyjet', 'mean', 'thing', 'website', 'change', 'flight', 'charge', 'bit', 'money', 'think', 'much', 'planning', 'kind', 'improvise', 'trip', 'part', 'question']]


In [25]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1)]]


In [26]:
id2word[0]

'already'

In [27]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('already', 1),
  ('bit', 1),
  ('change', 1),
  ('charge', 1),
  ('come', 1),
  ('easyjet', 1),
  ('flight', 2),
  ('friend', 1),
  ('improvise', 1),
  ('kind', 1),
  ('last', 1),
  ('matter', 1),
  ('mean', 2),
  ('money', 1),
  ('mostly', 1),
  ('move', 1),
  ('much', 1),
  ('next', 1),
  ('part', 1),
  ('planning', 1),
  ('question', 1),
  ('thing', 1),
  ('think', 1),
  ('thought', 1),
  ('trip', 2),
  ('website', 1),
  ('well', 1),
  ('year', 1)]]

In [28]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [29]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.091*"go" + 0.037*"mean" + 0.037*"want" + 0.037*"ahead" + 0.037*"usually" '
  '+ 0.037*"month" + 0.019*"pick" + 0.019*"orly" + 0.019*"browse" + '
  '0.019*"trip"'),
 (1,
  '0.093*"website" + 0.047*"easyjet" + 0.047*"work" + 0.024*"specific" + '
  '0.024*"ninety" + 0.024*"say" + 0.024*"step" + 0.024*"sort" + 0.024*"part" + '
  '0.024*"difficult"'),
 (2,
  '0.064*"section" + 0.043*"explore" + 0.043*"come" + 0.043*"go" + '
  '0.043*"maybe" + 0.043*"want" + 0.022*"range" + 0.022*"make" + 0.022*"bunch" '
  '+ 0.022*"place"'),
 (3,
  '0.046*"go" + 0.046*"ohh" + 0.039*"know" + 0.031*"flight" + 0.031*"thing" + '
  '0.031*"let" + 0.031*"price" + 0.023*"see" + 0.023*"want" + 0.016*"think"'),
 (4,
  '0.081*"flight" + 0.045*"time" + 0.030*"go" + 0.030*"care" + 0.030*"hour" + '
  '0.030*"bottom" + 0.030*"link" + 0.029*"much" + 0.028*"kind" + 0.025*"give"'),
 (5,
  '0.035*"flight" + 0.035*"mean" + 0.035*"trip" + 0.035*"money" + 0.018*"seem" '
  '+ 0.018*"go" + 0.018*"fuck" + 0.018*"price" +

In [30]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.070447459425245

Coherence Score:  0.6090200479853406


In [31]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word, mds='mmds')   
vis

In [32]:
pyLDAvis.save_html(vis, 'PPT01.html')

# PPT02

In [33]:
all = pd.read_csv('Usability Test Data.csv')
PPT02 = all.loc[(all['Person'] == 'PPT02') & (all['Person'] != 'Interviewer')]

In [34]:
#tokenize words and clean text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(PPT02['Question/Answer']))

print(data_words[:1])

[['actually', 'planned', 'trip', 'last', 'week', 'for', 'forthcoming', 'trip', 'during', 'nye', 'mean', 'it', 'was', 'quite', 'an', 'is', 'it', 'fine', 'telling', 'you', 'about', 'forthcoming', 'trip']]


In [35]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['actually', 'planned', 'trip', 'last', 'week', 'for', 'forthcoming', 'trip', 'during', 'nye', 'mean', 'it', 'was', 'quite', 'an', 'is', 'it', 'fine', 'telling', 'you', 'about', 'forthcoming', 'trip']


In [36]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [37]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['actually', 'plan', 'trip', 'last', 'week', 'forthcoming', 'trip', 'nye', 'mean', 'quite', 'fine', 'tell', 'forthcoming', 'trip']]


In [38]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 3), (10, 1)]]


In [39]:
id2word[0]

'actually'

In [40]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('actually', 1),
  ('fine', 1),
  ('forthcoming', 2),
  ('last', 1),
  ('mean', 1),
  ('nye', 1),
  ('plan', 1),
  ('quite', 1),
  ('tell', 1),
  ('trip', 3),
  ('week', 1)]]

In [41]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [42]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.057*"think" + 0.043*"skyscanner" + 0.043*"right" + 0.029*"flight" + '
  '0.029*"share" + 0.029*"usually" + 0.029*"help" + 0.029*"good" + '
  '0.029*"date" + 0.015*"want"'),
 (1,
  '0.036*"know" + 0.036*"let" + 0.036*"honest" + 0.036*"air" + 0.036*"ryanair" '
  '+ 0.036*"think" + 0.036*"go" + 0.019*"friendly" + 0.019*"tell" + '
  '0.019*"faith"'),
 (2,
  '0.049*"cheap" + 0.044*"refund" + 0.039*"option" + 0.038*"think" + '
  '0.033*"honest" + 0.033*"give" + 0.030*"really" + 0.027*"sure" + 0.027*"go" '
  '+ 0.022*"know"'),
 (3,
  '0.100*"way" + 0.050*"fill" + 0.050*"let" + 0.050*"round" + 0.026*"flight" + '
  '0.026*"look" + 0.026*"barcelona" + 0.026*"late" + 0.026*"include" + '
  '0.026*"back"'),
 (4,
  '0.079*"think" + 0.053*"interface" + 0.027*"come" + 0.027*"good" + '
  '0.027*"help" + 0.027*"friendly" + 0.027*"know" + 0.027*"used" + '
  '0.027*"user" + 0.027*"usually"'),
 (5,
  '0.068*"cheap" + 0.068*"go" + 0.057*"flight" + 0.034*"time" + '
  '0.034*"trigger" + 0.023*"reall

In [43]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.029150370856625

Coherence Score:  0.45019473548461


In [44]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word, mds='mmds')   
vis

In [45]:
pyLDAvis.save_html(vis, 'PPT02.html')

# PPT03

In [46]:
all = pd.read_csv('Usability Test Data.csv')
PPT03 = all.loc[(all['Person'] == 'PPT03') & (all['Person'] != 'Interviewer')]

In [47]:
#tokenize words and clean text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(PPT03['Question/Answer']))

print(data_words[:1])

[['the', 'last', 'trip', 'was', 'the', 'four', 'days', 'in', 'france', 'for', 'my', 'cousin', 'wedding', 'so', 'it', 'was', 'bit']]


In [48]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['the', 'last', 'trip', 'was', 'the', 'four', 'days', 'in', 'france', 'for', 'my', 'cousin', 'wedding', 'so', 'it', 'was', 'bit']


In [49]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [50]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['last', 'trip', 'day', 'cousin', 'wedding', 'bit']]


In [51]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


In [52]:
id2word[0]

'bit'

In [53]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('bit', 1),
  ('cousin', 1),
  ('day', 1),
  ('last', 1),
  ('trip', 1),
  ('wedding', 1)]]

In [54]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [55]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.003*"bag" + 0.003*"arrival" + 0.003*"outbound" + 0.003*"midnight" + '
  '0.003*"matter" + 0.003*"late" + 0.003*"ehe" + 0.003*"early" + 0.003*"cabin" '
  '+ 0.003*"return"'),
 (1,
  '0.033*"bit" + 0.033*"last" + 0.033*"wedding" + 0.033*"cousin" + '
  '0.033*"trip" + 0.033*"day" + 0.033*"book" + 0.033*"know" + 0.033*"strange" '
  '+ 0.033*"case"'),
 (2,
  '0.040*"ok" + 0.040*"thing" + 0.040*"never" + 0.020*"know" + 0.020*"airline" '
  '+ 0.020*"loading" + 0.020*"inaudible" + 0.020*"booking" + 0.020*"website" + '
  '0.020*"continue"'),
 (3,
  '0.043*"ok" + 0.043*"date" + 0.043*"click" + 0.043*"price" + 0.029*"think" + '
  '0.029*"outside" + 0.029*"box" + 0.015*"see" + 0.015*"show" + '
  '0.015*"inaudible"'),
 (4,
  '0.050*"sometimes" + 0.034*"give" + 0.034*"information" + 0.034*"get" + '
  '0.034*"especially" + 0.034*"step" + 0.017*"lufthansa" + 0.017*"book" + '
  '0.017*"frankfurt" + 0.017*"straight"'),
 (5,
  '0.003*"bag" + 0.003*"arrival" + 0.003*"outbound" + 0.003*"midnight"

In [56]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.111540555302239

Coherence Score:  0.47612121739874774


In [57]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word, mds='mmds')   
vis

In [58]:
pyLDAvis.save_html(vis, 'PPT03.html')