# Topic Modeling

### Load relevant libraries and data

In [None]:
# for data manipulation
import numpy as np
import pandas as pd
import ast

# for NLP
import nltk
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
from gensim import models
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
import spacy
import pyLDAvis
import pyLDAvis.gensim

# for counter
from tqdm import tqdm

In [None]:
# load the already pre-processed dataframe
# (per the preview below it holds a 'tokens' column with stringified token
# lists and a 'sentiment' label; the 'Unnamed: 0' column suggests the CSV
# was written together with its index)
df = pd.read_csv('df_tokenized.csv')

In [None]:
# preview the first rows of the loaded dataframe
df.head()

Unnamed: 0,tokens,sentiment
0,"['great', 'locat', 'close', 'main', 'public', ...",positive
1,"['famili', 'four', 'thi', 'flat', 'can', 'acco...",positive
2,"['place', 'wonder', 'plenti', 'room', 'us', 'h...",positive
3,"['great', 'locat', 'truli', 'onli', 'coupl', '...",positive
4,"['great', 'place', 'perfect', 'weekend', 'not'...",positive


In the following steps, only nouns, adjectives, verbs and adverbs are taken into account.

In [None]:
# rebuild the token lists: the CSV stores every list as its string
# representation, so each cell is parsed back into a real Python list,
# giving one list of tokens per review
data = [ast.literal_eval(raw_tokens) for raw_tokens in df['tokens']]

In [None]:
# initiate spacy pos tagging
# (loads the 'en_core_web_sm' pipeline; its part-of-speech tags are what the
# filter in the next step relies on)
nlp = spacy.load('en_core_web_sm')

In [None]:
# only including nouns, adjectives, verbs and adverbs
#
# Each review is re-joined into a single string so spaCy can tag it. The
# strings are streamed through `nlp.pipe`, which processes them in batches
# instead of issuing one `nlp(...)` call per review (the per-review loop
# took more than two hours on the full data set).
# Only the token text is stored: keeping spaCy `Token` objects would keep
# the parsed documents of all reviews referenced in memory.
# NOTE(review): the tokens look stemmed already (e.g. 'locat', 'famili'),
# which may reduce tagging accuracy -- confirm this is acceptable.
KEPT_POS = {'NOUN', 'ADJ', 'VERB', 'ADV'}

texts = (' '.join(tokens) for tokens in data)
final_data = [
    [word.text for word in review if word.pos_ in KEPT_POS]
    for review in tqdm(nlp.pipe(texts), total=len(data))
]

100%|██████████| 660645/660645 [2:12:26<00:00, 83.14it/s]   


In [None]:
# turn every kept token of every review into a plain string
data = [list(map(str, review)) for review in tqdm(final_data)]

100%|██████████| 660645/660645 [00:14<00:00, 46297.37it/s]


In [None]:
# combine the filtered token lists with the sentiment labels of the
# original dataframe into a new two-column dataframe
columns = {
    'tokens': data,
    'sentiment': df['sentiment'],
}
df = pd.DataFrame(columns)

In [None]:
# save df as csv
# df.to_csv('df_LDA.csv', index=False)

In [None]:
# load the data
# NOTE(review): this reads 'topic_modeling_final.csv', not the 'df_LDA.csv'
# named in the commented-out save step above. The tokens printed below also
# contain n-gram tokens such as 'public_transport', so this file was
# presumably produced by an additional step that is not shown here -- confirm.
df = pd.read_csv('topic_modeling_final.csv')

In [None]:
# the 'tokens' column holds stringified lists; parse each one back into a
# Python list so that `data` is a list of token lists (one per review)
data = list(map(ast.literal_eval, df['tokens']))

In [None]:
# inspect the token lists of the first two reviews
data[:2]

[['great',
  'close',
  'main',
  'public',
  'transport',
  'easi',
  'anywher',
  'get',
  'super',
  'easi',
  'check',
  'veri',
  'respons',
  'clean',
  'close_main',
  'main_public',
  'public_transport',
  'easi_get',
  'get_anywher',
  'get_back',
  'super_easi',
  'check_veri',
  'veri_respons',
  'respons_clean',
  'clean_well',
  'stock_thank',
  'transport_easi_get',
  'clean_well_stock'],
 ['famili',
  'thi',
  'flat',
  'easili',
  'lot',
  'space',
  'edeka',
  'corner',
  'get',
  'need',
  'great',
  'stay',
  'four_thi',
  'thi_flat',
  'flat_can',
  'can_accommod',
  'accommod_person',
  'person_easili',
  'easili_lot',
  'lot_space',
  'space_us',
  'can_get',
  'get_everyth',
  'need_great',
  'great_stay',
  'thi_flat_can',
  'around_corner_can',
  'corner_can_get',
  'can_get_everyth',
  'everyth_need_great']]

### Data pre-processing

Tokenization, lemmatization and stemming have already been done. The following code performs the remaining steps necessary for topic modeling.

##### Create dictionary with all unique words: id2word

In [None]:
# create id2word which contains all unique words used in all reviews
# (a gensim Dictionary that assigns an integer id to every distinct token)
id2word = corpora.Dictionary(data)

In [None]:
# bag-of-words representation of every review: for each review a list of
# (token id, count) pairs looked up in id2word
corpus = [id2word.doc2bow(text) for text in data]

In [None]:
# check the first review (printed as (token id, count) pairs)
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)]


However, the dictionary is filtered for extreme values below, and a new bag-of-words list (`bow_corpus`) is then created from the filtered dictionary to replace this one.

In [None]:
# check that length of corpus is as long as the amount of documents
# (one bag-of-words entry was created per review in `data`)
len(corpus)

660645

In [None]:
# remove empty documents: `if x` drops every review whose bag-of-words list
# is empty; duplicates are NOT removed by this step
# NOTE(review): `bow_corpus` is rebuilt from `data` further down, so this
# filtered `corpus` does not appear to be used by the models -- confirm.
corpus = [x for x in corpus if x]

In [None]:
# check new length of corpus (number of non-empty documents that remain)
len(corpus)

659339

##### Filter for extreme values

Tokens that appear in fewer than 30 reviews or in more than 80% of all reviews are filtered out.

Finally, after these two steps, only the 10,000 most frequent remaining tokens are kept.

We experimented with the thresholds / parameters to obtain a satisfactory result.

In [None]:
# filter for extreme values (changes id2word itself, nothing is returned):
#   no_below=30   -> drop tokens that appear in fewer than 30 reviews
#   no_above=0.8  -> drop tokens that appear in more than 80% of the reviews
#   keep_n=10000  -> of the remaining tokens keep only the 10,000 most frequent
id2word.filter_extremes(no_below=30, no_above=0.8, keep_n=10000)

In [None]:
# check the new dictionary size, i.e. the number of distinct tokens that
# remain in id2word after filtering (this is not the corpus length);
# len() on the dictionary gives this directly, without the Py2-style
# iteritems() detour
print(len(id2word))

10000


##### Create a list containing all reviews with their respective used words and word count: bow_corpus

In [None]:
# bag-of-words for every review, this time based on the filtered dictionary
bow_corpus = list(map(id2word.doc2bow, data))

In [None]:
# check the first review (printed as (token id, count) pairs)
print(bow_corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)]


In [None]:
# check that length of corpus is as long as the amount of documents
# (bow_corpus holds one entry per review in `data`, empty ones included)
print(len(bow_corpus))

660645


In [None]:
# show the first review with the actual words behind its token ids
first_review = bow_corpus[0]

# each entry is a (token id, count) pair
for token_id, count in first_review:
    print('Word {} (\"{}\") appears {} time.'.format(token_id, id2word[token_id], count))

Word 0 ("anywher") appears 1 time.
Word 1 ("check") appears 1 time.
Word 2 ("check_veri") appears 1 time.
Word 3 ("clean") appears 1 time.
Word 4 ("clean_well") appears 1 time.
Word 5 ("clean_well_stock") appears 1 time.
Word 6 ("close") appears 1 time.
Word 7 ("close_main") appears 1 time.
Word 8 ("easi") appears 2 time.
Word 9 ("easi_get") appears 1 time.
Word 10 ("get") appears 1 time.
Word 11 ("get_anywher") appears 1 time.
Word 12 ("get_back") appears 1 time.
Word 13 ("great") appears 1 time.
Word 14 ("main") appears 1 time.
Word 15 ("public") appears 1 time.
Word 16 ("public_transport") appears 1 time.
Word 17 ("respons") appears 1 time.
Word 18 ("super") appears 1 time.
Word 19 ("super_easi") appears 1 time.
Word 20 ("transport") appears 1 time.
Word 21 ("veri") appears 1 time.
Word 22 ("veri_respons") appears 1 time.


### Apply tf-idf scores to the corpus

In [None]:
# fit the tf-idf weighting on the bag-of-words corpus ...
tfidf_model = models.TfidfModel(corpus=bow_corpus)
# ... and apply it to that same corpus
corpus_tfidf = tfidf_model[bow_corpus]

### Running LDA using Bag-of-Words with tf-idf scores

Here, Latent Dirichlet Allocation is trained on the tf-idf weighted corpus.

The tf-idf scores rank the importance of a token in a review given a corpus of reviews.

The idea is that the more often a token appears in a document, the higher the token is ranked (tf: term frequency).
However, the token gets a lower weight the more documents of the corpus it appears in (idf: inverse document frequency).

We experimented with the parameters (e.g., the number of topics) to obtain a satisfactory result.

In [None]:
# train the LDA model on the tf-idf weighted corpus
lda_settings = {
    'id2word': id2word,
    'num_topics': 6,
    'random_state': 42,
    'passes': 5,
    'alpha': 'auto',
}
lda_model = gensim.models.LdaModel(corpus=corpus_tfidf, **lda_settings)

### Visualizing topics

In [None]:
# for visualization
pyLDAvis.enable_notebook()
# NOTE(review): the model above was trained on `corpus_tfidf`, while the
# visualization data is prepared from `bow_corpus` -- confirm this is intended.
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word, mds='mmds', R=15)



In [None]:
# visualize: the prepared pyLDAvis object is the cell's output
vis

In [None]:
# list every topic with its top words; topic ids are shifted to start at 1
topic_template = 'Topic: {} \nWords: {}'
for topic_id, keywords in lda_model.print_topics(-1):
    print(topic_template.format(topic_id + 1, keywords))

Topic: 1 
Words: 0.013*"station" + 0.012*"train" + 0.009*"train_station" + 0.007*"hous" + 0.007*"far" + 0.006*"beauti_apart" + 0.005*"veri_much" + 0.005*"car" + 0.004*"detail" + 0.004*"pick"
Topic: 2 
Words: 0.022*"transport" + 0.020*"public" + 0.019*"public_transport" + 0.012*"pleasant" + 0.010*"nice_place" + 0.007*"veri_nice_apart" + 0.007*"veri_nice_place" + 0.007*"host_veri" + 0.006*"pleasant_stay" + 0.006*"veri_pleasant"
Topic: 3 
Words: 0.010*"amaz_view" + 0.009*"love_apart" + 0.007*"locat_easi" + 0.007*"excel_locat" + 0.006*"appoint" + 0.005*"condit" + 0.005*"great_apart_great" + 0.005*"locat_super" + 0.005*"well_appoint" + 0.005*"tram_bu"
Topic: 4 
Words: 0.011*"apart" + 0.008*"stay" + 0.008*"nice" + 0.008*"recommend" + 0.007*"host" + 0.007*"perfect" + 0.006*"love" + 0.005*"view" + 0.005*"help" + 0.005*"walk"
Topic: 5 
Words: 0.016*"beauti_view" + 0.014*"opera" + 0.013*"opera_hous" + 0.012*"super_nice" + 0.011*"thank_veri" + 0.011*"calm" + 0.011*"thank_veri_much" + 0.010*"nice_

### Running LDA using Bag-of-Words without tf-idf scores

Here, LDA is conducted with the bow_corpus and without tf-idf representation.

In [None]:
# train the LDA model on the plain bag-of-words corpus (no tf-idf weighting)
lda_settings = {
    'id2word': id2word,
    'num_topics': 6,
    'random_state': 42,
    'passes': 5,
    'alpha': 'auto',
}
lda_model = gensim.models.LdaModel(corpus=bow_corpus, **lda_settings)

### Visualizing topics

In [None]:
# for visualization
pyLDAvis.enable_notebook()
# prepare the visualization data from the model, the bag-of-words corpus it
# was trained on, and the dictionary
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word, mds='mmds', R=15)



In [None]:
# visualize: the prepared pyLDAvis object is the cell's output
vis

In [None]:
# print each topic of the bag-of-words model with its top words,
# numbering the topics from 1 instead of 0
for topic_id, keywords in lda_model.print_topics(-1):
    topic_number = topic_id + 1
    print('Topic: {} \nWords: {}'.format(topic_number, keywords))

Topic: 1 
Words: 0.027*"time" + 0.023*"make" + 0.020*"hous" + 0.018*"home" + 0.017*"stay" + 0.016*"go" + 0.014*"thi" + 0.014*"even" + 0.014*"visit" + 0.013*"give"
Topic: 2 
Words: 0.171*"apart" + 0.061*"perfect" + 0.025*"love" + 0.025*"back" + 0.025*"view" + 0.025*"come" + 0.024*"wonder" + 0.016*"thi" + 0.014*"come_back" + 0.014*"question"
Topic: 3 
Words: 0.044*"walk" + 0.033*"close" + 0.023*"central" + 0.023*"easi" + 0.021*"restaur" + 0.020*"station" + 0.016*"area" + 0.014*"quiet" + 0.013*"lot" + 0.012*"train"
Topic: 4 
Words: 0.022*"room" + 0.015*"bed" + 0.014*"night" + 0.012*"kitchen" + 0.010*"space" + 0.010*"use" + 0.010*"small" + 0.010*"onli" + 0.010*"bathroom" + 0.009*"live"
Topic: 5 
Words: 0.103*"enjoy" + 0.066*"much" + 0.045*"enjoy_stay" + 0.029*"anyon" + 0.029*"definit_recommend" + 0.021*"realli_enjoy" + 0.021*"pleasant" + 0.018*"veri_much" + 0.018*"definit" + 0.016*"would_definit"
Topic: 6 
Words: 0.078*"great" + 0.070*"stay" + 0.058*"place" + 0.044*"recommend" + 0.041*"hos