# Keyword Extraction and Topic Modelling

At the very beginning, the most basic step is to install and import some necessary libraries.

In [1]:
!pip install gensim



In [2]:
!pip install -U scikit-learn



In [3]:
# install and import some necessary libraries 
import pandas as pd  # for data manipulation and analysis
import nltk          # for natural language processing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
import re                   
import numpy as np          # for numerical calculations
from pprint import pprint   # print data structures in a readable, pretty way
import gensim               # a Python library for topic modelling
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis             # visualize topic modelling
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [5]:
import csv

## Upload the dataset and Clean Data

In [6]:
# read in a tsv file with no header row
df = pd.read_csv('DH_CollectingData2022_review.tsv', sep = '\t', header = None, quoting = csv.QUOTE_NONE)

In [7]:
df.head()

Unnamed: 0,0,1
0,"For Nik, he only wants to silence the cacophon...",0.0
1,"""I can play this two ways",0.0
2,"Mild, because it isn't conclusive, and doesn't...",-1.0
3,You can also get some more information about t...,0.0
4,"Soon, Hero, who has never had friends, is thru...",0.0


Since the columns don't have names, I renamed them.

In [8]:
# rename the columns 
df = df.rename(columns={0: 'texts', 1: 'sentiment'})

In [9]:
df.head()

Unnamed: 0,texts,sentiment
0,"For Nik, he only wants to silence the cacophon...",0.0
1,"""I can play this two ways",0.0
2,"Mild, because it isn't conclusive, and doesn't...",-1.0
3,You can also get some more information about t...,0.0
4,"Soon, Hero, who has never had friends, is thru...",0.0


In [10]:
df.dtypes

texts         object
sentiment    float64
dtype: object

In this case, I change the data type in the sentiment column from float number to integer number.

In [11]:
# find the missing values

df.isna().sum()

texts        0
sentiment    2
dtype: int64

There are two missing values in the 'sentiment' column, and in this case, I deleted all the missing data.

In [12]:
# drop the rows with missing values
df.dropna(inplace=True) 

In [13]:
# change the data type
df['sentiment'] = df['sentiment'].astype(int)

In [14]:
df.head()

Unnamed: 0,texts,sentiment
0,"For Nik, he only wants to silence the cacophon...",0
1,"""I can play this two ways",0
2,"Mild, because it isn't conclusive, and doesn't...",-1
3,You can also get some more information about t...,0
4,"Soon, Hero, who has never had friends, is thru...",0


In [15]:
# check how many rows and columns in the DataFrame
df.shape

(388, 2)

## Extract Top 20 keywords

By modifying the code for tf-idf, extract the top 20 keywords for the whole dataset, firstly, I choose "texts" to create a data series that contains all the texts.

In [16]:
df_texts = df['texts']
print(df_texts)

0      For Nik, he only wants to silence the cacophon...
1                              "I can play this two ways
2      Mild, because it isn't conclusive, and doesn't...
3      You can also get some more information about t...
4      Soon, Hero, who has never had friends, is thru...
                             ...                        
385    August is torn by his actions but he absolutel...
386    Heroine Elise Benton is witty in the present d...
387                 I am glad there will be a part three
388    Sometimes while they were in a lesson, the wol...
389                                  I remained involved
Name: texts, Length: 388, dtype: object


In [17]:
# Code retrived from: https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/#:~:text=Text%20preprocessing%20is%20a%20method,text%20in%20a%20different%20case.

# use to stem the words into root word
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df['df_texts'] = df['texts'].apply(lambda text: lemmatize_words(text))

In [18]:
# remove punctuation
import string

df['df_texts'] = df['texts'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , x))


In [19]:
# lowercase 
df['df_texts'] = df['df_texts'].str.lower()

In [20]:
df['df_texts']

0      for nik he only wants to silence the cacophony...
1                               i can play this two ways
2      mild because it isnt conclusive and doesnt giv...
3      you can also get some more information about t...
4      soon hero who has never had friends is thrust ...
                             ...                        
385    august is torn by his actions but he absolutel...
386    heroine elise benton is witty in the present d...
387                 i am glad there will be a part three
388    sometimes while they were in a lesson the wolf...
389                                  i remained involved
Name: df_texts, Length: 388, dtype: object

In [21]:
# concatenate all the sentences in the list into a single text
all_texts = ' '.join(df['df_texts'].tolist())

This will be used to extract the top 20 keywords for the whole dataset using tf-idf.

### TfidfVectorizer

In [22]:
# remove common English words that won't be of any value for keyword extraction
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# apply TfidfVectorizer to the all texts and transform the texts into a matrix of TF-IDF values
texts_tfidf = tfidf_vectorizer.fit_transform([all_texts])

In [23]:
# Stores the TF-IDF values into a DataFrame, with each row representing a text and each column representing a word
texts_tfidf_matrix = pd.DataFrame(texts_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(texts_tfidf_matrix)

        100        12        14        15     1920s        20      2013  \
0  0.006966  0.006966  0.013931  0.006966  0.006966  0.006966  0.006966   

         46        50       600  ...      year     years    yelled   yelling  \
0  0.006966  0.020897  0.006966  ...  0.020897  0.013931  0.013931  0.006966   

      youll     young     youre     youve      zero     zusak  
0  0.006966  0.006966  0.013931  0.006966  0.006966  0.006966  

[1 rows x 1571 columns]


In [24]:
# convert the concatenated dataframe into a dictionary
words_dict_tfidf = texts_tfidf_matrix.to_dict('index')

# store the top 20 keywords for each row and print them
word_repr_tfidf = {}
for texts_id, target_words in words_dict_tfidf.items():
    list_targets = [(k, v) for k, v in target_words.items()]
    list_targets_sorted = sorted(list_targets, key=lambda x: x[1], reverse=True)
    word_repr_tfidf[texts_id] = list_targets_sorted[0:20]

print(word_repr_tfidf)

{0: [('book', 0.5224229744107162), ('read', 0.3134537846464297), ('story', 0.2437973880583342), ('just', 0.18110663112904826), ('characters', 0.16717535181142917), ('good', 0.13931279317619097), ('im', 0.13931279317619097), ('like', 0.1323471535173814), ('really', 0.1323471535173814), ('great', 0.11145023454095278), ('love', 0.11145023454095278), ('novel', 0.11145023454095278), ('reading', 0.09751895522333368), ('think', 0.09751895522333368), ('character', 0.09055331556452413), ('did', 0.08358767590571459), ('books', 0.07662203624690503), ('doesnt', 0.07662203624690503), ('interesting', 0.07662203624690503), ('liked', 0.07662203624690503)]}


The output shows that the top 20 keywords for the whole dataset are: 'book', 'read', 'story', 'just', 'characters', 'good', 'im', 'like', 'really', 'great', 'love', 'novel', 'reading', 'think', 'character', 'did', 'books', 'doesnt', 'interesting', ‘liked'.

# Topic Modelling

### Topic Model for the Positive Sentences

In [25]:
# create a new dataframe with only positive sentiment
texts_pos = df.loc[df['sentiment'] == 1]['texts']
texts_pos = texts_pos.apply(lambda x: x.lower())
texts_pos.head()

5     i did like steven (or stephen? i listened to t...
6     the plot is quick moving and the action is vio...
7                      loved everything about this book
9                                     great, quick read
10    although there isn't character development, as...
Name: texts, dtype: object

#### Pre-processing

In [26]:
#remove non-alphabetic characters

def remove_non_alphabetic(texts):
    return [re.sub('[^a-zA-Z]', ' ', str(doc)) for doc in texts]
    
texts_pos_clean = remove_non_alphabetic(texts_pos)

In [27]:
# remove stop words from each positive text

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

stop_words = stopwords.words('english')
texts_pos_nostops = remove_stopwords(texts_pos_clean)

In [28]:
# reduce inflected words to their root word

def lemmatize(texts):
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(word) for word in doc] for doc in texts]

texts_pos_lemmatized = lemmatize(texts_pos_nostops)

In [29]:
texts_pos_ppr = texts_pos_lemmatized

In [30]:
print(texts_pos_ppr)

[['like', 'steven', 'stephen', 'listened', 'book'], ['plot', 'quick', 'moving', 'action', 'violent'], ['loved', 'everything', 'book'], ['great', 'quick', 'read'], ['although', 'character', 'development', 'case', 'mystery', 'novel', 'yet', 'reader', 'come', 'enlightened', 'several', 'notion', 'idea', 'life'], ['liked', 'ending'], ['believe', 'barnes', 'delivers', 'promise', 'book', 'garden', 'stone', 'well', 'worth', 'read'], ['finish', 'pig', 'island', 'really', 'like', 'ending', 'read', 'hanging', 'hill'], ['narrator', 'good', 'audiobook', 'seven', 'hour'], ['great', 'story', 'girl', 'friendship', 'dog'], ['read', 'book', 'admire'], ['year', 'old', 'loved'], ['much', 'liked', 'main', 'character', 'never', 'questioned', 'acted', 'reacted', 'bizarre', 'situation', 'impressive'], ['think', 'another', 'reason', 'enjoyed', 'much', 'reminded', 'bit'], ['read', 'hayder', 'hanging', 'hill', 'enjoyed', 'ending', 'maybe', 'yelled', 'book', 'little', 'bit', 'read', 'pig', 'island'], ['cathy', 'a

In [31]:
# creates the vocabulary of the corpus (the vocabulary of all the positive sentiments in the dataset) 
# and associates it with a unique id. Words are represented by unique ids.
id2word = corpora.Dictionary(texts_pos_ppr)

# create a Bag-of-words of each text in texts_pos
corpus_pos = [id2word.doc2bow(text) for text in texts_pos_ppr]

In [32]:
print(corpus_pos[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(0, 1), (10, 1), (11, 1)], [(8, 1), (12, 1), (13, 1)], [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)]]


Prints the first five elements of the list.

The first number is the index of the word in the vocabulary and the second number is the frequency of this word in the specific document

In [33]:
# see the actual words
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus_pos[:3]])

[[('book', 1), ('like', 1), ('listened', 1), ('stephen', 1), ('steven', 1)], [('action', 1), ('moving', 1), ('plot', 1), ('quick', 1), ('violent', 1)], [('book', 1), ('everything', 1), ('loved', 1)]]


Train an LDA model.

Use corpus_pos for topic modelling, which was generated from the positive texts after removing stop words.

passes: total number of training passes

chunksize: number of documents processed at a time

In [34]:
# for 'num_topics' parameter , create an LDA model with 4 topics
# for 'passes' parameter, perform 100 passes over the corpus during training
# for 'chunksize' parameter, processing each chunk of 20 documents

lda_model_pos_100p = gensim.models.ldamodel.LdaModel(
   corpus=corpus_pos, id2word=id2word, num_topics=4, random_state=42, 
   update_every=1, chunksize=10, passes=100, alpha='auto', per_word_topics=True
)

In [35]:
pprint(lda_model_pos_100p.print_topics())

[(0,
  '0.014*"world" + 0.014*"series" + 0.013*"well" + 0.013*"developed" + '
  '0.011*"could" + 0.010*"hero" + 0.010*"many" + 0.010*"one" + '
  '0.010*"crawford" + 0.008*"part"'),
 (1,
  '0.037*"character" + 0.024*"good" + 0.021*"novel" + 0.020*"great" + '
  '0.015*"lot" + 0.015*"reader" + 0.014*"mystery" + 0.012*"liked" + '
  '0.012*"better" + 0.012*"story"'),
 (2,
  '0.059*"book" + 0.025*"read" + 0.016*"much" + 0.015*"really" + 0.015*"love" '
  '+ 0.011*"think" + 0.011*"story" + 0.011*"first" + 0.010*"time" + '
  '0.009*"way"'),
 (3,
  '0.032*"read" + 0.023*"well" + 0.012*"bit" + 0.012*"hill" + 0.010*"little" + '
  '0.010*"take" + 0.010*"fun" + 0.010*"voice" + 0.009*"see" + 0.009*"star"')]


In [36]:
# The output is stored in the variable doc_lda_pos
doc_lda_pos = lda_model_pos_100p[corpus_pos]

In [37]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model_pos_100p, corpus_pos, id2word)
vis

  default_term_info = default_term_info.sort_values(


In [38]:
pyLDAvis.save_html(vis, 'lda_visualization_pos.html')

### Topic Model for the Negative Sentences

In [39]:
# create a new dataframe with only negative sentiment

texts_neg = df.loc[df['sentiment'] == -1]['texts']
texts_neg.head()

2     Mild, because it isn't conclusive, and doesn't...
12    Going in I really liked it but unfortunately l...
14    I'm not giving this 5 stars because the big re...
17    eh I hate how the author made Duke from a nice...
21    I guess I didn't track this on goodreads when ...
Name: texts, dtype: object

#### Pre-processing

In [40]:
#remove non-alphabetic characters

texts_neg_clean = remove_non_alphabetic(texts_neg)

# remove stop words from each negativetext

stop_words = stopwords.words('english')
texts_neg_nostops = remove_stopwords(texts_neg_clean)

# reduce inflected words to their root word

texts_neg_lemmatized = lemmatize(texts_neg_nostops)

texts_neg_ppr = texts_neg_lemmatized

In [41]:
id2word = corpora.Dictionary(texts_neg_ppr)
corpus_neg = [id2word.doc2bow(text) for text in texts_neg_ppr]

In [42]:
# see the actual words
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus_neg[:3]])

[[('appreciate', 1), ('conclusive', 1), ('fully', 1), ('give', 1), ('information', 1), ('lloyd', 1), ('mild', 1), ('need', 1), ('story', 1), ('telling', 1), ('u', 1)], [('book', 1), ('bored', 1), ('end', 1), ('finished', 1), ('first', 1), ('going', 1), ('hard', 1), ('infuriated', 1), ('left', 1), ('liked', 1), ('really', 2), ('think', 1), ('unfortunately', 1), ('wanted', 1)], [('really', 1), ('big', 1), ('case', 1), ('classic', 1), ('disappointed', 1), ('giving', 1), ('kind', 1), ('possible', 1), ('probable', 1), ('reveal', 1), ('star', 1)]]


In [43]:
# for 'num_topics' parameter , create an LDA model with 4 topics
# for 'passes' parameter, perform 100 passes over the corpus during training
# for 'chunksize' parameter, processing each chunk of 5 documents

lda_model_neg_100p = gensim.models.ldamodel.LdaModel(
   corpus=corpus_neg, id2word=id2word, num_topics=4, random_state=42, 
   update_every=1, chunksize=5, passes=100, alpha='auto', per_word_topics=True
)

In [44]:
pprint(lda_model_neg_100p.print_topics())

[(0,
  '0.034*"character" + 0.022*"felt" + 0.021*"feel" + 0.018*"made" + '
  '0.016*"author" + 0.016*"every" + 0.013*"scene" + 0.012*"even" + '
  '0.012*"cheesier" + 0.010*"one"'),
 (1,
  '0.061*"book" + 0.037*"first" + 0.022*"part" + 0.021*"like" + 0.020*"sequel" '
  '+ 0.018*"unfortunately" + 0.017*"overlooked" + 0.017*"largely" + '
  '0.017*"ignored" + 0.017*"flaw"'),
 (2,
  '0.020*"get" + 0.018*"interesting" + 0.017*"much" + 0.014*"reading" + '
  '0.013*"place" + 0.013*"got" + 0.013*"though" + 0.011*"sort" + 0.009*"found" '
  '+ 0.009*"lee"'),
 (3,
  '0.017*"disappointed" + 0.016*"story" + 0.015*"time" + 0.015*"really" + '
  '0.014*"got" + 0.013*"good" + 0.013*"something" + 0.011*"much" + '
  '0.010*"wanted" + 0.010*"finished"')]


In [45]:
pyLDAvis.enable_notebook()
vis_neg = pyLDAvis.gensim_models.prepare(lda_model_neg_100p, corpus_neg, id2word)
vis_neg

  default_term_info = default_term_info.sort_values(


In [46]:
pyLDAvis.save_html(vis_neg, 'lda_visualization.html')