![MSE Logo](https://moodle.msengineering.ch/pluginfile.php/1/core_admin/logocompact/300x300/1613732714/logo-mse.png "MSE Logo") 

# Latent Semantic Analysis with Gensim

In [1]:
import nltk
import pandas as pd
from TextPreprocessor import TextPreprocessor
from gensim import models, corpora, similarities
import json

from nltk.corpus import stopwords, wordnet

In [2]:
content = {}

with open("../../content_daily2.json") as f:
    content.update(json.load(f))
with open("../../content_guardian2.json") as f:
    content.update(json.load(f))
with open("../../content_huffpost2.json") as f:
    content.update(json.load(f))

text = list(content.values())

In [3]:
data_df = pd.DataFrame({'text': text})

## Data preprocessing

You will need first to preprocess the data through the following stages:
1. tokenization
2. stopword removal
2. POS-based filtering (optional)
3. lemmatization or stemming (optional)
4. addition of bigrams to each document (optional)
5. filtering of infrequent words
6. inspection and filtering of frequent words

In [4]:
language = 'english'
stop_words = set(stopwords.words(language))
stop_words = list(stop_words).extend(['\"', '\'', '\'\'', '`', '``', '\'s',"'",','])

processor = TextPreprocessor(
    language = language,
    pos_tags = {"n","j"},
    stopwords = stop_words,
    lemmatize=True,
    remove_numbers=True,
    punctuations=['.', ',', ':', ';', '?', '!', '"', '\'', '`', '``', '\'s','"'],
)

In [5]:
data_df['processed'] = processor.transform(data_df['text'])

In [6]:
data_df['tokenized'] = data_df['processed'].apply(nltk.word_tokenize)   

In [7]:
data_df.head()

Unnamed: 0,text,processed,tokenized
0,Deborah James has been honoured with a Damehoo...,deborah james damehood ps4million charity sinc...,"[deborah, james, damehood, ps4million, charity..."
1,Aussie children got the surprise of their live...,aussie child surprise life grandmother lounge ...,"[aussie, child, surprise, life, grandmother, l..."
2,Durham Police retrospectively fined a woman PS...,"durham police woman ps10,000 breaching covid r...","[durham, police, woman, ps10,000, breaching, c..."
3,Liz Truss will overhaul the foreign aid budget...,liz truss overhaul aid budget britain hand mon...,"[liz, truss, overhaul, aid, budget, britain, h..."
4,James Corden has praised Prince Harry as a dev...,james corden prince harry husband father child...,"[james, corden, prince, harry, husband, father..."


In [8]:
words_to_filter = {"u","two","could","mr","mp","one", "per", "cent","hrt","--","the","i","'s","...","..","u","iii","'","it","if","would","could","may","should","will","can","that","like","use","need", "no","put","we","you","he","she","us"}

data_df["filter"] = data_df["tokenized"].apply(lambda x: [word.replace("'","").replace(",","").replace(".","") for word in x]).apply(lambda x: [word for word in x if not word.isnumeric()]).apply(lambda x: [word for word in x if word not in words_to_filter]).apply(lambda x: [word for word in x if not word[:2] == "ps"])

In [9]:
vocabulary = data_df['filter'].sum()

most_common = [word for word, freq in nltk.FreqDist(vocabulary).items() if freq > 4 and word not in words_to_filter]

print(f"Vocabulary length: {len(nltk.FreqDist(vocabulary))}")
print(f"Most common words: {len(most_common)}")

data_df.head()

Vocabulary length: 46342
Most common words: 17023


Unnamed: 0,text,processed,tokenized,filter
0,Deborah James has been honoured with a Damehoo...,deborah james damehood ps4million charity sinc...,"[deborah, james, damehood, ps4million, charity...","[deborah, james, damehood, charity, since, mon..."
1,Aussie children got the surprise of their live...,aussie child surprise life grandmother lounge ...,"[aussie, child, surprise, life, grandmother, l...","[aussie, child, surprise, life, grandmother, l..."
2,Durham Police retrospectively fined a woman PS...,"durham police woman ps10,000 breaching covid r...","[durham, police, woman, ps10,000, breaching, c...","[durham, police, woman, breaching, covid, rule..."
3,Liz Truss will overhaul the foreign aid budget...,liz truss overhaul aid budget britain hand mon...,"[liz, truss, overhaul, aid, budget, britain, h...","[liz, truss, overhaul, aid, budget, britain, h..."
4,James Corden has praised Prince Harry as a dev...,james corden prince harry husband father child...,"[james, corden, prince, harry, husband, father...","[james, corden, prince, harry, husband, father..."


## LSA with Gensim

In [10]:
def train_lsa(filtered_texts, num_topics = 10):
    dictionary = corpora.Dictionary(filtered_texts)
    corpus = [dictionary.doc2bow(text) for text in filtered_texts]

    # transform the vectors to tf-idf representation
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    lsa = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    corpus_tfidf = lsa[corpus_tfidf]

    return lsa,dictionary,corpus,corpus_tfidf

In [11]:
number_of_topics = 50

In [12]:
lsa_model, dictionary, corpus, corpus_tfidf = train_lsa(data_df['filter'], number_of_topics)

In [13]:
for index, topic in lsa_model.print_topics(number_of_topics, 5):
    print(f"{index:3d}: {topic}")

  0: 0.213*"ukraine" + 0.170*"russia" + 0.121*"labour" + 0.107*"war" + 0.105*"putin"
  1: 0.406*"ukraine" + 0.351*"russia" + 0.212*"putin" + -0.209*"labour" + 0.179*"war"
  2: 0.424*"abortion" + -0.275*"labour" + -0.232*"keir" + 0.188*"roe" + -0.179*"starmer"
  3: -0.507*"abortion" + -0.224*"roe" + -0.167*"labour" + -0.150*"keir" + 0.148*"musk"
  4: -0.325*"musk" + -0.214*"twitter" + 0.189*"rebekah" + 0.181*"vardy" + -0.179*"price"
  5: -0.539*"musk" + -0.370*"twitter" + -0.195*"trump" + 0.179*"rate" + 0.153*"price"
  6: 0.426*"rebekah" + 0.410*"vardy" + 0.318*"coleen" + 0.254*"rooney" + 0.234*"watt"
  7: 0.342*"queen" + 0.267*"harry" + 0.257*"prince" + 0.233*"royal" + 0.188*"meghan"
  8: 0.226*"keir" + 0.203*"durham" + -0.198*"ireland" + 0.188*"rate" + 0.159*"price"
  9: 0.556*"trump" + 0.353*"biden" + -0.232*"musk" + 0.175*"president" + -0.161*"abortion"
 10: 0.521*"vicky" + 0.380*"casey" + 0.199*"sheriff" + 0.188*"singleton" + 0.185*"lauderdale"
 11: -0.673*"depp" + -0.330*"heard" +

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
def wordsim(word1, word2, model, dictionary):
    vec_w1=dictionary.doc2bow(word1.lower().split())
    vec_w2=dictionary.doc2bow(word2.lower().split())

    #get words in lsa space
    lsa_w1=model[vec_w1]
    lsa_w2=model[vec_w2]

    return cosine_similarity(lsa_w1,lsa_w2)

In [17]:
from gensim import similarities

In [18]:
def word_ranking(word0, word_list, model, dictionary):
    # transform corpus to LSI space and index it
    vec_w0=dictionary.doc2bow(word0.lower().split())
    vec_w_list=[dictionary.doc2bow(text.lower().split()) for text in word_list]
    index = similarities.MatrixSimilarity(model[vec_w_list])

    #get word in lsa space
    lsa_w0=model[vec_w0]

    sims_w0 = index[lsa_w0]
    sims = sorted(enumerate(sims_w0), key=lambda item: -item[1])
    for doc_position, doc_score in sims:
        print(doc_score, dictionary[doc_position])

In [19]:
# call here the function on your choice of words

word_ranking("ukraine",set(vocabulary),lsa_model,dictionary)

1.0 tint
0.9917381 succeeds
0.98781854 dogmocracy
0.98700285 persecution
0.9863183 thrifty
0.98525906 life-limiting
0.98410064 tarmacked
0.98348814 ambulance
0.98134273 abd
0.9789631 install
0.9789631 falmouth
0.9789631 siva
0.9789631 tedium
0.9789631 breakneck
0.9783505 finlay
0.9783333 taiba
0.97766346 downbeat
0.9772215 gadget
0.97656524 groundwork
0.9755578 black-tie
0.974932 peterborough
0.9748407 bogorodsk
0.9745182 beer-drinking
0.9745182 developmental
0.973927 replenish
0.9736546 abduction
0.973055 brockenhurst
0.9718622 millen
0.971232 westerberg
0.9709941 kallum
0.9706558 o2
0.97040033 mega-millions
0.9701597 kensal
0.9696203 nyes
0.9694263 hogshead
0.9693794 giulia
0.9693475 hard
0.9688416 omnipresent
0.9687472 talbot
0.9683809 suitcase
0.96830803 blunts
0.9679539 project
0.96790206 waikiki
0.96790206 mini-budget
0.96790206 shailene
0.96790206 mudgee
0.96790206 yalyshev
0.96790206 coarsen
0.96790206 neytiri
0.9678335 speedwell
0.96705496 rehearse
0.96665996 sell
0.96656805 a

In [80]:
nltk.FreqDist(vocabulary).most_common(200)

[('year', 1122),
 ('people', 935),
 ('time', 846),
 ('told', 655),
 ('labour', 641),
 ('party', 634),
 ('day', 586),
 ('woman', 577),
 ('government', 514),
 ('ukraine', 488),
 ('election', 435),
 ('country', 434),
 ('right', 432),
 ('home', 428),
 ('court', 420),
 ('week', 394),
 ('family', 393),
 ('since', 392),
 ('tory', 389),
 ('minister', 385),
 ('child', 381),
 ('state', 381),
 ('think', 380),
 ('council', 379),
 ('uk', 368),
 ('johnson', 367),
 ('war', 362),
 ('russia', 360),
 ('month', 347),
 ('case', 341),
 ('show', 340),
 ('work', 331),
 ('heard', 324),
 ('leader', 317),
 ('house', 313),
 ('abortion', 313),
 ('city', 310),
 ('around', 307),
 ('want', 303),
 ('three', 301),
 ('company', 293),
 ('life', 292),
 ('police', 291),
 ('found', 290),
 ('another', 287),
 ('way', 269),
 ('group', 264),
 ('thing', 263),
 ('seat', 262),
 ('report', 259),
 ('part', 259),
 ('across', 258),
 ('decision', 251),
 ('result', 251),
 ('change', 250),
 ('public', 248),
 ('cost', 247),
 ('area', 245

In [15]:
id2word = corpora.Dictionary(data_df['filter'])

In [16]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lsa_model, texts=data_df['filter'], dictionary=id2word, coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()

In [17]:
coherence_lda

0.4089103738170688

In [22]:
import pyLDAvis
import pyLDAvis.gensim_models 
pyLDAvis.enable_notebook()
vis = pyLDAvis.prepare(lsa_model, corpus, vocab=id2word)
vis

TypeError: prepare() missing 2 required positional arguments: 'doc_lengths' and 'term_frequency'