In [1]:
# LSI (Latent Semantic Indexing) is an indexing and information retrieval method that uses 
# Singular Value Decomposition to identify relationships between terms and concepts in a collection of documents.

# Extracts the conceptual content of a document by establising associations
# between those terms that occur in similar cotexts - these terms are more likely to have similar meanings

# Here, we will use Gensim, but the LSI model is also present in Sklearn:
# sklearn.decomposition.TruncatedSVD

In [2]:
# Gensim is a free Python library to analyze plain text documents for semantic structure
import gensim 

import nltk
nltk.download('wordnet') # WordNET is a lexical database of words in more than 200 languages
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/isavchuk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/isavchuk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/isavchuk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/isavchuk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Importing libraries and packages 
import pandas as pd
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.corpora import Dictionary
from gensim.models import LsiModel

In [4]:
# Dataset - large multi-domain ontology derived from Wikipedia, can be found at:
# https://www.kaggle.com/datasets/danofer/dbpedia-classes
# http://dbpedia-generic.tib.eu/release/text/
dbpedia_df = pd.read_csv('./datasets/dbpedia/DBPEDIA_train.csv')

dbpedia_df.shape

(240942, 4)

In [5]:
NUM_SAMPLES = 10000

dbpedia_df = dbpedia_df.sample(NUM_SAMPLES, random_state=1000, replace=False).reset_index(drop=True)
dbpedia_df.sample(5)

Unnamed: 0,text,l1,l2,l3
9431,"8776 Campestris, provisional designation 2287 ...",Place,CelestialBody,Planet
7159,Les Capewell is a retired English professional...,Agent,Athlete,DartsPlayer
2513,Cteniza moggridgei is a spider species found i...,Species,Animal,Arachnid
9676,The Zeitschrift für Ostmitteleuropa-Forschung ...,Work,PeriodicalLiterature,AcademicJournal
6793,The 1991 PGA Championship was the 73rd PGA Cha...,Event,Tournament,GolfTournament


In [6]:
text = dbpedia_df['text']
text.head()

0    Piz de la Lumbreida is a mountain of the Lepon...
1    The Men's 81 kg Judo competition at the 2008 S...
2    21561 Masterman (1998 QR93) is a main-belt ast...
3    The Knud Rasmussen Range (Danish: Knud Rasmuss...
4    Richard \"Dick\" Ramsdale (birth registered Ap...
Name: text, dtype: object

In [7]:
# Pre-processing to remove numbers
documents_list=[]

for line in text:
    sentence = line.strip()
    new_sentence= re.sub(r"\d","",sentence)
    
    documents_list.append(new_sentence)
    
print(documents_list[0])

Piz de la Lumbreida is a mountain of the Lepontine Alps, overlooking San Bernardino in the canton of Graubünden.


In [8]:
len(documents_list) # we now have list of 10000 text documents, with all numbers removed

10000

In [9]:
stop_words = set(stopwords.words('english'))
stop_words.update('.', ',', '"', "'", '!', '?', ';', ':', '(', ')','[', ']', '{', '}', '#', '...', '--', "'s", 'also', 
                  '&', '-', '—', '=', 'known', 'mi', 'km', '$', "''", '\\', '*', '–', "'s", '\\n', )

In [10]:
processed_list = []
lemmatizer = WordNetLemmatizer()

for doc in documents_list:
    tokens = word_tokenize(doc.lower())
    stopped_tokens = [token for token in tokens if token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(i, pos='n') for i in stopped_tokens]
    processed_list.append(lemmatized_tokens)
    
print(processed_list[0])

['piz', 'de', 'la', 'lumbreida', 'mountain', 'lepontine', 'alp', 'overlooking', 'san', 'bernardino', 'canton', 'graubünden']


In [11]:
len(processed_list)

10000

In [12]:
term_dictionary = Dictionary(processed_list)
print(term_dictionary)

Dictionary<66723 unique tokens: ['alp', 'bernardino', 'canton', 'de', 'graubünden']...>


In [13]:
term_dictionary.token2id["maximum"]

55

In [14]:
term_dictionary.token2id["ridge"]

1747

In [15]:
# Document-term matrix is a mathematical matrix that describes the frequency of terms
# that occur in a set of documents
document_term_matrix = [term_dictionary.doc2bow(document) 
                        for document in processed_list] #generate Bag-of-words representation
len(document_term_matrix)

10000

In [16]:
# Let's have a look at one particular document
print(processed_list[5])
print(document_term_matrix[5], '\n')
print('List of words and document terms: ', len(processed_list[5]), len(document_term_matrix[5]))
print('Set of words and document terms: ', len(set(processed_list[5])), len(document_term_matrix[5]))

['john', 'warren', 'bettis', 'october', 'june', 'ohio', 'jurist', 'served', 'judge', 'ohio', 'court', 'claim']
[(43, 1), (149, 1), (163, 1), (164, 1), (165, 1), (166, 1), (167, 1), (168, 1), (169, 2), (170, 1), (171, 1)] 

List of words and document terms:  12 11
Set of words and document terms:  11 11


In [17]:
NUM_TOPICS = 7

model = LsiModel(corpus=document_term_matrix, num_topics=NUM_TOPICS, id2word=term_dictionary)

# To show the keywords associated with the topics, along with the score
lsi_topics = model.show_topics(num_topics=NUM_TOPICS, formatted=False)
lsi_topics 

[(0,
  [("''", 0.7868019465282237),
   ("'s", 0.2949460633717554),
   ('album', 0.15503679854182595),
   ('first', 0.12029661732505383),
   ('also', 0.11615194695443597),
   ('single', 0.09729577077536046),
   ('song', 0.0935425692511347),
   ('one', 0.08980391920778537),
   ('released', 0.08950381575486088),
   ('year', 0.08113436708630277)]),
 (1,
  [('\\n', 0.9329067589089908),
   ('callistomimus', 0.16162991628371332),
   ('nosodendron', 0.15446341113908774),
   ("''", -0.12109436618298937),
   ('hyalella', 0.08957836824909708),
   ('arctodiaptomus', 0.08421716634244263),
   ('ceraeochrysa', 0.08046257864131594),
   ('geosesarma', 0.07480803196162861),
   ('th', 0.04982636289752765),
   ('division', 0.03929004531129575)]),
 (2,
  [("''", 0.47130589671823137),
   ("'s", -0.30038389507087965),
   ('team', -0.2252121031603723),
   ('season', -0.22512358971487034),
   ('league', -0.18238261649991783),
   ('\\n', 0.1554818831827751),
   ('first', -0.15207340076753897),
   ('football', -