## **Step 1: Read documents from csv file** 

In [1]:
import pandas as pd

In [2]:
# Load the raw documents into a DataFrame; the file is decoded as cp1252.
corpus = pd.read_csv(
    'data1.csv',
    encoding='cp1252',
)

In [3]:
# Lift pandas' display limits so that full document text and every column
# are shown when the DataFrame is rendered below.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None

In [4]:
corpus

Unnamed: 0,topic,source,document
0,data_science,loc_1,Data Science (is) multi-disciplinary.
1,data_science,loc_2,Data Science has statistics.
2,data_science,loc_3,Python used in data science.
3,mental_health,loc_4,Mental Health (is) a state of the mind.
4,mental_health,loc_5,Mental Health includes emotional and psychological.
5,mental_health,loc_6,Watch out for mental health.
6,fashion,loc_7,Fashion can be a statement.
7,fashion,loc_8,"Fashion involves clothes, music, etc."
8,fashion,loc_9,Fashion is statement through clothes and music.


<br>

## **Step 2: Replace non-alphanumeric characters in documents with spaces**

In [5]:
# Swap every character that is not an ASCII letter or digit for a space, so
# punctuation such as "(", ")", "-" and "." turns into a word separator.
corpus['document_cleaned'] = corpus['document'].replace(
    to_replace='[^A-Za-z0-9]',
    value=' ',
    regex=True,
)

In [6]:
corpus

Unnamed: 0,topic,source,document,document_cleaned
0,data_science,loc_1,Data Science (is) multi-disciplinary.,Data Science is multi disciplinary
1,data_science,loc_2,Data Science has statistics.,Data Science has statistics
2,data_science,loc_3,Python used in data science.,Python used in data science
3,mental_health,loc_4,Mental Health (is) a state of the mind.,Mental Health is a state of the mind
4,mental_health,loc_5,Mental Health includes emotional and psychological.,Mental Health includes emotional and psychological
5,mental_health,loc_6,Watch out for mental health.,Watch out for mental health
6,fashion,loc_7,Fashion can be a statement.,Fashion can be a statement
7,fashion,loc_8,"Fashion involves clothes, music, etc.",Fashion involves clothes music etc
8,fashion,loc_9,Fashion is statement through clothes and music.,Fashion is statement through clothes and music


<br>

## **Step 3: Change characters in documents to lower case**

In [7]:
# Lower-case the cleaned text; the NLTK stopword list used later is all
# lower-case, so the documents have to match it.
lowercased_documents = corpus['document_cleaned'].str.lower()
corpus['document_cleaned'] = lowercased_documents

In [8]:
corpus

Unnamed: 0,topic,source,document,document_cleaned
0,data_science,loc_1,Data Science (is) multi-disciplinary.,data science is multi disciplinary
1,data_science,loc_2,Data Science has statistics.,data science has statistics
2,data_science,loc_3,Python used in data science.,python used in data science
3,mental_health,loc_4,Mental Health (is) a state of the mind.,mental health is a state of the mind
4,mental_health,loc_5,Mental Health includes emotional and psychological.,mental health includes emotional and psychological
5,mental_health,loc_6,Watch out for mental health.,watch out for mental health
6,fashion,loc_7,Fashion can be a statement.,fashion can be a statement
7,fashion,loc_8,"Fashion involves clothes, music, etc.",fashion involves clothes music etc
8,fashion,loc_9,Fashion is statement through clothes and music.,fashion is statement through clothes and music


<br>

## **Step 4: Get stopwords from NLTK**

In [9]:
from nltk.corpus import stopwords

In [10]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

<br>

## **Step 5: Split documents from Step 3 into lists of tokens (words), dropping the stopwords from Step 4**

In [11]:
# Fetch the English stopword list once and keep it as a set. The membership
# test below runs for every word of every document, so the list should not be
# rebuilt on each check, and a set gives a constant-time lookup instead of a
# linear scan.
english_stopwords = set(stopwords.words('english'))

# Split each cleaned document on whitespace and keep only the non-stopwords.
corpus['token_list'] = corpus['document_cleaned'].apply(
    lambda text: [word for word in text.split() if word not in english_stopwords]
)

In [12]:
corpus

Unnamed: 0,topic,source,document,document_cleaned,token_list
0,data_science,loc_1,Data Science (is) multi-disciplinary.,data science is multi disciplinary,"[data, science, multi, disciplinary]"
1,data_science,loc_2,Data Science has statistics.,data science has statistics,"[data, science, statistics]"
2,data_science,loc_3,Python used in data science.,python used in data science,"[python, used, data, science]"
3,mental_health,loc_4,Mental Health (is) a state of the mind.,mental health is a state of the mind,"[mental, health, state, mind]"
4,mental_health,loc_5,Mental Health includes emotional and psychological.,mental health includes emotional and psychological,"[mental, health, includes, emotional, psychological]"
5,mental_health,loc_6,Watch out for mental health.,watch out for mental health,"[watch, mental, health]"
6,fashion,loc_7,Fashion can be a statement.,fashion can be a statement,"[fashion, statement]"
7,fashion,loc_8,"Fashion involves clothes, music, etc.",fashion involves clothes music etc,"[fashion, involves, clothes, music, etc]"
8,fashion,loc_9,Fashion is statement through clothes and music.,fashion is statement through clothes and music,"[fashion, statement, clothes, music]"


<br>

## **Step 6: Count each and every token from Step 5**

In [13]:
from collections import defaultdict

In [14]:
# Corpus-wide occurrence count per token; unseen tokens start at 0.
token_frequency = defaultdict(int)

for tokens_in_document in corpus['token_list']:
    for word in tokens_in_document:
        token_frequency[word] += 1

<br>

## **Step 7: Keep only the tokens that occur more than once in the corpus (counts from Step 6)**

In [15]:
def _keep_repeated_tokens(tokens):
    """Return the tokens whose count in ``token_frequency`` is greater than 1."""
    return [token for token in tokens if token_frequency[token] > 1]


# Drop tokens that appear only once in the whole corpus.
corpus['token_list_new'] = corpus['token_list'].apply(_keep_repeated_tokens)

In [16]:
corpus

Unnamed: 0,topic,source,document,document_cleaned,token_list,token_list_new
0,data_science,loc_1,Data Science (is) multi-disciplinary.,data science is multi disciplinary,"[data, science, multi, disciplinary]","[data, science]"
1,data_science,loc_2,Data Science has statistics.,data science has statistics,"[data, science, statistics]","[data, science]"
2,data_science,loc_3,Python used in data science.,python used in data science,"[python, used, data, science]","[data, science]"
3,mental_health,loc_4,Mental Health (is) a state of the mind.,mental health is a state of the mind,"[mental, health, state, mind]","[mental, health]"
4,mental_health,loc_5,Mental Health includes emotional and psychological.,mental health includes emotional and psychological,"[mental, health, includes, emotional, psychological]","[mental, health]"
5,mental_health,loc_6,Watch out for mental health.,watch out for mental health,"[watch, mental, health]","[mental, health]"
6,fashion,loc_7,Fashion can be a statement.,fashion can be a statement,"[fashion, statement]","[fashion, statement]"
7,fashion,loc_8,"Fashion involves clothes, music, etc.",fashion involves clothes music etc,"[fashion, involves, clothes, music, etc]","[fashion, clothes, music]"
8,fashion,loc_9,Fashion is statement through clothes and music.,fashion is statement through clothes and music,"[fashion, statement, clothes, music]","[fashion, statement, clothes, music]"


<br>

## **Step 8: Put unique tokens from Step 7 in dictionary**

In [17]:
from gensim import corpora

In [18]:
# Build the gensim vocabulary (token -> integer id) from the filtered token lists.
dictionary_unique_tokens = corpora.Dictionary(
    documents=corpus['token_list_new'],
)

In [19]:
print(dictionary_unique_tokens)

Dictionary<8 unique tokens: ['data', 'science', 'health', 'mental', 'fashion']...>


<br>

## **Step 9: Convert documents from Step 7 to bag of words vector using dictionary in Step 8**

In [20]:
# Convert each filtered token list to a bag-of-words vector via the dictionary;
# the bound method is passed directly, so no wrapping lambda is needed.
corpus['bag_of_words_vector'] = corpus['token_list_new'].apply(
    dictionary_unique_tokens.doc2bow
)

In [21]:
corpus

Unnamed: 0,topic,source,document,document_cleaned,token_list,token_list_new,bag_of_words_vector
0,data_science,loc_1,Data Science (is) multi-disciplinary.,data science is multi disciplinary,"[data, science, multi, disciplinary]","[data, science]","[(0, 1), (1, 1)]"
1,data_science,loc_2,Data Science has statistics.,data science has statistics,"[data, science, statistics]","[data, science]","[(0, 1), (1, 1)]"
2,data_science,loc_3,Python used in data science.,python used in data science,"[python, used, data, science]","[data, science]","[(0, 1), (1, 1)]"
3,mental_health,loc_4,Mental Health (is) a state of the mind.,mental health is a state of the mind,"[mental, health, state, mind]","[mental, health]","[(2, 1), (3, 1)]"
4,mental_health,loc_5,Mental Health includes emotional and psychological.,mental health includes emotional and psychological,"[mental, health, includes, emotional, psychological]","[mental, health]","[(2, 1), (3, 1)]"
5,mental_health,loc_6,Watch out for mental health.,watch out for mental health,"[watch, mental, health]","[mental, health]","[(2, 1), (3, 1)]"
6,fashion,loc_7,Fashion can be a statement.,fashion can be a statement,"[fashion, statement]","[fashion, statement]","[(4, 1), (5, 1)]"
7,fashion,loc_8,"Fashion involves clothes, music, etc.",fashion involves clothes music etc,"[fashion, involves, clothes, music, etc]","[fashion, clothes, music]","[(4, 1), (6, 1), (7, 1)]"
8,fashion,loc_9,Fashion is statement through clothes and music.,fashion is statement through clothes and music,"[fashion, statement, clothes, music]","[fashion, statement, clothes, music]","[(4, 1), (5, 1), (6, 1), (7, 1)]"


<br>

## **Others: Get id of unique tokens**

In [22]:
print(dictionary_unique_tokens.token2id)

{'data': 0, 'science': 1, 'health': 2, 'mental': 3, 'fashion': 4, 'statement': 5, 'clothes': 6, 'music': 7}


**The token id shown here is the X in each (X, Y) pair of a bag-of-words vector; Y is how many times that token occurs in the document.**

<br>

## **Others: Get frequency of unique tokens**

In [23]:
# sorted by token

print(sorted(token_frequency.items()))

[('clothes', 2), ('data', 3), ('disciplinary', 1), ('emotional', 1), ('etc', 1), ('fashion', 3), ('health', 3), ('includes', 1), ('involves', 1), ('mental', 3), ('mind', 1), ('multi', 1), ('music', 2), ('psychological', 1), ('python', 1), ('science', 3), ('state', 1), ('statement', 2), ('statistics', 1), ('used', 1), ('watch', 1)]


In [24]:
# sorted by token frequency

print(sorted(token_frequency.items(), key=lambda item: item[1]))

[('multi', 1), ('disciplinary', 1), ('statistics', 1), ('python', 1), ('used', 1), ('state', 1), ('mind', 1), ('includes', 1), ('emotional', 1), ('psychological', 1), ('watch', 1), ('involves', 1), ('etc', 1), ('statement', 2), ('clothes', 2), ('music', 2), ('data', 3), ('science', 3), ('mental', 3), ('health', 3), ('fashion', 3)]


<br>

## **Step 10a: Apply Latent Semantic Indexing (LSI) - 1 topic with coherence score**

In [25]:
from gensim import models

In [26]:
# Fit LSI with a single topic on the bag-of-words vectors. The seed is fixed
# for reproducibility; onepass=False with power_iters=1000 asks gensim for the
# multi-pass solver with many power iterations (see the LsiModel docs).
lsi_model = models.LsiModel(
    corpus=corpus['bag_of_words_vector'],
    id2word=dictionary_unique_tokens,
    num_topics=1,
    random_seed=42,
    power_iters=1000,
    onepass=False,
)

In [27]:
import pprint

In [28]:
pprint.pprint(lsi_model.print_topics())

[(0,
  '0.616*"fashion" + 0.478*"music" + 0.478*"clothes" + 0.404*"statement" + '
  '0.000*"data" + 0.000*"science" + 0.000*"mental" + 0.000*"health"')]


In [29]:
from gensim.models.coherencemodel import CoherenceModel

In [30]:
# Score the 1-topic model with the 'u_mass' coherence measure, evaluated on
# the same bag-of-words corpus the model was fitted on.
cm = CoherenceModel(
    model=lsi_model,
    corpus=corpus['bag_of_words_vector'],
    coherence='u_mass',
)
coherence = cm.get_coherence()

In [31]:
print(coherence)

-18.870903033332613


<br>

## **Step 10b: Apply Latent Semantic Indexing (LSI) - 2 topics with coherence score**

In [32]:
# Refit LSI with two topics; every other setting matches the 1-topic run so
# the coherence scores are comparable.
lsi_model = models.LsiModel(
    corpus=corpus['bag_of_words_vector'],
    id2word=dictionary_unique_tokens,
    num_topics=2,
    random_seed=42,
    power_iters=1000,
    onepass=False,
)

In [33]:
pprint.pprint(lsi_model.print_topics())

[(0,
  '-0.616*"fashion" + -0.478*"clothes" + -0.478*"music" + -0.404*"statement" + '
  '0.000*"data" + 0.000*"science" + -0.000*"health" + -0.000*"mental"'),
 (1,
  '0.705*"data" + 0.705*"science" + -0.055*"health" + -0.055*"mental" + '
  '0.000*"fashion" + 0.000*"statement" + 0.000*"clothes" + 0.000*"music"')]


In [34]:
# Score the 2-topic model with the 'u_mass' coherence measure.
cm = CoherenceModel(
    model=lsi_model,
    corpus=corpus['bag_of_words_vector'],
    coherence='u_mass',
)
coherence = cm.get_coherence()

In [35]:
print(coherence)

-18.979509758718727


<br>

## **Step 10c: Apply Latent Semantic Indexing (LSI) - 3 topics with coherence score**

In [36]:
# Refit LSI with three topics (one per labelled topic in the data); all other
# settings are unchanged from the earlier runs.
lsi_model = models.LsiModel(
    corpus=corpus['bag_of_words_vector'],
    id2word=dictionary_unique_tokens,
    num_topics=3,
    random_seed=42,
    power_iters=1000,
    onepass=False,
)

In [37]:
pprint.pprint(lsi_model.print_topics())

[(0,
  '-0.616*"fashion" + -0.478*"clothes" + -0.478*"music" + -0.404*"statement" + '
  '0.000*"data" + 0.000*"science" + -0.000*"health" + -0.000*"mental"'),
 (1,
  '0.702*"data" + 0.702*"science" + -0.083*"health" + -0.083*"mental" + '
  '0.000*"fashion" + 0.000*"clothes" + 0.000*"music" + 0.000*"statement"'),
 (2,
  '-0.702*"health" + -0.702*"mental" + -0.083*"data" + -0.083*"science" + '
  '-0.000*"fashion" + 0.000*"statement" + 0.000*"clothes" + 0.000*"music"')]


In [38]:
# Score the 3-topic model with the 'u_mass' coherence measure.
cm = CoherenceModel(
    model=lsi_model,
    corpus=corpus['bag_of_words_vector'],
    coherence='u_mass',
)
coherence = cm.get_coherence()

In [39]:
print(coherence)

-18.92882662020521


<br>

## **Step 10d: Apply Latent Semantic Indexing (LSI) - 4 topics with coherence score**

In [40]:
# Refit LSI with four topics, one more than the number of labelled topics;
# all other settings are unchanged from the earlier runs.
lsi_model = models.LsiModel(
    corpus=corpus['bag_of_words_vector'],
    id2word=dictionary_unique_tokens,
    num_topics=4,
    random_seed=42,
    power_iters=1000,
    onepass=False,
)

In [41]:
pprint.pprint(lsi_model.print_topics())

[(0,
  '-0.616*"fashion" + -0.478*"clothes" + -0.478*"music" + -0.404*"statement" + '
  '0.000*"data" + 0.000*"science" + -0.000*"health" + -0.000*"mental"'),
 (1,
  '0.706*"data" + 0.706*"science" + -0.044*"mental" + -0.044*"health" + '
  '0.000*"clothes" + 0.000*"music" + 0.000*"fashion" + 0.000*"statement"'),
 (2,
  '0.706*"health" + 0.706*"mental" + 0.044*"data" + 0.044*"science" + '
  '0.000*"fashion" + -0.000*"statement" + -0.000*"clothes" + -0.000*"music"'),
 (3,
  '-0.725*"statement" + 0.457*"music" + 0.457*"clothes" + -0.235*"fashion" + '
  '-0.000*"mental" + 0.000*"health" + -0.000*"science" + 0.000*"data"')]


In [42]:
# Score the 4-topic model with the 'u_mass' coherence measure.
cm = CoherenceModel(
    model=lsi_model,
    corpus=corpus['bag_of_words_vector'],
    coherence='u_mass',
)
coherence = cm.get_coherence()

In [43]:
print(coherence)

-18.968649086180115
