# Project 2- Topic Classification

In [21]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# set of stop words
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kasia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kasia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
with open('meditations.txt', encoding="utf8") as f:
    lines = f.readlines()

In [23]:
lines

['\ufeffThe Project Gutenberg eBook of Meditations, by Marcus Aurelius\n',
 '\n',
 'This eBook is for the use of anyone anywhere in the United States and\n',
 'most other parts of the world at no cost and with almost no restrictions\n',
 'whatsoever. You may copy it, give it away or re-use it under the terms\n',
 'of the Project Gutenberg License included with this eBook or online at\n',
 'www.gutenberg.org. If you are not located in the United States, you\n',
 'will have to check the laws of the country where you are located before\n',
 'using this eBook.\n',
 '\n',
 'Title: Meditations\n',
 '\n',
 'Author: Marcus Aurelius\n',
 '\n',
 'Translator: Meric Casaubon\n',
 '\n',
 'Release Date: June, 2001 [eBook #2680]\n',
 '[Most recently updated: March 8, 2021]\n',
 '\n',
 'Language: English\n',
 '\n',
 'Character set encoding: UTF-8\n',
 '\n',
 'Produced by: J. Boulton and David Widger\n',
 '\n',
 '*** START OF THE PROJECT GUTENBERG EBOOK MEDITATIONS ***\n',
 '\n',
 '\n',
 '\n',
 '\n',
 

In [24]:
def preprocess_review(lines):
    #remove punctuation
    tokenized = word_tokenize(str(lines)) # tokenize
    tokenized_alnum = [word for word in tokenized if word.isalnum()] #alphanum only
    tokens_lowercase = [token.lower() for token in tokenized_alnum] #change all to lowercase
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens_lowercase] # lemmatize
    #add POS
    filtered_words = [word for word in lemmatized if word not in stopwords.words('english')] #remove stopwords
    return str(filtered_words)

In [25]:
preprocess_review(lines)

"['project', 'gutenberg', 'ebook', 'meditation', 'marcus', 'ebook', 'use', 'anyone', 'anywhere', 'united', 'state', 'part', 'world', 'cost', 'almost', 'may', 'copy', 'give', 'away', 'project', 'gutenberg', 'license', 'included', 'ebook', 'online', 'located', 'united', 'state', 'check', 'law', 'country', 'located', 'marcus', 'meric', 'date', 'june', '2001', 'ebook', '2680', 'recently', 'updated', 'march', '8', '2021', 'set', 'encoding', 'boulton', 'david', 'start', 'project', 'gutenberg', 'ebook', 'meditation', 'marcus', 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth', 'aurelius', 'antoninus', 'wa', 'born', 'april', '26', '121', 'real', 'annius', 'verus', 'wa', 'sprung', 'noble', 'family', 'numa', 'second', 'king', 'rome', 'thus', 'religious', 'came', 'blood', 'pious', 'early', 'king', 'father', 'verus', 'held', 'high', 'office', 'rome', 'grandfather', 'name', 'thrice', 'consul', 'parent', 'died', 'young', 'marcus', 'h

In [26]:
# import packages from scikit-learn
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(lines)

In [30]:
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components= 10, max_iter=10, learning_method='online') # no of components = no. of topics
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components= 10)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=10)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(7231, 10)
(7231, 10)
(7231, 10)




In [31]:
# inspect the inferred topics
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

In [32]:
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('man', 250.97408473436712), ('thyself', 189.8469364731546), ('time', 131.62712493684538), ('project', 130.47340890503034), ('gutenberg-tm', 71.63893947169942), ('thine', 66.37612158087572), ('present', 65.75404878796223), ('gutenberg', 45.12749224488418), ('love', 42.668658722423125), ('soon', 41.19586809003059)]
Topic 1:
[('thou', 785.3496394747009), ('nature', 188.17199440630563), ('whatsoever', 163.0625465149156), ('shall', 161.4567430916252), ('world', 151.8991908521951), ('art', 113.58259852215636), ('shalt', 110.30407427717151), ('according', 85.45364169168957), ('long', 68.77968980309126), ('live', 67.42756078048706)]
Topic 2:
[('unto', 415.26046433516933), ('thy', 332.2095944411414), ('life', 159.11807081234662), ('let', 93.37484735819474), ('man', 90.19790681213455), ('nature', 72.24268144664063), ('gods', 70.21911548898497), ('particular', 66.35385084050854), ('universe', 59.371291118524034), ('right', 48.51511944169966)]
Topic 3:
[('thee', 294.954028508



In [33]:
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)


NMF Model:
Topic 0:
[('thou', 6.19423333974056), ('art', 0.814944658947285), ('shalt', 0.7764789173409067), ('hast', 0.6777785149536844), ('dost', 0.48389088494673366), ('mayest', 0.2930513854473862), ('doest', 0.2675264917529367), ('wilt', 0.25153568140300103), ('mayst', 0.17105169984094834), ('whatsoever', 0.15676965635223608)]
Topic 1:
[('things', 5.757659064167656), ('world', 0.3052084335530376), ('happen', 0.22720682080908933), ('worldly', 0.15777941056554004), ('consider', 0.13197468016650166), ('shall', 0.11952144783523701), ('according', 0.10609604094696888), ('like', 0.0840068402906415), ('use', 0.08308228337519842), ('mind', 0.07703700637677512)]
Topic 2:
[('unto', 4.911024867557624), ('happen', 0.3540277947357672), ('men', 0.18205290932236556), ('like', 0.16639471918038082), ('common', 0.15419367418840618), ('whatsoever', 0.12277729130632963), ('natural', 0.11663044071444695), ('gods', 0.10562220776653701), ('subject', 0.09708238687516918), ('happened', 0.09428496224581348)]

In [34]:
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LSI Model:
Topic 0:
[('thou', 0.8852409680734366), ('things', 0.17879531608160454), ('thy', 0.16817346187632787), ('unto', 0.15046502245734186), ('thee', 0.150345419926736), ('art', 0.11775051973317172), ('thyself', 0.11165986194550036), ('shalt', 0.11050806347047575), ('hast', 0.10071676076894585), ('man', 0.0889691806192033)]
Topic 1:
[('things', 0.7133594770612355), ('unto', 0.45007717770233163), ('thee', 0.2199263057486857), ('man', 0.19551934762616904), ('doth', 0.128659362109958), ('nature', 0.10308227668995726), ('good', 0.08584228952011604), ('happen', 0.08029687348856394), ('thy', 0.07555267342706977), ('world', 0.06319595286783095)]
Topic 2:
[('unto', 0.5505804182239338), ('thee', 0.3425392579873189), ('man', 0.23047723445827933), ('thy', 0.19403163789433048), ('doth', 0.12381836369995305), ('good', 0.058374942102960826), ('hath', 0.054289574570913456), ('nature', 0.04958130365596929), ('whatsoever', 0.048038539523396526), ('let', 0.040840506961922415)]
Topic 3:
[('thy', 0.87

In [35]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting future
  Using cached future-0.18.2.tar.gz (829 kB)
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Collecting numexpr
  Downloading numexpr-2.8.1-cp39-cp39-win_amd64.whl (88 kB)
Using legacy 'setup.py install' for future, since package 'wheel' is not installed.
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517): started
  Building wheel for pyLDAvis (PEP 517): finished with status 'done'
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1

You should consider upgrading via the 'c:\users\kasia\pyproj\nlp-for-business\my_env\scripts\python.exe -m pip install --upgrade pip' command.


In [36]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

  default_term_info = default_term_info.sort_values(
