<a href="https://colab.research.google.com/github/kalra-nitish/NLP/blob/main/LatentSemanticAnalysisCountVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD



In [6]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/all_book_titles.txt


--2024-07-26 01:02:18--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/all_book_titles.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 127992 (125K) [text/plain]
Saving to: ‘all_book_titles.txt’


2024-07-26 01:02:18 (5.06 MB/s) - ‘all_book_titles.txt’ saved [127992/127992]



In [7]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
wordnet_lemmatizer = WordNetLemmatizer()

In [9]:
titles = [line.rstrip() for line in open('all_book_titles.txt')]

In [10]:
titles

['Philosophy of Sex and Love A Reader',
 'Readings in Judaism, Christianity, and Islam',
 'Microprocessors Principles and Applications',
 'Bernhard Edouard Fernow: Story of North American Forestry',
 'Encyclopedia of Buddhism',
 'Motorola Microprocessor Family: 68000, 68008, 68010, 68020, 68030, and 68040, Programming and Interfacing with Applications',
 'American Anthem: Student Edition Modern Era 2007',
 'How to Read Literature Like a Professor A Lively and Entertaining Guide to Reading Between the Lines',
 'Men Are from Mars, Women Are from Venus Secrets of Great Sex, Improving Communication, Lasting Intimacy and Fulfillment, Giving and Receiving Love, Secrets of Passion, Understanding Martian',
 'Religious Traditions of the World A Journey Through Africa, Mesoamerica, North America, Judaism, Christianity, Islam, Hinduism, Buddhism, China, an',
 "World's Wisdom Sacred Texts of the World's Religions",
 "Illustrated World's Religions A Guide to Our Wisdom Traditions",
 'Soul of Sex Cu

In [11]:
stops = set(stopwords.words('english'))

In [12]:
stops

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [13]:
stops = stops.union({'introduction', 'edition', 'series', 'application', 'approach', 'card', 'access', 'package', 'plus', 'etext', 'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed', 'third', 'second', 'fourth', 'volume' })

In [17]:
def my_tokenizer(s):
  c = s.lower()
  tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
  tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
  tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] #put words in base form
  tokens = [t for t in tokens if t not in stops] # remove stopwords
  tokens = [t for t in tokens if not any(c.isdigit() for c in t)] #remove any digit like 3rd edition
  return tokens


In [18]:
vectorizer = CountVectorizer(binary= True, tokenizer=my_tokenizer)

In [19]:
X = vectorizer.fit_transform(titles)

In [20]:
# create indx > word map for plotting later
#conceptually what we want to do
# for word, index in vectorizer.vocabulary_.items():
# index_word_map[index] = word

#but it's already stored in the count vectorizer

index_word_map = vectorizer.get_feature_names_out()

In [21]:
#transpose X to make rows = terms, cols = documents
X = X.T

In [22]:
svd = TruncatedSVD()
Z = svd.fit_transform(X)

In [23]:
!pip install plotly



In [24]:
import plotly.express as px

In [25]:
fig = px.scatter(x=Z[:, 0], y=Z[:,1], text=index_word_map, size_max=60)
fig.update_traces(textposition='top center')
fig.show()