In [2]:
# Install nltk
!pip3 install nltk

[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/bzl-2.2.1-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m


In [3]:
# Sample text that serves as the raw corpus for the sentence tokenization,
# cleaning, stemming and lemmatization steps below.
paragraph = "The scoring method being used above takes the count of each word and represents the word in the vector by the number of counts of that particular word. What does a word having high word count signify? Does this mean that the word is important in retrieving information about documents? The answer is NO. Let me explain, if a word occurs many times in a document but also along with many other documents in our dataset, maybe it is because this word is just a frequent word; not because it is relevant or meaningful. One approach is to rescale the frequency of words by how often they appear in all documents so that the scores for frequent words like “the” that are also frequent across all documents are penalized. This approach is called term frequency-inverse document frequency or shortly known as Tf-Idf approach of scoring.TF-IDF is intended to reflect how relevant a term is in a given document. So how is Tf-Idf of a document in a dataset calculated?"

In [6]:
# Split the paragraph into a list of sentences.
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize relies on the Punkt sentence-tokenizer models, so fetch them
# before first use; otherwise this cell raises LookupError on a fresh environment.
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt' -
# confirm against the installed version.
nltk.download('punkt', quiet=True)

sentences = sent_tokenize(paragraph)
sentences

['The scoring method being used above takes the count of each word and represents the word in the vector by the number of counts of that particular word.',
 'What does a word having high word count signify?',
 'Does this mean that the word is important in retrieving information about documents?',
 'The answer is NO.',
 'Let me explain, if a word occurs many times in a document but also along with many other documents in our dataset, maybe it is because this word is just a frequent word; not because it is relevant or meaningful.',
 'One approach is to rescale the frequency of words by how often they appear in all documents so that the scores for frequent words like “the” that are also frequent across all documents are penalized.',
 'This approach is called term frequency-inverse document frequency or shortly known as Tf-Idf approach of scoring.TF-IDF is intended to reflect how relevant a term is in a given document.',
 'So how is Tf-Idf of a document in a dataset calculated?']

We can see many "?", "." and other symbols in the paragraph above, so we remove them to clean the corpus.

In [8]:
# Normalise every sentence: swap each non-letter character for a space,
# then lower-case the result.
import re

non_letters = re.compile('[^a-zA-Z]')
corpus = [non_letters.sub(' ', sentence).lower() for sentence in sentences]
        

    The re.sub() function in Python is commonly used for substituting patterns in strings based on RegEx.

    Group capturing in RegEx allows for the selective replacement of specific parts of a matched pattern while keeping the other parts of the string intact.

    The re.sub() function helps remove unnecessary text, convert text cases, clean input, correct spelling errors, and many more.

In [9]:
# Show the cleaned, lower-cased sentences.
corpus

['the scoring method being used above takes the count of each word and represents the word in the vector by the number of counts of that particular word ',
 'what does a word having high word count signify ',
 'does this mean that the word is important in retrieving information about documents ',
 'the answer is no ',
 'let me explain  if a word occurs many times in a document but also along with many other documents in our dataset  maybe it is because this word is just a frequent word  not because it is relevant or meaningful ',
 'one approach is to rescale the frequency of words by how often they appear in all documents so that the scores for frequent words like  the  that are also frequent across all documents are penalized ',
 'this approach is called term frequency inverse document frequency or shortly known as tf idf approach of scoring tf idf is intended to reflect how relevant a term is in a given document ',
 'so how is tf idf of a document in a dataset calculated ']

## Stemming

In [12]:
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords
# Fetch the stop-word lists that stopwords.words('english') reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krishangopal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Stem every non-stop-word of each cleaned sentence with the Porter stemmer.
# The result holds one list of stems per sentence in `corpus`.
stemmer = PorterStemmer()

# Build the stop-word set once, outside the loops, instead of re-reading the
# stop-word list and rebuilding the set for every single token.
english_stopwords = set(stopwords.words('english'))

stemming_corpus = [
    [
        stemmer.stem(word)
        for word in nltk.word_tokenize(doc)  # word tokenization
        if word not in english_stopwords
    ]
    for doc in corpus
]

In [14]:
# Show the stemmed tokens, one list per sentence.
stemming_corpus

[['score',
  'method',
  'use',
  'take',
  'count',
  'word',
  'repres',
  'word',
  'vector',
  'number',
  'count',
  'particular',
  'word'],
 ['word', 'high', 'word', 'count', 'signifi'],
 ['mean', 'word', 'import', 'retriev', 'inform', 'document'],
 ['answer'],
 ['let',
  'explain',
  'word',
  'occur',
  'mani',
  'time',
  'document',
  'also',
  'along',
  'mani',
  'document',
  'dataset',
  'mayb',
  'word',
  'frequent',
  'word',
  'relev',
  'meaning'],
 ['one',
  'approach',
  'rescal',
  'frequenc',
  'word',
  'often',
  'appear',
  'document',
  'score',
  'frequent',
  'word',
  'like',
  'also',
  'frequent',
  'across',
  'document',
  'penal'],
 ['approach',
  'call',
  'term',
  'frequenc',
  'invers',
  'document',
  'frequenc',
  'shortli',
  'known',
  'tf',
  'idf',
  'approach',
  'score',
  'tf',
  'idf',
  'intend',
  'reflect',
  'relev',
  'term',
  'given',
  'document'],
 ['tf', 'idf', 'document', 'dataset', 'calcul']]

## Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used by the lemmatization loop below.
lemma = WordNetLemmatizer()

In [19]:
# Download the resources the lemmatization cell needs:
# 'punkt' backs nltk.word_tokenize, 'wordnet' is the corpus WordNetLemmatizer looks words up in.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/krishangopal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Print the lemma of every non-stop-word in each cleaned sentence.
# Build the stop-word set once, outside the loops, instead of re-reading the
# stop-word list and rebuilding the set for every single token.
english_stopwords = set(stopwords.words('english'))

for doc in corpus:
    # word tokenization
    for word in nltk.word_tokenize(doc):
        if word not in english_stopwords:
            # lemmatize() treats the word as a noun unless a pos tag is passed,
            # which is why verb forms such as "used" come back unchanged.
            print(lemma.lemmatize(word))

scoring
method
used
take
count
word
represents
word
vector
number
count
particular
word
word
high
word
count
signify
mean
word
important
retrieving
information
document
answer
let
explain
word
occurs
many
time
document
also
along
many
document
dataset
maybe
word
frequent
word
relevant
meaningful
one
approach
rescale
frequency
word
often
appear
document
score
frequent
word
like
also
frequent
across
document
penalized
approach
called
term
frequency
inverse
document
frequency
shortly
known
tf
idf
approach
scoring
tf
idf
intended
reflect
relevant
term
given
document
tf
idf
document
dataset
calculated


We can see that the word "signify" became "signifi" with stemming but stayed "signify" with lemmatization.