In [3]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kaustubh-joshi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/kaustubh-
[nltk_data]     joshi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kaustubh-
[nltk_data]     joshi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
sentence1 = "Stemming and lemmatization are different techniques used to reduce words to their root form, but they produce varying results. Lemmatization is better than stemming"

In [5]:
import string
def Tokenize(sentence : str):
    punctuation = string.punctuation + '[]{}()<>'
    for char in punctuation:
        sentence = sentence.replace(char, " ")
    sentence = sentence.lower()
    tokens = sentence.split()
    return tokens
tokens = Tokenize(sentence1)
tokens

['stemming',
 'and',
 'lemmatization',
 'are',
 'different',
 'techniques',
 'used',
 'to',
 'reduce',
 'words',
 'to',
 'their',
 'root',
 'form',
 'but',
 'they',
 'produce',
 'varying',
 'results',
 'lemmatization',
 'is',
 'better',
 'than',
 'stemming']

In [6]:
def RemoveStopWords(token):
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [word for word in token if not word in stop_words]
    return filtered_sentence
tokens = RemoveStopWords(tokens)
tokens

['stemming',
 'lemmatization',
 'different',
 'techniques',
 'used',
 'reduce',
 'words',
 'root',
 'form',
 'produce',
 'varying',
 'results',
 'lemmatization',
 'better',
 'stemming']

In [7]:

pos_tag_list = pos_tag(tokens)
pos_tag_list

[('stemming', 'VBG'),
 ('lemmatization', 'NN'),
 ('different', 'JJ'),
 ('techniques', 'NNS'),
 ('used', 'VBN'),
 ('reduce', 'VB'),
 ('words', 'NNS'),
 ('root', 'VBP'),
 ('form', 'NN'),
 ('produce', 'VBP'),
 ('varying', 'VBG'),
 ('results', 'NNS'),
 ('lemmatization', 'NN'),
 ('better', 'RBR'),
 ('stemming', 'NN')]

In [8]:
stemmer = PorterStemmer()
print('Stem words')
for x in tokens:
    print(f"{x} : {stemmer.stem(x)}")

Stem words
stemming : stem
lemmatization : lemmat
different : differ
techniques : techniqu
used : use
reduce : reduc
words : word
root : root
form : form
produce : produc
varying : vari
results : result
lemmatization : lemmat
better : better
stemming : stem


In [9]:
lemmatizer = WordNetLemmatizer()
for x in tokens:
    print(f"{x}:{lemmatizer.lemmatize(x)}")

stemming:stemming
lemmatization:lemmatization
different:different
techniques:technique
used:used
reduce:reduce
words:word
root:root
form:form
produce:produce
varying:varying
results:result
lemmatization:lemmatization
better:better
stemming:stemming


In [10]:
def calculateTF(token):
    term_freq ={}
    for word in token:
        if word not in term_freq:
            term_freq[word] = token.count(word)/len(token)
    return term_freq
calculateTF(tokens)

{'stemming': 0.13333333333333333,
 'lemmatization': 0.13333333333333333,
 'different': 0.06666666666666667,
 'techniques': 0.06666666666666667,
 'used': 0.06666666666666667,
 'reduce': 0.06666666666666667,
 'words': 0.06666666666666667,
 'root': 0.06666666666666667,
 'form': 0.06666666666666667,
 'produce': 0.06666666666666667,
 'varying': 0.06666666666666667,
 'results': 0.06666666666666667,
 'better': 0.06666666666666667}

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/kaustubh-
[nltk_data]     joshi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
def calculateTF_IDF(documents):
    documents = sent_tokenize(documents)
    document_map = {}
    document_tf = {}
    unique_words = set()
    word_idf = {}
    
    for i, document in enumerate(documents):
        tokenizedWords  = Tokenize(document)
        document_map[i] = tokenizedWords

        document_tf[i] = calculateTF(tokenizedWords)

        for word in tokenizedWords:
            unique_words.add(word)

    for word in unique_words:
        count = 0
        for _, tokenedWords in document_map.items():
            if word in tokenedWords:
                count += 1

        word_idf[word] = count

    return word_idf, document_tf
        

word_idf, document_tf = calculateTF_IDF(sentence1)
print(word_idf)

{'better': 1, 'used': 1, 'lemmatization': 2, 'techniques': 1, 'is': 1, 'their': 1, 'they': 1, 'different': 1, 'but': 1, 'reduce': 1, 'varying': 1, 'stemming': 2, 'root': 1, 'produce': 1, 'than': 1, 'words': 1, 'are': 1, 'and': 1, 'to': 1, 'form': 1, 'results': 1}


In [13]:
help(nltk)

Help on package nltk:

NAME
    nltk

DESCRIPTION
    The Natural Language Toolkit (NLTK) is an open source Python library
    for Natural Language Processing.  A free online book is available.
    (If you use the library for academic research, please cite the book.)
    
    Steven Bird, Ewan Klein, and Edward Loper (2009).
    Natural Language Processing with Python.  O'Reilly Media Inc.
    https://www.nltk.org/book/
    
    isort:skip_file
    
    @version: 3.8

PACKAGE CONTENTS
    app (package)
    book
    ccg (package)
    chat (package)
    chunk (package)
    classify (package)
    cli
    cluster (package)
    collections
    collocations
    compat
    corpus (package)
    data
    decorators
    downloader
    draw (package)
    featstruct
    grammar
    help
    inference (package)
    internals
    jsontags
    langnames
    lazyimport
    lm (package)
    metrics (package)
    misc (package)
    parse (package)
    probability
    sem (package)
    sentiment (package