In [33]:
import nltk
import string
from collections import Counter

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import spacy

In [34]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/krishna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krishna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/krishna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
text = """
Natural Language Processing is used in chatbots and search engines.
It helps machines understand human language.
"""

print("RAW TEXT:\n", text)

RAW TEXT:
 
Natural Language Processing is used in chatbots and search engines.
It helps machines understand human language.



In [36]:
sentences = sent_tokenize(text)
print("\nSENTENCE TOKENIZATION:")
print(sentences)


SENTENCE TOKENIZATION:
['\nNatural Language Processing is used in chatbots and search engines.', 'It helps machines understand human language.']


In [37]:
words = word_tokenize(text)
print("\nWORD TOKENIZATION:")
print(words)


WORD TOKENIZATION:
['Natural', 'Language', 'Processing', 'is', 'used', 'in', 'chatbots', 'and', 'search', 'engines', '.', 'It', 'helps', 'machines', 'understand', 'human', 'language', '.']


In [38]:
words = [word.lower() for word in words if word not in string.punctuation]
print("\nAFTER LOWERCASE & PUNCTUATION REMOVAL:")
print(words)


AFTER LOWERCASE & PUNCTUATION REMOVAL:
['natural', 'language', 'processing', 'is', 'used', 'in', 'chatbots', 'and', 'search', 'engines', 'it', 'helps', 'machines', 'understand', 'human', 'language']


In [39]:
stop_words = set(stopwords.words('english'))

clean_words = [word for word in words if word not in stop_words]
print("\nAFTER STOP WORD REMOVAL:")
print(clean_words)


AFTER STOP WORD REMOVAL:
['natural', 'language', 'processing', 'used', 'chatbots', 'search', 'engines', 'helps', 'machines', 'understand', 'human', 'language']


In [40]:
stemmer = PorterStemmer()
stems = [stemmer.stem(word) for word in clean_words]

print("\nSTEMMED WORDS:")
print(stems)


STEMMED WORDS:
['natur', 'languag', 'process', 'use', 'chatbot', 'search', 'engin', 'help', 'machin', 'understand', 'human', 'languag']


In [41]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(" ".join(clean_words))

lemmas = [token.lemma_ for token in doc]
print("\nLEMMATIZED WORDS:")
print(lemmas)


LEMMATIZED WORDS:
['natural', 'language', 'processing', 'use', 'chatbot', 'search', 'engine', 'help', 'machine', 'understand', 'human', 'language']


In [42]:
lemma_frequency = Counter(lemmas)

print("\nLEMMA FREQUENCY:")
for lemma, freq in lemma_frequency.items():
    print(f"{lemma} : {freq}")


LEMMA FREQUENCY:
natural : 1
language : 2
processing : 1
use : 1
chatbot : 1
search : 1
engine : 1
help : 1
machine : 1
understand : 1
human : 1
