In [14]:
!pip install nltk scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer


In [22]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger') 
nltk.download('averaged_perceptron_tagger_eng')
# Added line to download the tagger
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Packag

True

In [17]:
# Sample document
doc = "Natural Language Processing (NLP) helps computers understand human language. It is a crucial part of Artificial Intelligence."


In [18]:
# Tokenization
tokens = word_tokenize(doc)
print("Tokens:", tokens)

Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'helps', 'computers', 'understand', 'human', 'language', '.', 'It', 'is', 'a', 'crucial', 'part', 'of', 'Artificial', 'Intelligence', '.']


In [23]:
# POS Tagging
pos_tags = pos_tag(tokens)
print("\nPOS Tags:", pos_tags)



POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('helps', 'VBZ'), ('computers', 'NNS'), ('understand', 'VBP'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('crucial', 'JJ'), ('part', 'NN'), ('of', 'IN'), ('Artificial', 'NNP'), ('Intelligence', 'NNP'), ('.', '.')]


In [24]:
# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
print("\nAfter Stopword Removal:", filtered_tokens)


After Stopword Removal: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'helps', 'computers', 'understand', 'human', 'language', '.', 'crucial', 'part', 'Artificial', 'Intelligence', '.']


In [25]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(w) for w in filtered_tokens]
print("\nAfter Stemming:", stemmed_tokens)



After Stemming: ['natur', 'languag', 'process', '(', 'nlp', ')', 'help', 'comput', 'understand', 'human', 'languag', '.', 'crucial', 'part', 'artifici', 'intellig', '.']


In [26]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens]
print("\nAfter Lemmatization:", lemmatized_tokens)



After Lemmatization: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'help', 'computer', 'understand', 'human', 'language', '.', 'crucial', 'part', 'Artificial', 'Intelligence', '.']


In [27]:
# If you have multiple documents
documents = [doc, 
             "NLP techniques include tokenization, stemming, and lemmatization."]

# Create TF-IDF matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Show TF-IDF values
print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())



TF-IDF Matrix:
[[0.         0.23900309 0.23900309 0.23900309 0.23900309 0.23900309
  0.         0.23900309 0.23900309 0.23900309 0.47800618 0.
  0.23900309 0.17005267 0.23900309 0.23900309 0.23900309 0.
  0.         0.         0.23900309]
 [0.39204401 0.         0.         0.         0.         0.
  0.39204401 0.         0.         0.         0.         0.39204401
  0.         0.27894255 0.         0.         0.         0.39204401
  0.39204401 0.39204401 0.        ]]


In [28]:
# Feature names (words)
print("\nFeature Names:")
print(vectorizer.get_feature_names_out())



Feature Names:
['and' 'artificial' 'computers' 'crucial' 'helps' 'human' 'include'
 'intelligence' 'is' 'it' 'language' 'lemmatization' 'natural' 'nlp' 'of'
 'part' 'processing' 'stemming' 'techniques' 'tokenization' 'understand']


In [29]:
# Term Frequency (TF) calculation (per document)
term_freq = tfidf_matrix.sum(axis=1).A1
print("\nTerm Frequency (TF) for documents:")
print(term_freq)



Term Frequency (TF) for documents:
[3.75509898 2.63120663]


In [30]:
# Inverse Document Frequency (IDF) calculation (for each word across documents)
idf = vectorizer.idf_
print("\nInverse Document Frequency (IDF) values:")
for word, idf_value in zip(vectorizer.get_feature_names_out(), idf):
    print(f"Word: {word} - IDF: {idf_value}")



Inverse Document Frequency (IDF) values:
Word: and - IDF: 1.4054651081081644
Word: artificial - IDF: 1.4054651081081644
Word: computers - IDF: 1.4054651081081644
Word: crucial - IDF: 1.4054651081081644
Word: helps - IDF: 1.4054651081081644
Word: human - IDF: 1.4054651081081644
Word: include - IDF: 1.4054651081081644
Word: intelligence - IDF: 1.4054651081081644
Word: is - IDF: 1.4054651081081644
Word: it - IDF: 1.4054651081081644
Word: language - IDF: 1.4054651081081644
Word: lemmatization - IDF: 1.4054651081081644
Word: natural - IDF: 1.4054651081081644
Word: nlp - IDF: 1.0
Word: of - IDF: 1.4054651081081644
Word: part - IDF: 1.4054651081081644
Word: processing - IDF: 1.4054651081081644
Word: stemming - IDF: 1.4054651081081644
Word: techniques - IDF: 1.4054651081081644
Word: tokenization - IDF: 1.4054651081081644
Word: understand - IDF: 1.4054651081081644
