In [None]:
# Text Analytics
# 1. Extract a sample document and apply the following document pre-processing methods:
#    tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
# 2. Create a representation of the documents by calculating Term Frequency (TF) and
#    Inverse Document Frequency (IDF).

In [3]:
import string #Imports Python's string module, which contains a collection of string constants
import nltk #Imports Natural Lang Toolkit(nltk) library, which provides tools for processing & analyzing human language data (text).
from nltk.tokenize import word_tokenize #Imports word_tokenize function from nltk.tokenize.It splits string (sentence) into individual words or tokens.
from nltk.corpus import stopwords #provides list of common words("the", "and", "is")that are typically removed in text analysis, as they don’t carry much meaning for most NLP tasks.
from nltk import pos_tag # This function tags words in a sentence with their part of speech (e.g., noun, verb, adjective).
from nltk.stem import PorterStemmer , WordNetLemmatizer
#PorterStemmer: A stemmer that strips suffixes to reduce words to a stem (e.g., "running" → "run"); the stem need not be a dictionary word.
#WordNetLemmatizer: A lemmatizer that reduces words to their dictionary form using WordNet, a lexical database. It treats a word as a noun unless a POS is passed (e.g., "better" → "good" only with pos="a").

In [4]:
# NLTK resources required by the cells below. nltk.download() skips packages
# that are already up to date, so re-running this cell is cheap.
NLTK_RESOURCES = (
    'punkt_tab',                       # tokenizer models used to split text into words/sentences
    'averaged_perceptron_tagger_eng',  # model that tags words with their grammatical roles
    'stopwords',                       # lists of common words removed during text processing
    'wordnet',                         # WordNet lexical database of English
    'omw-1.4',                         # Open Multilingual Wordnet 1.4 (multilingual extension of WordNet)
)

# nltk.download() returns True/False per package. Keep every status instead of
# only the last one, so a failed download of an earlier resource is not hidden.
download_status = {resource: nltk.download(resource) for resource in NLTK_RESOURCES}
failed_downloads = [resource for resource, ok in download_status.items() if not ok]
if failed_downloads:
    print("Failed to download NLTK resources:", failed_downloads)

# Cell output: True only when every resource is available.
not failed_downloads

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Manish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Manish\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Manish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Manish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Manish\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
from nltk.tokenize import word_tokenize
#Imports word_tokenize function from nltk.tokenize module, which is used to split text into list of individual words or tokens.
# Sample sentence used throughout the pre-processing cells below.
text = "this is example text,to test the word filtration."

# Split the sentence into word and punctuation tokens using the English
# tokenizer ('english' is word_tokenize's default language, spelled out here).
tokens = word_tokenize(text, language='english')
print(tokens)

['this', 'is', 'example', 'text', ',', 'to', 'test', 'the', 'word', 'filtration', '.']


In [6]:
# Assign a part-of-speech label (e.g. NN = noun, VB = verb) to every token.
# The result is a list of (token, tag) pairs in the original token order.
# lang='eng' is pos_tag's default and is spelled out only for clarity.
pos_tags = pos_tag(tokens, lang='eng')
print(pos_tags)

[('this', 'DT'), ('is', 'VBZ'), ('example', 'NN'), ('text', 'NN'), (',', ','), ('to', 'TO'), ('test', 'VB'), ('the', 'DT'), ('word', 'NN'), ('filtration', 'NN'), ('.', '.')]


In [7]:
# English stop words (e.g. "the", "is", "and") from nltk.corpus.stopwords,
# stored as a set so each membership test is a constant-time lookup.
stop_words = set(stopwords.words('english'))

# Keep a token only if
#   * its lowercase form is not a stop word, and
#   * it is not made up entirely of punctuation characters.
# The punctuation test is done per character: `word in string.punctuation` is a
# substring test against one long string, so multi-character punctuation tokens
# that word_tokenize can emit (such as "``", "''", "..." or "--") would slip
# through it.
filtered_tokens = [
    word
    for word in tokens
    if word.lower() not in stop_words
    and not all(char in string.punctuation for char in word)
]

# Show the tokens that survive stop-word and punctuation removal.
print("Tokens after Stop Words Removal:\n", filtered_tokens)

Tokens after Stop Words Removal:
 ['example', 'text', 'test', 'word', 'filtration']


In [8]:
# Porter stemming strips suffixes to reach a common stem (e.g. "running" -> "run").
# The stem is not guaranteed to be a dictionary word.
stemmer = PorterStemmer()

# Stem every token that survived stop-word and punctuation removal.
stemmed = list(map(stemmer.stem, filtered_tokens))

# Show the stem of each remaining token.
print("Stemmed Tokens:\n", stemmed)

Stemmed Tokens:
 ['exampl', 'text', 'test', 'word', 'filtrat']


In [9]:
# WordNetLemmatizer reduces a word to its dictionary form (lemma) using WordNet.
lemmatizer = WordNetLemmatizer()


def _wordnet_pos(treebank_tag):
    """Translate a Penn Treebank tag into the POS letter lemmatize() expects.

    Adjective (JJ*), verb (VB*) and adverb (RB*) tags map to 'a', 'v' and 'r'.
    Every other tag falls back to 'n', which is lemmatize()'s own default.
    """
    for prefix, wordnet_letter in (('JJ', 'a'), ('VB', 'v'), ('RB', 'r')):
        if treebank_tag.startswith(prefix):
            return wordnet_letter
    return 'n'


# lemmatize() treats every word as a noun unless a POS is supplied, so verbs and
# adjectives would come back unchanged. Reuse the tags computed on the full
# sentence (pos_tags) and keep only the words that survived filtering.
kept_words = set(filtered_tokens)
lemmatized = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in pos_tags
    if word in kept_words
]

# Both lists derive from the same `tokens`, so they must stay aligned.
assert len(lemmatized) == len(filtered_tokens), "pos_tags and filtered_tokens are out of sync"

print("Lemmatized Tokens:\n", lemmatized)  # tokens after POS-aware lemmatization

Lemmatized Tokens:
 ['example', 'text', 'test', 'word', 'filtration']


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
#used to convert collection of text documents into matrix of TF-IDF (Term Frequency-Inverse Document Frequency) features.
import pandas as pd
# Three short documents that form the corpus for the TF-IDF representation.
documents = [
    "The fox jumps over the lazy dog",
    "The dog sleeps in the garden",
    "Foxes are clever and fast animals"
]

# Learn the vocabulary and IDF weights from the corpus and, in the same step,
# turn each document into a row of TF-IDF scores (returned as a sparse matrix).
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)

# Tabulate the scores: one row per document, one column per vocabulary term.
df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)
print(df)

        and   animals       are    clever       dog      fast       fox  \
0  0.000000  0.000000  0.000000  0.000000  0.289695  0.000000  0.380914   
1  0.000000  0.000000  0.000000  0.000000  0.313316  0.000000  0.000000   
2  0.408248  0.408248  0.408248  0.408248  0.000000  0.408248  0.000000   

      foxes    garden        in     jumps      lazy      over    sleeps  \
0  0.000000  0.000000  0.000000  0.380914  0.380914  0.380914  0.000000   
1  0.000000  0.411973  0.411973  0.000000  0.000000  0.000000  0.411973   
2  0.408248  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

        the  
0  0.579391  
1  0.626632  
2  0.000000  
