# Install nltk Library (if not already Installed)

In [1]:
!pip install nltk



# Given Or Extracted Text from Document

In [2]:
text = """Indian culture is one of the oldest and most diverse in the world, evolving over thousands of years through a synthesis of various traditions, religions, philosophies, and art forms. From the ancient Vedas to the epics of Mahabharata and Ramayana, Indian heritage has profoundly influenced literature, music, dance, and architecture. Temples like those in Khajuraho and Madurai reflect intricate craftsmanship, while classical dance forms such as Bharatanatyam and Kathak narrate mythological stories through expressive gestures. India’s spiritual philosophy, particularly its teachings on dharma, karma, and moksha, has inspired not only its citizens but also seekers across the globe. Festivals like Diwali, Holi, and Eid celebrate unity in diversity, blending religious devotion with vibrant social gatherings. Handicrafts, regional cuisines, traditional attire like sarees and turbans, and languages such as Sanskrit, Hindi, and Tamil further showcase the richness of India’s intangible cultural heritage. In modern times, while urbanization and globalization pose challenges to preserving these traditions, initiatives in education, tourism, and digital archiving are helping ensure that India’s cultural legacy continues to thrive for future generations."""

# 1.Tokenization

In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## sentence Tokenization

In [4]:
sentences = sent_tokenize(text)
print(sentences)

['Indian culture is one of the oldest and most diverse in the world, evolving over thousands of years through a synthesis of various traditions, religions, philosophies, and art forms.', 'From the ancient Vedas to the epics of Mahabharata and Ramayana, Indian heritage has profoundly influenced literature, music, dance, and architecture.', 'Temples like those in Khajuraho and Madurai reflect intricate craftsmanship, while classical dance forms such as Bharatanatyam and Kathak narrate mythological stories through expressive gestures.', 'India’s spiritual philosophy, particularly its teachings on dharma, karma, and moksha, has inspired not only its citizens but also seekers across the globe.', 'Festivals like Diwali, Holi, and Eid celebrate unity in diversity, blending religious devotion with vibrant social gatherings.', 'Handicrafts, regional cuisines, traditional attire like sarees and turbans, and languages such as Sanskrit, Hindi, and Tamil further showcase the richness of India’s int

## Removing Punctuations

In [5]:
import string

exclude = string.punctuation

In [6]:
def remove_punct(text):
    for char in exclude:
        text = text.replace(char, " ")
    return text

In [7]:
text_modified = remove_punct(text)
print(text)

Indian culture is one of the oldest and most diverse in the world, evolving over thousands of years through a synthesis of various traditions, religions, philosophies, and art forms. From the ancient Vedas to the epics of Mahabharata and Ramayana, Indian heritage has profoundly influenced literature, music, dance, and architecture. Temples like those in Khajuraho and Madurai reflect intricate craftsmanship, while classical dance forms such as Bharatanatyam and Kathak narrate mythological stories through expressive gestures. India’s spiritual philosophy, particularly its teachings on dharma, karma, and moksha, has inspired not only its citizens but also seekers across the globe. Festivals like Diwali, Holi, and Eid celebrate unity in diversity, blending religious devotion with vibrant social gatherings. Handicrafts, regional cuisines, traditional attire like sarees and turbans, and languages such as Sanskrit, Hindi, and Tamil further showcase the richness of India’s intangible cultural 

## word Tokenization

In [8]:
words = word_tokenize(text_modified)
print(words)

['Indian', 'culture', 'is', 'one', 'of', 'the', 'oldest', 'and', 'most', 'diverse', 'in', 'the', 'world', 'evolving', 'over', 'thousands', 'of', 'years', 'through', 'a', 'synthesis', 'of', 'various', 'traditions', 'religions', 'philosophies', 'and', 'art', 'forms', 'From', 'the', 'ancient', 'Vedas', 'to', 'the', 'epics', 'of', 'Mahabharata', 'and', 'Ramayana', 'Indian', 'heritage', 'has', 'profoundly', 'influenced', 'literature', 'music', 'dance', 'and', 'architecture', 'Temples', 'like', 'those', 'in', 'Khajuraho', 'and', 'Madurai', 'reflect', 'intricate', 'craftsmanship', 'while', 'classical', 'dance', 'forms', 'such', 'as', 'Bharatanatyam', 'and', 'Kathak', 'narrate', 'mythological', 'stories', 'through', 'expressive', 'gestures', 'India', '’', 's', 'spiritual', 'philosophy', 'particularly', 'its', 'teachings', 'on', 'dharma', 'karma', 'and', 'moksha', 'has', 'inspired', 'not', 'only', 'its', 'citizens', 'but', 'also', 'seekers', 'across', 'the', 'globe', 'Festivals', 'like', 'Diwal

# 2.StopWord Removal

In [9]:
from nltk.corpus import stopwords

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
stop_words = stopwords.words('english')

In [12]:
def remove_stopwords(text):
    new_text = []
    for w in text_modified.split():
        if w.lower() not in stop_words:
            new_text.append(w)
    return " ".join(new_text)

In [13]:
text_modified_1 = remove_stopwords(text)
print(text_modified_1)

Indian culture one oldest diverse world evolving thousands years synthesis various traditions religions philosophies art forms ancient Vedas epics Mahabharata Ramayana Indian heritage profoundly influenced literature music dance architecture Temples like Khajuraho Madurai reflect intricate craftsmanship classical dance forms Bharatanatyam Kathak narrate mythological stories expressive gestures India’s spiritual philosophy particularly teachings dharma karma moksha inspired citizens also seekers across globe Festivals like Diwali Holi Eid celebrate unity diversity blending religious devotion vibrant social gatherings Handicrafts regional cuisines traditional attire like sarees turbans languages Sanskrit Hindi Tamil showcase richness India’s intangible cultural heritage modern times urbanization globalization pose challenges preserving traditions initiatives education tourism digital archiving helping ensure India’s cultural legacy continues thrive future generations


# 3.Stemming

In [14]:
from nltk.stem import PorterStemmer

In [15]:
ps = PorterStemmer()

In [16]:
def stemming(text):
    new_text = []
    for w in text.split():
        new_text.append(ps.stem(w))
    return " ".join(new_text)

In [17]:
print(stemming(text_modified_1))

indian cultur one oldest divers world evolv thousand year synthesi variou tradit religion philosophi art form ancient veda epic mahabharata ramayana indian heritag profoundli influenc literatur music danc architectur templ like khajuraho madurai reflect intric craftsmanship classic danc form bharatanatyam kathak narrat mytholog stori express gestur india’ spiritu philosophi particularli teach dharma karma moksha inspir citizen also seeker across globe festiv like diwali holi eid celebr uniti divers blend religi devot vibrant social gather handicraft region cuisin tradit attir like sare turban languag sanskrit hindi tamil showcas rich india’ intang cultur heritag modern time urban global pose challeng preserv tradit initi educ tourism digit archiv help ensur india’ cultur legaci continu thrive futur gener


# 4.Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
wnl = WordNetLemmatizer()

In [21]:
lemmatized_Verbs = []
lemmatized_nouns = []

In [22]:
for w in text.split():
    lemmatized_Verbs.append(wnl.lemmatize(w, pos='v'))

for w in text.split():
    lemmatized_nouns.append(wnl.lemmatize(w, pos='n'))

In [23]:
print("\n-------------------Lemmatize_verbs----------------------------")
print(lemmatized_Verbs)
print("\n-------------------Lemmatized Nouns----------------------")
print(lemmatized_nouns)


-------------------Lemmatize_verbs----------------------------
['Indian', 'culture', 'be', 'one', 'of', 'the', 'oldest', 'and', 'most', 'diverse', 'in', 'the', 'world,', 'evolve', 'over', 'thousands', 'of', 'years', 'through', 'a', 'synthesis', 'of', 'various', 'traditions,', 'religions,', 'philosophies,', 'and', 'art', 'forms.', 'From', 'the', 'ancient', 'Vedas', 'to', 'the', 'epics', 'of', 'Mahabharata', 'and', 'Ramayana,', 'Indian', 'heritage', 'have', 'profoundly', 'influence', 'literature,', 'music,', 'dance,', 'and', 'architecture.', 'Temples', 'like', 'those', 'in', 'Khajuraho', 'and', 'Madurai', 'reflect', 'intricate', 'craftsmanship,', 'while', 'classical', 'dance', 'form', 'such', 'as', 'Bharatanatyam', 'and', 'Kathak', 'narrate', 'mythological', 'stories', 'through', 'expressive', 'gestures.', 'India’s', 'spiritual', 'philosophy,', 'particularly', 'its', 'teachings', 'on', 'dharma,', 'karma,', 'and', 'moksha,', 'have', 'inspire', 'not', 'only', 'its', 'citizens', 'but', 'al

# 5.TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [25]:
text1 = """Machine learning is a branch of artificial intelligence that focuses on building systems 
that can learn from and make decisions based on data. It includes algorithms such as decision trees, 
support vector machines, and neural networks."""

In [26]:
text2 = """Deep learning is a subset of machine learning that uses neural networks with many layers. 
It is particularly useful for tasks such as image recognition, natural language processing, 
and complex pattern recognition."""

In [27]:
vect = TfidfVectorizer()

In [28]:
tfidf_matrix = vect.fit_transform([text1, text2])

In [29]:
tfidf_array = tfidf_matrix.toarray()

In [36]:
vocabulary =vect.get_feature_names_out()
vocabulary

array(['algorithms', 'and', 'artificial', 'as', 'based', 'branch',
       'building', 'can', 'complex', 'data', 'decision', 'decisions',
       'deep', 'focuses', 'for', 'from', 'image', 'includes',
       'intelligence', 'is', 'it', 'language', 'layers', 'learn',
       'learning', 'machine', 'machines', 'make', 'many', 'natural',
       'networks', 'neural', 'of', 'on', 'particularly', 'pattern',
       'processing', 'recognition', 'subset', 'such', 'support',
       'systems', 'tasks', 'that', 'trees', 'useful', 'uses', 'vector',
       'with'], dtype=object)

In [31]:
df_tfidf = pd.DataFrame(tfidf_array, columns=vocabulary, index=["text1", "text2"])

In [33]:
df_tfidf.round(3)

Unnamed: 0,algorithms,and,artificial,as,based,branch,building,can,complex,data,...,such,support,systems,tasks,that,trees,useful,uses,vector,with
text1,0.175,0.249,0.175,0.125,0.175,0.175,0.175,0.175,0.0,0.175,...,0.125,0.175,0.175,0.0,0.249,0.175,0.0,0.0,0.175,0.0
text2,0.0,0.133,0.0,0.133,0.0,0.0,0.0,0.0,0.187,0.0,...,0.133,0.0,0.0,0.187,0.133,0.0,0.187,0.187,0.0,0.187
