<a href="https://colab.research.google.com/github/koushik1904/Natural-language-processing-NLB-/blob/main/NLP_LAB_07_2403a52057.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This cell imports all required Python libraries.
# numpy and pandas are used for numerical operations and dataset handling.
# re is used for text cleaning using regular expressions.
# nltk is used for tokenization, stopword removal, lemmatization, and WordNet similarity.
# scikit-learn is used for TF-IDF vectorization and cosine similarity computation.

import numpy as np
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [8]:
# This cell downloads required NLTK datasets.
# punkt is used for tokenization.
# stopwords are used to remove common words.
# wordnet is used for semantic similarity computation.

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# This cell creates a small dataset of 22 documents.
# The documents belong to four different topics:
# sports, politics, health, and technology.
# This dataset is manually created for experimental purposes.

documents = [
    "The football team won the championship",
    "Cricket players trained hard for the match",
    "The athlete broke the world record",
    "The government passed a new policy",
    "Elections will be held next month",
    "The president addressed the nation",
    "Doctors recommend regular exercise",
    "A healthy diet improves immunity",
    "The patient received medical treatment",
    "Artificial intelligence is transforming industries",
    "The new smartphone has advanced features",
    "Cybersecurity is important for data protection",
    "The doctor advised the patient to exercise",
    "The government invested in healthcare technology",
    "The team used technology to improve performance",
    "Medical research helps fight diseases",
    "The election results were announced",
    "Fitness and diet improve health",
    "New technology boosts sports analytics",
    "Healthcare policies affect citizens",
    "Athletes rely on fitness training",
    "Digital tools improve medical diagnosis"
]


In [4]:
# This cell converts the document list into a pandas DataFrame.
# A DataFrame makes it easier to visualize and process text data.

df = pd.DataFrame(documents, columns=["Text"])
df.head()


Unnamed: 0,Text
0,The football team won the championship
1,Cricket players trained hard for the match
2,The athlete broke the world record
3,The government passed a new policy
4,Elections will be held next month


In [5]:
# This cell initializes stopwords and lemmatizer.
# Stopwords are common words that add little meaning.
# WordNetLemmatizer reduces words to their base form.

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [9]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')   # ðŸ”´ THIS is the missing resource
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# This function preprocesses text data.
# Steps include:
# 1. Convert text to lowercase
# 2. Remove punctuation and numbers
# 3. Tokenize the text into words
# 4. Remove stopwords
# 5. Apply lemmatization

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return tokens


In [11]:
# This cell applies the preprocessing function to all documents.
# The cleaned tokens are stored in a new column called 'Tokens'.

df["Tokens"] = df["Text"].apply(preprocess)
df.head()


Unnamed: 0,Text,Tokens
0,The football team won the championship,"[football, team, championship]"
1,Cricket players trained hard for the match,"[cricket, player, trained, hard, match]"
2,The athlete broke the world record,"[athlete, broke, world, record]"
3,The government passed a new policy,"[government, passed, new, policy]"
4,Elections will be held next month,"[election, held, next, month]"


In [12]:
# This cell converts text documents into numerical vectors using TF-IDF.
# TF-IDF assigns higher weight to important words
# and lower weight to frequently occurring words.

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["Text"])


In [13]:
# This cell computes cosine similarity between all document vectors.
# Cosine similarity measures the angle between vectors
# and ranges from 0 (no similarity) to 1 (identical text).

cosine_sim = cosine_similarity(tfidf_matrix)


In [14]:
# This cell prints cosine similarity scores
# between the first document and other documents
# to observe similarity behavior.

for i in range(5):
    print(f"Cosine similarity between Doc 0 and Doc {i}: {cosine_sim[0][i]:.3f}")


Cosine similarity between Doc 0 and Doc 0: 1.000
Cosine similarity between Doc 0 and Doc 1: 0.087
Cosine similarity between Doc 0 and Doc 2: 0.191
Cosine similarity between Doc 0 and Doc 3: 0.111
Cosine similarity between Doc 0 and Doc 4: 0.000


In [15]:
# This function computes Jaccard similarity.
# It is calculated as:
# (Number of common words) / (Total unique words).
# It focuses only on exact word overlap.

def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union


In [16]:
# This cell calculates Jaccard similarity
# between the first document and other documents
# using token sets.

for i in range(5):
    sim = jaccard_similarity(set(df["Tokens"][0]), set(df["Tokens"][i]))
    print(f"Jaccard similarity between Doc 0 and Doc {i}: {sim:.3f}")


Jaccard similarity between Doc 0 and Doc 0: 1.000
Jaccard similarity between Doc 0 and Doc 1: 0.000
Jaccard similarity between Doc 0 and Doc 2: 0.000
Jaccard similarity between Doc 0 and Doc 3: 0.000
Jaccard similarity between Doc 0 and Doc 4: 0.000


In [17]:
# This function computes semantic similarity using WordNet.
# Wu-Palmer similarity is used to measure closeness of meanings
# between word synsets rather than exact word matching.

def wordnet_similarity(sent1, sent2):
    score = 0
    count = 0
    for w1 in sent1:
        syn1 = wordnet.synsets(w1)
        for w2 in sent2:
            syn2 = wordnet.synsets(w2)
            if syn1 and syn2:
                sim = syn1[0].wup_similarity(syn2[0])
                if sim:
                    score += sim
                    count += 1
    return score / count if count != 0 else 0


In [18]:
# This cell applies WordNet semantic similarity
# to at least 10 sentence pairs.
# It captures meaning even when words are different.

for i in range(10):
    sim = wordnet_similarity(df["Tokens"][i], df["Tokens"][i+1])
    print(f"WordNet similarity between Doc {i} and Doc {i+1}: {sim:.3f}")


WordNet similarity between Doc 0 and Doc 1: 0.135
WordNet similarity between Doc 1 and Doc 2: 0.281
WordNet similarity between Doc 2 and Doc 3: 0.206
WordNet similarity between Doc 3 and Doc 4: 0.286
WordNet similarity between Doc 4 and Doc 5: 0.195
WordNet similarity between Doc 5 and Doc 6: 0.193
WordNet similarity between Doc 6 and Doc 7: 0.175
WordNet similarity between Doc 7 and Doc 8: 0.209
WordNet similarity between Doc 8 and Doc 9: 0.228
WordNet similarity between Doc 9 and Doc 10: 0.291
