<a href="https://colab.research.google.com/github/hardik-kumar-10/UCS420/blob/main/Assignment10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required imports
import re
import nltk
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.probability import FreqDist
from wordcloud import WordCloud
from textblob import TextBlob

# Fetch the NLTK data this notebook relies on.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize / sent_tokenize; 'punkt' is kept so older releases that
# still read the original Punkt model continue to work.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)


In [None]:
# Original Paragraph
text = """Technology fascinates me the most. It continuously evolves and shapes the way we live.
From artificial intelligence to quantum computing, the possibilities are endless.
It enhances communication, automates tasks, and makes life easier.
New innovations inspire me to learn more every day."""

# 1. Lowercase and remove punctuation
text_clean = re.sub(r'[^\w\s]', '', text.lower())

# 2. Tokenize into sentences and words
# Sentence boundaries are detected from punctuation, so sentences are split
# from the original paragraph: text_clean has had every '.' stripped and would
# come back as one single "sentence".
sent_tokens = sent_tokenize(text)
# Words are taken from the cleaned text so punctuation never becomes a token.
word_tokens = word_tokenize(text_clean)

# 3. Compare split() and word_tokenize()
split_words = text_clean.split()
print("Using split():", split_words)
print("Using word_tokenize():", word_tokens)

# 4. Remove stopwords
# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in word_tokens if word not in stop_words]

# 5. Word frequency distribution (10 most common non-stopword tokens)
fdist = FreqDist(filtered_words)
fdist.plot(10, title='Word Frequency (Excluding Stopwords)')


In [None]:
# 1. Keep only purely alphabetic words from the cleaned paragraph
alpha_words = re.findall(r'\b[a-zA-Z]+\b', text_clean)

# 2. Drop English stopwords (stop_words comes from the preprocessing cell)
filtered_alpha = list(filter(lambda token: token not in stop_words, alpha_words))

# 3./4. Build both normalizers, then run every remaining word through each
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmed = list(map(stemmer.stem, filtered_alpha))
# lemmatize() is called with its default part of speech for every word
lemmatized = list(map(lemmatizer.lemmatize, filtered_alpha))

# 5. Show the two results one after the other for comparison
print("Stemmed:", stemmed)
print("Lemmatized:", lemmatized)
print("Stemming reduces to base/root form, but can be less readable.\nLemmatization uses vocabulary and gives more meaningful forms.")


In [None]:
texts = [
    "This laptop has amazing performance and battery life.",
    "The camera quality of this phone is stunning.",
    "Customer service was very disappointing and slow."
]

# 1. Bag of Words: raw term counts per document
cv = CountVectorizer()
bow = cv.fit_transform(texts)
print("BoW Feature Names:", cv.get_feature_names_out())
print(bow.toarray())

# 2. TF-IDF: fit once and keep the vocabulary for the keyword lookup below
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(texts)
feature_names = tfidf.get_feature_names_out()
print("TF-IDF Feature Names:", feature_names)

# 3. Top keywords per text: the three highest-weighted terms, strongest first
for i, weights in enumerate(tfidf_matrix.toarray()):
    # Reverse the ascending argsort, then keep the first three positions.
    strongest = weights.argsort()[::-1][:3]
    top_words = list(zip(feature_names[strongest], weights[strongest]))
    print(f"Text {i+1} Top Keywords:", top_words)


In [None]:
text1 = "Artificial Intelligence enables machines to think like humans and automate tasks."
text2 = "Blockchain provides a secure way to record and transfer digital assets."

# Preprocess
def preprocess(text):
    """Lowercase `text`, strip punctuation, tokenize it and drop stopwords.

    Uses the notebook-level `stop_words` set built in the preprocessing cell.
    Returns the remaining tokens as a list, in their original order.
    """
    tokens = word_tokenize(re.sub(r'[^\w\s]', '', text.lower()))
    return [word for word in tokens if word not in stop_words]

# Preprocess each text once; the token lists feed both similarity measures.
words1 = preprocess(text1)
words2 = preprocess(text2)
tokens1 = set(words1)
tokens2 = set(words2)

# a. Jaccard Similarity
# Guard the division: two texts made only of stopwords give an empty union.
union = tokens1 | tokens2
jaccard_sim = len(tokens1 & tokens2) / len(union) if union else 0.0
print("Jaccard Similarity:", jaccard_sim)

# b. Cosine Similarity
# Vectorize the same stopword-filtered tokens that Jaccard used. Feeding the
# raw sentences here would let shared stopwords such as "to" and "and" produce
# a non-zero score, so the two measures would not be comparable.
tfidf = TfidfVectorizer()
vectors = tfidf.fit_transform([' '.join(words1), ' '.join(words2)])
cos_sim = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
print("Cosine Similarity:", cos_sim)

# c. Analysis
print("Jaccard is simple, shows overlap ratio. Cosine considers term frequency and importance, often more insightful.")


In [None]:
reviews = [
    "This product exceeded my expectations! Loved it.",
    "It's okay, not great but not bad either.",
    "Terrible experience. Will not buy again."
]

# Score every review once; the scores are reused for the word cloud below
# instead of running TextBlob a second time over the same text.
sentiments = [TextBlob(review).sentiment for review in reviews]

for review, scores in zip(reviews, sentiments):
    polarity = scores.polarity
    print(f"Review: {review}")
    print(f"Polarity: {polarity}, Subjectivity: {scores.subjectivity}")
    # The sign of the polarity decides the label; exactly zero is neutral.
    if polarity > 0:
        sentiment = "Positive"
    elif polarity < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    print(f"Sentiment: {sentiment}\n")

# Word cloud for positive reviews
positive_reviews = ' '.join(
    review for review, scores in zip(reviews, sentiments) if scores.polarity > 0
)

if positive_reviews.strip():
    wordcloud = WordCloud(width=600, height=400, background_color='white').generate(positive_reviews)

    plt.figure(figsize=(8, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Positive Reviews')
    plt.show()
else:
    # WordCloud.generate() raises ValueError when given no words to draw.
    print("No positive reviews found; skipping word cloud.")
