In [1]:
!pip install wordcloud
!pip install nltk




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, r2_score


In [7]:
# NLTK resources downloaded by this cell, in download order.
nltk_packages = [
    "punkt",
    "stopwords",
    "wordnet",
    "omw-1.4",
    "vader_lexicon",
    "punkt_tab",
]
for pkg in nltk_packages:
    nltk.download(pkg)

# Notebook-level text helpers: a WordNet lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [8]:
# Downloads the same six NLTK resources that the `nltk_packages` loop above requests.
import nltk

for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource)

# Kept as the final expression so its return value remains the cell's displayed output.
nltk.download("vader_lexicon")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
# Colab-specific step: imports the `google.colab` helper and calls its upload routine,
# keeping the return value in `uploaded` (not referenced again in the visible cells).
# NOTE(review): the recorded output shows the files being saved with a " (1)" suffix,
# while the loading cell reads the un-suffixed names — presumably left over from an
# earlier upload; confirm the intended copies are the ones being read.
from google.colab import files
uploaded = files.upload()


Saving ad_text_performance_1000.csv to ad_text_performance_1000 (1).csv
Saving facebook_sentiment_1000.csv to facebook_sentiment_1000 (1).csv
Saving facebook_topics_1000.csv to facebook_topics_1000 (1).csv
Saving google_reviews_1000.csv to google_reviews_1000 (1).csv
Saving google_search_queries_1000.csv to google_search_queries_1000 (1).csv


In [9]:
# Read the five CSV files from the working directory, one DataFrame per file,
# in the same order as the names on the left-hand side.
df_sent, df_topics, df_search, df_reviews, df_ads = [
    pd.read_csv(csv_name)
    for csv_name in (
        "facebook_sentiment_1000.csv",
        "facebook_topics_1000.csv",
        "google_search_queries_1000.csv",
        "google_reviews_1000.csv",
        "ad_text_performance_1000.csv",
    )
]

# Preview the first rows of the sentiment data.
df_sent.head()


Unnamed: 0,comment_id,comment_text,sentiment
0,1,I love this product!,Positive
1,2,Not satisfied,Positive
2,3,Very disappointing,Neutral
3,4,Delivery was late,Negative
4,5,Fair enough,Positive


In [10]:
def clean_text(text):
    """Normalize one raw text value into a space-joined string of lemmas.

    NOTE(review): no definition of `clean_text` is visible in this notebook (the
    execution counts skip several cells), so a fresh kernel would raise NameError
    here. This is a reconstruction chosen to reproduce the `clean` values shown in
    the recorded outputs (e.g. "I love this product!" -> "love product",
    "Not satisfied" -> "satisfied"); confirm against the original helper if it
    still exists.

    Steps: lowercase, tokenize with NLTK, keep alphabetic tokens that are not in
    `stop_words`, lemmatize each kept token with `lemmatizer`.

    Args:
        text: A raw text value; missing values (NaN/None) are accepted.

    Returns:
        The cleaned text as a single string; "" for missing values.
    """
    if pd.isna(text):
        return ""
    tokens = nltk.word_tokenize(str(text).lower())
    # NOTE(review): the stop-word filter also drops negations such as "not",
    # which matches the recorded output but discards sentiment polarity.
    kept = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)


# Add a "clean" column to every dataset from its raw text column.
for frame, text_col in [
    (df_sent, "comment_text"),
    (df_topics, "comment_text"),
    (df_search, "search_query"),
    (df_reviews, "review_text"),
    (df_ads, "ad_text"),
]:
    frame["clean"] = frame[text_col].apply(clean_text)


Use Case 1: Sentiment Analysis


In [11]:
# VADER analyzer; it is applied to the raw "comment_text" column, not to "clean".
sia = SentimentIntensityAnalyzer()


def _compound_score(comment):
    """Return the "compound" entry of VADER's polarity scores for one comment.

    The value is passed through str() first, so non-string cells are scored
    on their string form.
    """
    return sia.polarity_scores(str(comment))["compound"]


df_sent["compound"] = df_sent["comment_text"].apply(_compound_score)

def vader_label(c):
    """Map a compound score to a sentiment label.

    Args:
        c: Numeric compound score.

    Returns:
        "Positive" when c >= 0.05, "Negative" when c <= -0.05,
        otherwise "Neutral".
    """
    if c >= 0.05:
        return "Positive"
    if c <= -0.05:
        return "Negative"
    return "Neutral"

# Turn each compound score into a label, then preview the first ten rows.
df_sent["vader_label"] = df_sent["compound"].map(vader_label)
df_sent.head(10)


Unnamed: 0,comment_id,comment_text,sentiment,clean,compound,vader_label
0,1,I love this product!,Positive,love product,0.6696,Positive
1,2,Not satisfied,Positive,satisfied,-0.3252,Negative
2,3,Very disappointing,Neutral,disappointing,-0.5413,Negative
3,4,Delivery was late,Negative,delivery late,0.0,Neutral
4,5,Fair enough,Positive,fair enough,0.3182,Positive
5,6,Could be better,Positive,could better,0.4404,Positive
6,7,Nothing special,Negative,nothing special,-0.3089,Negative
7,8,Amazing quality!,Negative,amazing quality,0.6239,Positive
8,9,Excellent service!,Neutral,excellent service,0.6114,Positive
9,10,Average experience,Positive,average experience,0.0,Neutral


In [12]:
# Features: the cleaned comment text. Target: the label in the "sentiment" column.
X = df_sent["clean"]
y = df_sent["sentiment"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features feeding a logistic-regression classifier.
pipe = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(max_iter=300)),
    ]
)

# fit() returns the fitted pipeline, so the prediction is chained onto it.
pred = pipe.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, pred))


              precision    recall  f1-score   support

    Negative       0.28      0.53      0.36        57
     Neutral       0.46      0.32      0.38        71
    Positive       0.33      0.19      0.25        72

    accuracy                           0.34       200
   macro avg       0.36      0.35      0.33       200
weighted avg       0.36      0.34      0.33       200



Use Case 2: Topic Modeling (LDA)

In [13]:
# Bag-of-words counts over the cleaned comments, with document-frequency bounds
# (max_df=0.9, min_df=5) and scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words="english", min_df=5, max_df=0.9)
dtm = vectorizer.fit_transform(df_topics["clean"])

# Five-topic LDA with a fixed seed.
lda = LatentDirichletAllocation(random_state=42, n_components=5)
lda.fit(dtm)

terms = vectorizer.get_feature_names_out()

# For every topic, print the ten highest-weighted terms. argsort() is ascending,
# so the heaviest term appears last in each printed list.
for idx, weights in enumerate(lda.components_):
    top_positions = weights.argsort()[-10:]
    print(f"Topic #{idx+1}:")
    print([terms[pos] for pos in top_positions])
    print()


Topic #1:
['price', 'high', 'late', 'delivery', 'design', 'love', 'need', 'discount', 'better', 'quality']

Topic #2:
['price', 'delivery', 'late', 'need', 'discount', 'design', 'love', 'customer', 'helpful', 'service']

Topic #3:
['service', 'helpful', 'love', 'design', 'discount', 'need', 'late', 'delivery', 'high', 'price']

Topic #4:
['price', 'high', 'late', 'delivery', 'design', 'love', 'need', 'discount', 'damaged', 'packaging']

Topic #5:
['delivery', 'late', 'love', 'design', 'resolved', 'support', 'issue', 'team', 'discount', 'need']



Use Case 3: Keyword Extraction (Google Analytics)

In [14]:
# TF-IDF over unigrams and bigrams of the cleaned queries, capped at 1000 features.
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
matrix = tfidf.fit_transform(df_search["clean"])

# Mean TF-IDF weight of each term across all rows, flattened to a 1-D array.
avg_scores = np.asarray(matrix.mean(axis=0)).ravel()
terms = tfidf.get_feature_names_out()

# Rank the terms by mean weight, highest first.
keyword_scores = pd.DataFrame({"term": terms, "score": avg_scores})
top_keywords = keyword_scores.sort_values(by="score", ascending=False)

top_keywords.head(25)


Unnamed: 0,term,score
23,order,0.049652
37,track,0.049652
38,track order,0.049652
31,shipping time,0.049075
30,shipping,0.049075
36,time,0.049075
14,emi,0.048497
15,emi availability,0.048497
2,availability,0.048497
10,deal,0.047343


Use Case 4: Google Review Classification

In [15]:
# Features: the cleaned review text. Target: the label in the "category" column.
X = df_reviews["clean"]
y = df_reviews["category"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features feeding a multinomial Naive Bayes classifier.
clf = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("nb", MultinomialNB())
])

clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# The recorded run never predicts one of the classes, which makes its precision
# undefined and triggers a warning per averaged metric. zero_division=0 reports
# those cells as 0 explicitly (the same values as before) without the warnings.
print(classification_report(y_test, pred, zero_division=0))


                 precision    recall  f1-score   support

       Ambience       0.24      0.23      0.24        39
    Cleanliness       0.00      0.00      0.00        44
        Pricing       0.21      0.25      0.23        44
Product Quality       0.17      0.39      0.24        31
        Service       0.26      0.26      0.26        42

       accuracy                           0.21       200
      macro avg       0.18      0.23      0.19       200
   weighted avg       0.18      0.21      0.19       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Use Case 5: Ad Text Performance (CTR Regression)

In [16]:
# Cleaned ad copy is the only feature; the target is the "click_through_rate" column.
X = df_ads["clean"]
y = df_ads["click_through_rate"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that same fitted vocabulary.
tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# fit() returns the fitted regressor, so the prediction is chained onto it.
reg = RandomForestRegressor(random_state=42, n_estimators=100)
pred = reg.fit(X_train_vec, y_train).predict(X_test_vec)

print("R2 Score:", r2_score(y_test, pred))


R2 Score: -0.006992703558799995
