In [13]:
# 📘 Sentiment & Thematic Analysis - Omega Consultancy

# 🔧 Setup
import importlib
import os
import sys
from pathlib import Path

import pandas as pd

sys.path.append(os.path.abspath(".."))

# 📦 Import custom modules
import src.sentiment
importlib.reload(src.sentiment)
from src.sentiment import SentimentAnalyzer
from src.utils.keyword_extractor import KeywordExtractor
from src.utils.theme_grouper import ThemeGrouper
from src.utils.aggregator import SentimentAggregator

# 📂 Load Cleaned Data
# One cleaned review CSV per bank, all read from the same directory.
# The tuple order below fixes which frame each name receives.
data_dir = Path("../data/clean")
review_files = (
    "cleaned_cbe_reviews.csv",
    "cleaned_boa_reviews.csv",
    "cleaned_db_reviews.csv",
)
cbe_df, boa_df, db_df = (pd.read_csv(data_dir / name) for name in review_files)

# 🧠 Initialize NLP Modules
# Each helper is built once with its default configuration and reused for
# every bank further down in the notebook.
analyzer, extractor, classifier = (
    SentimentAnalyzer(),
    KeywordExtractor(),
    ThemeGrouper(),
)

# 🔁 Analyze Each Bank
# Every loaded frame is annotated in place with sentiment and theme columns
# and then collected in `all_banks` for concatenation.
bank_frames = [("CBE", cbe_df), ("BOA", boa_df), ("DB", db_df)]
all_banks = []
for bank, df in bank_frames:
    print(f"🔍 Processing {bank}...")

    # Sentiment analysis: each review is scored twice, first with the
    # analyzer's default method (stored in the bert_* columns) and then with
    # method="vader". Both calls return a (label, score) pair; only the label
    # of the VADER result is kept.
    bert_labels, bert_scores, vader_labels = [], [], []
    for text in df["review"]:
        bert_label, bert_score = analyzer.analyze_single(text)
        bert_labels.append(bert_label)
        bert_scores.append(bert_score)

        vader_label, _vader_score = analyzer.analyze_single(text, method="vader")
        vader_labels.append(vader_label)

    # Attach the results; the lists are in the same order as the rows.
    df["bert_sentiment"] = bert_labels
    df["bert_score"] = bert_scores
    df["vader_sentiment"] = vader_labels
    df["bank"] = bank  # constant tag so rows stay attributable after concat

    # Thematic classification: one theme per review text.
    df["theme"] = df["review"].apply(classifier.assign_theme)

    all_banks.append(df)

# 🧾 Combine & Save Final CSV
# Stack the per-bank frames into a single table with a fresh 0..n-1 index,
# then write it out without that index as a column.
output_csv = "../data/sentiment_themes_labeled.csv"
final_df = pd.concat(all_banks, ignore_index=True)
final_df.to_csv(output_csv, index=False)
print("✅ Final dataset with sentiment and themes saved!")

# 📊 Aggregation
def _report(heading, table):
    """Print a heading, then render the table with the notebook's display()."""
    print(heading)
    display(table)


agg = SentimentAggregator(final_df)

# 1. Average BERT score by bank and rating
#    (the recorded run shows columns bank, rating, avg_bert_score, review_count)
bank_rating_summary = agg.aggregate_by_bank_and_rating()
_report("\n📈 Average Sentiment by Bank and Rating:", bank_rating_summary)

# 2. Count of sentiment labels by bank
#    (the recorded run shows columns bank, bert_sentiment, count)
sentiment_dist = agg.aggregate_sentiment_counts()
_report("\n📊 Sentiment Distribution by Bank:", sentiment_dist)

# 🔍 Keyword Extraction
# For each bank, pass all of its review texts to the extractor as one list and
# print the first 15 keywords it returns, comma-separated.
# NOTE(review): presumably the extractor returns keywords ranked by
# importance, which is what makes the slice a "top 15"; confirm in
# KeywordExtractor.
for bank in ("CBE", "BOA", "DB"):
    print(f"\n📌 Top Keywords for {bank}:")
    bank_reviews = final_df.loc[final_df["bank"] == bank, "review"].tolist()
    keywords = extractor.extract_keywords(bank_reviews)
    top_keywords = keywords[:15]  # Show top 15
    print(", ".join(top_keywords))


Device set to use cpu


🔍 Processing CBE...
🔍 Processing BOA...
🔍 Processing DB...
✅ Final dataset with sentiment and themes saved!

📈 Average Sentiment by Bank and Rating:


Unnamed: 0,bank,rating,avg_bert_score,review_count
0,BOA,1,0.980119,241
1,BOA,2,0.955582,22
2,BOA,3,0.966563,35
3,BOA,4,0.955578,17
4,BOA,5,0.961208,174
5,CBE,1,0.978501,60
6,CBE,2,0.983766,18
7,CBE,3,0.986362,28
8,CBE,4,0.961411,51
9,CBE,5,0.974511,276



📊 Sentiment Distribution by Bank:


Unnamed: 0,bank,bert_sentiment,count
0,BOA,NEGATIVE,319
1,BOA,POSITIVE,170
2,CBE,NEGATIVE,162
3,CBE,POSITIVE,271
4,DB,NEGATIVE,89
5,DB,POSITIVE,300



📌 Top Keywords for CBE:
app, good, best, cbe, nice, bank, screenshot, like, great, update, application, good app, use, apps, easy

📌 Top Keywords for BOA:
app, good, bank, work, working, boa, banking, mobile, doesnt, worst, use, best, developer, application, dont

📌 Top Keywords for DB:
app, dashen, best, good, bank, super, banking, amazing, use, dashen bank, easy, fast, super app, application, features
