# explore_sentiment.py: Python script extracted from notebooks/explore_sentiment.ipynb
# Purpose: Test sentiment analysis and keyword extraction for Task 2


In [1]:
import pandas as pd
import spacy
from transformers import pipeline
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


# Load cleaned reviews

In [3]:
df = pd.read_csv('../data/processed/reviews_clean.csv')
print(df.shape)
print(df.head())

(1184, 13)
                                              review  rating        date  \
0  "Why don’t your ATMs support account-to-accoun...       4  2025-06-05   
1                        what is this app problem???       1  2025-06-05   
2       the app is proactive and a good connections.       5  2025-06-05   
3    I cannot send to cbebirr app. through this app.       3  2025-06-05   
4                                               good       4  2025-06-05   

                          bank       source  \
0  Commercial Bank of Ethiopia  Google Play   
1  Commercial Bank of Ethiopia  Google Play   
2  Commercial Bank of Ethiopia  Google Play   
3  Commercial Bank of Ethiopia  Google Play   
4  Commercial Bank of Ethiopia  Google Play   

                        cleaned_review  char_count  word_count  \
0  support transfer like country south        35.0         5.0   
1                              problem         7.0         1.0   
2                      good connection        15.0 

# Initialize DistilBERT sentiment pipeline

In [4]:
sentiment_analyzer = pipeline('sentiment-analysis', 
model='distilbert-base-uncased-finetuned-sst-2-english')

Device set to use cpu



# Sample 50 reviews

In [6]:
sample_df = df.sample(1000, random_state=42)

# Apply sentiment analysis

In [7]:
def get_sentiment(text):
    result = sentiment_analyzer(text[:512])[0]  # Truncate to 512 tokens
    return result['label'], result['score']

sample_df['sentiment'] = sample_df['review'].apply(lambda x: get_sentiment(x)[0])
sample_df['sentiment_score'] = sample_df['review'].apply(lambda x: get_sentiment(x)[1])
print(sample_df[['review', 'sentiment', 'sentiment_score']].head())    

                                                 review sentiment  \
319                                                  ok  POSITIVE   
956   All-in-one finance & e-commerce super app! Pay...  POSITIVE   
1094                                  Wow Excellent app  POSITIVE   
86                                    I hate this app 😒  NEGATIVE   
990                   it is the most amazing mobile app  POSITIVE   

      sentiment_score  
319          0.999785  
956          0.994359  
1094         0.999844  
86           0.999638  
990          0.999872  


# Aggregate by bank and rating

In [8]:
sentiment_by_bank = sample_df.groupby(['bank', 'rating'])['sentiment_score'].mean().unstack()
print(sentiment_by_bank)

rating                              1         2         3         4         5
bank                                                                         
Bank of Abyssinia            0.979258  0.950136  0.960817  0.952379  0.959533
Commercial Bank of Ethiopia  0.971017  0.978796  0.974088  0.958783  0.977350
Dashen Bank                  0.993916  0.947908  0.997680  0.973680  0.983484



# Load spaCy model

In [9]:
nlp = spacy.load('en_core_web_sm')


# Function to extract keywords

In [10]:
def extract_keywords(text):
    doc = nlp(text.lower())
    keywords = [token.text for token in doc if token.pos_ in ['NOUN', 'PROPN'] or token.dep_ == 'compound']
    return keywords

# Apply to sample

In [11]:
sample_df['keywords'] = sample_df['review'].apply(extract_keywords)
print(sample_df[['review', 'keywords']].head())

                                                 review  \
319                                                  ok   
956   All-in-one finance & e-commerce super app! Pay...   
1094                                  Wow Excellent app   
86                                    I hate this app 😒   
990                   it is the most amazing mobile app   

                                    keywords  
319                                       []  
956   [finance, e, -, app, pay, bankandshop]  
1094                                   [app]  
86                                  [app, 😒]  
990                                    [app]  


# Group keywords by bank

In [None]:
keywords_by_bank = sample_df.groupby('bank')['keywords'].apply(lambda x: Counter([kw for sublist in x for kw in sublist]))
for bank, counter in keywords_by_bank.items():
    print(f"{bank}: {counter.most_common(10)}")


# Example manual theme clustering

In [13]:
themes = {
    'Account Access Issues': ['login', 'password', 'authentication'],
    'Transaction Performance': ['transfer', 'payment', 'slow'],
    'User Interface': ['ui', 'design', 'navigation']
}
print('Sample Themes:', themes)

Sample Themes: {'Account Access Issues': ['login', 'password', 'authentication'], 'Transaction Performance': ['transfer', 'payment', 'slow'], 'User Interface': ['ui', 'design', 'navigation']}
