In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("../data/clean_reviews.csv")  
print(df.head())

                                     review  rating        date bank  \
0                                ምንም የማይ ሰራ       1  2025-11-26  BOA   
1                                 very good       5  2025-11-25  BOA   
2  most of the time is not working properly       1  2025-11-25  BOA   
3                              good service       5  2025-11-25  BOA   
4                            not use for me       3  2025-11-23  BOA   

        source  
0  Google Play  
1  Google Play  
2  Google Play  
3  Google Play  
4  Google Play  


In [5]:
print(df.shape)


(1789, 5)


In [7]:
import re
import nltk
from nltk.corpus import stopwords

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\teshi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
stop_words = set(stopwords.words('english'))

In [10]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [13]:
df['cleaned_text'] = df['review'].apply(preprocess)

In [14]:
print(df[['review', 'cleaned_text']].head())

                                     review           cleaned_text
0                                ምንም የማይ ሰራ                       
1                                 very good                   good
2  most of the time is not working properly  time working properly
3                              good service           good service
4                            not use for me                    use


Sentiment analysis started

In [15]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [16]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\teshi\AppData\Roaming\nltk_data...


True

In [17]:
sia = SentimentIntensityAnalyzer()


In [18]:
def get_sentiment(text):
    score = sia.polarity_scores(text)['compound']
    if score >= 0.05:
        return 'positive', score
    elif score <= -0.05:
        return 'negative', score
    else:
        return 'neutral', score

In [19]:
df[['sentiment_label', 'sentiment_score']] = df['cleaned_text'].apply(lambda x: pd.Series(get_sentiment(x)))


In [20]:
print(df[['cleaned_text', 'sentiment_label', 'sentiment_score']].head())

            cleaned_text sentiment_label  sentiment_score
0                                neutral           0.0000
1                   good        positive           0.4404
2  time working properly         neutral           0.0000
3           good service        positive           0.4404
4                    use         neutral           0.0000


Key word extraction

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=100)  # unigrams & bigrams
tfidf_matrix = tfidf.fit_transform(df['cleaned_text'])

# Get feature names and their importance
feature_names = tfidf.get_feature_names_out()
importance = tfidf_matrix.toarray().sum(axis=0)

keywords = pd.DataFrame({'keyword': feature_names, 'importance': importance})
keywords = keywords.sort_values(by='importance', ascending=False)
print(keywords.head(20))


        keyword  importance
44         good  203.815916
5           app  203.423781
14         best   93.688669
60         nice   68.019864
10         bank   64.515777
11      banking   43.491879
89          use   38.771452
49         like   35.034529
37    excellent   34.994959
22       dashen   34.891723
7   application   34.726421
63          one   33.847635
94      working   33.655779
62           ok   33.340265
46        great   32.906754
39         fast   32.159410
93         work   31.727015
99          wow   30.358187
53       mobile   29.474181
30         easy   28.339058


In [22]:
themes = {
    'Account Access Issues': ['login', 'password', 'account', 'access'],
    'Transaction Performance': ['transfer', 'payment', 'slow', 'delay'],
    'UI & Experience': ['interface', 'easy', 'app', 'design'],
    'Customer Support': ['support', 'help', 'service', 'response']
}


In [23]:
def assign_theme(text):
    matched = []
    for theme, keywords_list in themes.items():
        for kw in keywords_list:
            if kw in text:
                matched.append(theme)
                break
    return matched if matched else ['Other']

df['themes'] = df['cleaned_text'].apply(assign_theme)
print(df[['cleaned_text', 'themes']].head())


            cleaned_text              themes
0                                    [Other]
1                   good             [Other]
2  time working properly             [Other]
3           good service  [Customer Support]
4                    use             [Other]


In [24]:
df.to_csv("../data/task2_reviews_analysis.csv", index=False)
