## Import data from the csv file

In [1]:
import pandas as pd
import os

df = pd.read_csv(os.path.join('data','sentiment-analysis-dataset-google-play-app-reviews.csv'))
df = df[['content','score']] # select content and score
df.dropna()
df['sentiment'] = df['score'].apply(lambda x: 'positive' if x >= 4 else 'negative' if x <= 2 else 'neutral')
df = df[['content','sentiment']]
df = df[df['sentiment'] != 'neutral']  # Exclude neutral reviews

print(df.shape)
print(df.head())

(10942, 2)
                                             content sentiment
0  I love this app, but I do have one major gripe...  negative
1  Trash. Yes, it has some nice nifty features bu...  negative
2  OMG the UI is awful, seriously you have popup ...  negative
3  I've been using the app for a while and since ...  negative
4  Unable to register with an email. Clicking"con...  negative


## Clear content

In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions

nltk.download(['punkt', 'wordnet', 'stopwords','punkt_tab'])
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
def clear_content(content):
    '''this function will clear the text by following steps:'''
    # step 1: expand contractions 
    content = contractions.fix(content) 
    # step 2: convert text to lower 
    content = content.lower()
    # step 3: remove special characters
    content = re.sub(r'[^a-zA-Z\s]', '', content) 
    # step 4: tokenization
    tokens = word_tokenize(content)
    # step 5: lemmatization
    cleared = []
    for word in tokens:
        if (word not in stop_words) and len(word) > 2: # exclude stop words and small words like a, an, it, as
            cleared.append(lemmatizer.lemmatize(word))
    
    return ' '.join(cleared)

In [4]:
df['content'] = df['content'].apply(clear_content)
df.head()
df.describe

<bound method NDFrame.describe of                                                  content sentiment
0      love app one major gripe want option buy premi...  negative
1      trash yes nice nifty feature lack complete nec...  negative
2      omg awful seriously popup premium every second...  negative
3      using app since last week acting weird receive...  negative
4      unable register email clickingcontinue email t...  negative
...                                                  ...       ...
16087  used several year one best digital planner fan...  positive
16088  love love keep day forever cross like piece pa...  positive
16089                                          great app  positive
16090                          helpful user friendly app  positive
16091  used app year really find helpful like synched...  positive

[10942 rows x 2 columns]>

## Vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,1), stop_words='english')
X = tfidf.fit_transform(df['content'])
y = df['sentiment']

## Train

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

# logistic regression
lr_model = LogisticRegression(max_iter=5000, class_weight='balanced', random_state=0)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.84      0.90      0.87      1038
    positive       0.91      0.85      0.88      1151

    accuracy                           0.88      2189
   macro avg       0.88      0.88      0.88      2189
weighted avg       0.88      0.88      0.88      2189



## Feature Extraction

In [7]:
# feature extractions
feature_names = tfidf.get_feature_names_out()
coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': lr_model.coef_[0]
    })

# normalize the coefficient for display (% of the color)
mean_coef = coef_df['coefficient'].mean()
std_coef = coef_df['coefficient'].std()
coef_df['normalized_coefficient'] = (coef_df['coefficient'] - mean_coef) / std_coef

positive_keywords = coef_df.sort_values('coefficient', ascending=False).head(30) # pick top 20
negative_keywords = coef_df.sort_values('coefficient').head(30)

positive_keywords_dict = positive_keywords.set_index('feature')['normalized_coefficient'].to_dict()
negative_keywords_dict = negative_keywords.set_index('feature')['normalized_coefficient'].to_dict()

print(coef_df.sort_values('coefficient', ascending=False))
print("\nTop Positive Keywords:")
print(positive_keywords_dict)
print("\nTop Negative Keywords:")
print(negative_keywords_dict)

           feature  coefficient  normalized_coefficient
2380          love     5.549449               11.126464
1773         great     5.390792               10.808801
1313          easy     4.608200                9.241892
503           best     4.324643                8.674153
4905          wish     3.719080                7.461692
...            ...          ...                     ...
845    complicated    -2.626188               -5.242830
4661  uninstalling    -2.649653               -5.289811
4659     uninstall    -2.727287               -5.445251
1113       deleted    -2.946235               -5.883629
4744       useless    -4.585184               -9.165141

[5000 rows x 3 columns]

Top Positive Keywords:
{'love': 11.12646411915547, 'great': 10.808800587495753, 'easy': 9.241891519846098, 'best': 8.674153363777542, 'wish': 7.461691787196903, 'amazing': 7.1930007348137766, 'excellent': 6.861168179124314, 'helpful': 6.80387851382512, 'perfect': 6.747182868473055, 'thank': 6.72604610

## Other methods for keywords

In [8]:
from rake_nltk import Rake

def extract_rake_keywords(texts, top_n=20):
    """
    Extracts 1-gram keywords using the RAKE algorithm.
    """
    r = Rake()
    keywords = []
    
    for text in texts:
        # split text into 1-gram
        words = text.split()
        preprocessed_text = ' '.join(words)

        # extract keywords
        r.extract_keywords_from_text(preprocessed_text)
        for phrase in r.get_ranked_phrases():
            if len(phrase.split()) == 1:  # ensure it's a single word
                keywords.append(phrase)
    
    return pd.Series(keywords).value_counts().head(top_n)



In [9]:
# Extract keywords using RAKE for positive and negative reviews separately
positive_rake_keywords = extract_rake_keywords(df[df['sentiment'] == 'positive']['content']).index.tolist()
negative_rake_keywords = extract_rake_keywords(df[df['sentiment'] == 'negative']['content']).index.tolist()

print("\nRAKE Positive Keywords:")
print(positive_rake_keywords)
print("\nRAKE Negative Keywords:")
print(negative_rake_keywords)


RAKE Positive Keywords:
['good', 'great', 'nice', 'love', 'excellent', 'useful', 'awesome', 'amazing', 'helpful', 'perfect', 'best', 'cool', 'like', 'bad', 'super', 'superb', 'fun', 'wonderful', 'loved', 'obsessed']

RAKE Negative Keywords:
['good', 'complicated', 'bad', 'confusing', 'worst', 'nice', 'working', 'hate', 'terrible', 'expensive', 'useless', 'suck', 'crashing', 'complex', 'okay', 'intuitive', 'reminder', 'sure', 'usefull', 'acceptable']


In [10]:
# from keybert import KeyBERT # not performing well

# def extract_keybert_keywords(texts, top_n=20):
#     """Extracts keywords using KeyBERT."""
#     combined_text = ' '.join(texts)
#     kw_model = KeyBERT()
#     keywords = kw_model.extract_keywords(
#         combined_text,
#         keyphrase_ngram_range=(1,2),
#         stop_words='english',
#         top_n=top_n
#     )
#     return dict(keywords)

In [11]:
# # Extract keywords using KeyBERT for positive and negative reviews separately
# positive_keybert_keywords = extract_keybert_keywords(df[df['sentiment'] == 'positive']['content'])
# negative_keybert_keywords = extract_keybert_keywords(df[df['sentiment'] == 'negative']['content'])

# print("\nKeyBERT Positive Keywords:")
# print(list(positive_keybert_keywords.keys()))
# print("\nKeyBERT Negative Keywords:")
# print(list(negative_keybert_keywords.keys()))

## Display - Linear Regression Scores

In [12]:
# download pretrained model to models\trained_model from https://figshare.com/articles/dataset/GoogleNews-vectors-negative300/23601195?file=41403483
from gensim.models import KeyedVectors
import os

# Load the model in binary format
model_path = os.path.join('trained_model','GoogleNews-vectors-negative300.bin.gz')
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)


In [13]:
# demo
similarity = word2vec_model.similarity('crashing', 'crashes')
print(f"Cosine similarity: {similarity}")

Cosine similarity: 0.47624388337135315


In [14]:
import numpy as np

def score_to_trans(x):
    '''transformation function (2sigmoid(x)-1) that maps values to number between +1 & -1'''
    return 2*1/(1+np.exp(-x)) -1 

def calculate_score_lr(new_input: str, positive_words: dict, negative_words: dict, similarity_threshold=0.5):
    '''Assign the adjusted weight of each fragment in new input'''
    result = {}
    new_review_cleared = clear_content(new_input)
    new_review_list = new_review_cleared.split(" ")
    agg_words = positive_words | negative_words
    for frag in new_review_list:
        acc_score = 0
        matches = 0
        for k, v in agg_words.items():
            try:
                similarity = word2vec_model.similarity(frag, k)
                if similarity >= similarity_threshold:
                    acc_score += similarity * v # generate new score for color
                    matches += 1
            except:
                pass # skip the fragments not in the 

        result[frag] = 0 if matches == 0 else acc_score/matches
    '''Generate the transparency based on normal cdf'''
    for w in result:
        if result[w] != 0: # faster processing
            result[w] = score_to_trans(result[w])
    return result


In [15]:
new_review = "I've been using this app for weeks and it's absolutely terrible! Constant crashes and poor performance make it unusable."
# new_review = "Creating an account is supposed to ALLOW you, not FORCE you. Uninstalled without trying it"
# new_review = "This last update has brought all kinds of bugs. Lists disappearing randomly, lists getting renamed, permissions randomly getting changed. All of these problems indicated serious bugs in the data model and potentially spillage of customer data. I would suggest looking into a new tasks app because the drop in quality has been dramatic."
# preprocess
new_review_cleared = clear_content(new_review)
new_review_list = new_review_cleared.split(" ")
# new_review_list = new_review.split(" ")
print(new_review_list)


['using', 'app', 'week', 'absolutely', 'terrible', 'constant', 'crash', 'poor', 'performance', 'make', 'unusable']


In [16]:
scores = calculate_score_lr(new_review, positive_keywords_dict, negative_keywords_dict, similarity_threshold=0.5)
print(scores)


{'using': 0, 'app': 0, 'week': 0, 'absolutely': 0.09908282164706539, 'terrible': 0.765633482377807, 'constant': 0, 'crash': -0.8646707037832555, 'poor': 0, 'performance': 0, 'make': 0, 'unusable': -0.991033953997229}


In [17]:
def highlight_sentence_html(sentence:str, sentiment_dict:dict, similarity_threshold=0.5):
    words = sentence.split()
    highlighted_words = [] # for display
    
    for word in words:
        clean_word = word.lower().strip('.,!?;:()[]{}""\'')

        # assign score to the raw text
        for k in sentiment_dict:
            try: # secondary mapping
                similarity = word2vec_model.similarity(clean_word, k)
                if similarity >= similarity_threshold:
                    value = similarity*sentiment_dict[k]      
                    break # break when find the first match
                else: # match below treshold
                    value = 0
            except: # no match
                value = 0
        
        # formatted output
        if value < 0:
            # negative word -> red
            brightness = 255 - int((abs(value) * 255))
            color = f"rgb(255, {brightness}, {brightness})"
            highlighted_words.append(f'<span style="background-color: {color};">{word}</span>')
        elif value > 0:
            # positive words -> green
            brightness = 255 - int((value * 255))
            color = f"rgb({brightness}, 255, {brightness})"
            highlighted_words.append(f'<span style="background-color: {color};">{word}</span>')
        else: # below treshold or no match, directly append
            highlighted_words.append(word)
    
    return ' '.join(highlighted_words)

highlighted_html = highlight_sentence_html(new_review, scores)

# save to an HTML 
with open("highlighted_sentence.html", "w") as file:
    file.write(f"<html><body style='font-size: 18px; padding: 20px;'>{highlighted_html}</body></html>")


## Display - RAKE Score
Idea: calulate the average similarity to positive and negative to assign scores to each words.

In [18]:
def word_to_center_similarity(word:str, word_list:list):
    '''this function calculates the similarity between a word to the center point of a list of words'''
    # find center point
    vectors = []
    for w in word_list:
        vectors.append(word2vec_model[w])
    centroid_vector = np.mean(vectors, axis=0) 
    word_vecotor = word2vec_model[word]
    # similarity
    similarity = np.dot(centroid_vector, word_vecotor) / (np.linalg.norm(centroid_vector) * np.linalg.norm(word_vecotor))

    return similarity


def calculate_score_rake(new_input: str, positive_words: list, negative_words: list, treshold = 0.1):
    '''This function calculates the average score for each keywords'''
    result = {}
    new_review_cleared = clear_content(new_input)
    new_review_list = new_review_cleared.split(" ")

    for frag in new_review_list: # compare similarity of the word to center of the list
        if frag not in word2vec_model:
            result[frag] = 0
        else:
            pos_similarity = word_to_center_similarity(frag, positive_words)
            neg_similarity = word_to_center_similarity(frag, negative_words)
            score = pos_similarity-neg_similarity
            if score > treshold:
                result[frag] = 1
            elif score < -treshold:
                result[frag] = -1
            else:
                result[frag] = 0

    return result

In [19]:
scores_rake = calculate_score_rake(new_review, positive_rake_keywords, negative_rake_keywords)
print(scores_rake)

{'using': 0, 'app': 0, 'week': 0, 'absolutely': 0, 'terrible': 0, 'constant': 0, 'crash': -1, 'poor': 0, 'performance': 0, 'make': 0, 'unusable': -1}


In [20]:
highlighted_html = highlight_sentence_html(new_review, scores_rake)
with open("highlighted_sentence_rake.html", "w") as file:
    file.write(f"<html><body style='font-size: 18px; padding: 20px;'>{highlighted_html}</body></html>")
