In [445]:
import nltk
import re
import gensim
import pandas as pd
import gensim.corpora as corpora
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pprint import pprint



nltk.download("stopwords")
#used for lemmatizing
nltk.download("wordnet")
# 
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Noor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Noor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# TASK 1: GATHERING REVIEWS

### Google Play Scrapper

I used the following scrapper code that was provided by the professor
(source: https://www.linkedin.com/pulse/how-scrape-google-play-reviews-4-simple-steps-using-python-kundi/ )


from google_play_scraper import app, Sort, reviews_all

import pandas as pd

import numpy as np


us_reviews = reviews_all(

    'com.Slack',
    
    sort=Sort.NEWEST,
    
)

    
df_busu = pd.DataFrame(np.array(us_reviews),columns=['review'])


df_busu = df_busu.join(pd.DataFrame(df_busu.pop('review').tolist()))




In [508]:
#import data frames
df_wire= pd.read_csv(r"export_wire_UTF8.csv", encoding=('ISO-8859-1'))
df_slack= pd.read_csv(r"export_slack_UTF8.csv", encoding=('ISO-8859-1'))
df_spike= pd.read_csv(r"export_spike_UTF8.csv", encoding=('ISO-8859-1'))
df_signal= pd.read_csv(r"export_signal_UTF8.csv", encoding=('ISO-8859-1'))
df_zoho= pd.read_csv(r"export_zoho_UTF8.csv", encoding=('ISO-8859-1'))

#drop unneeded columns
df_wire.drop(df_wire.columns[[0,2,5,6,7,8,9]], axis=1, inplace=True)
df_slack.drop(df_slack.columns[[0,2,5,6,7,8,9]], axis=1, inplace=True)
df_spike.drop(df_spike.columns[[0,2,5,6,7,8,9]], axis=1, inplace=True)
df_signal.drop(df_signal.columns[[0,2,5,6,7,8,9]], axis=1, inplace=True)
df_zoho.drop(df_zoho.columns[[0,2,5,6,7,8,9]], axis=1, inplace=True)

#add 'package name' column
df_wire.insert(0, 'Package Name', "com.wire")
df_slack.insert(0, 'Package Name', "com.Slack")
df_spike.insert(0, 'Package Name', "com.pingapp.app")
df_signal.insert(0, 'Package Name', "org.thoughtcrime.securesms")
df_zoho.insert(0, 'Package Name', "com.zoho.chat")

#rename columns as per project instructions
df_wire.columns = ['Package Name','Reviewer Name', 'Review', 'Rating']
df_slack.columns = ['Package Name','Reviewer Name', 'Review', 'Rating']
df_spike.columns = ['Package Name','Reviewer Name', 'Review', 'Rating']
df_signal.columns = ['Package Name','Reviewer Name', 'Review', 'Rating']
df_zoho.columns = ['Package Name','Reviewer Name', 'Review', 'Rating']

df_slack.head()

Unnamed: 0,Package Name,Reviewer Name,Review,Rating
0,com.Slack,Tiara Udziela,No longer functional. Used to be pretty great ...,1
1,com.Slack,Alexander Baumann,Update: support reached out and later and fixe...,3
2,com.Slack,Clint McGill,It was fine until my work consolidated multipl...,3
3,com.Slack,Joshua Hicks,Mostly works and I live off of it when not at ...,3
4,com.Slack,Tawittle,Didn't work. Can't view the slack workplace on...,1


# TASK 2: PREPROCESS YOUR TEXT

### I. Removing punctuation and special characters

In [447]:
df_wire["Review"]= df_wire["Review"].str.replace(r'[^\w\s]',"", regex=True)
df_wire["Reviewer Name"]= df_wire["Reviewer Name"].str.replace(r'[^\w\s]',"",regex=True)
df_wire["Package Name"]= df_wire["Package Name"].str.replace(r'.',"",regex=True)

df_slack["Review"]= df_slack["Review"].str.replace(r'[^\w\s]',"", regex=True)
df_slack["Reviewer Name"]= df_slack["Reviewer Name"].str.replace(r'[^\w\s]',"",regex=True)
df_slack["Package Name"]= df_slack["Package Name"].str.replace(r'.',"",regex=True)

df_spike["Review"]= df_spike["Review"].str.replace(r'[^\w\s]',"", regex=True)
df_spike["Reviewer Name"]= df_spike["Reviewer Name"].str.replace(r'[^\w\s]',"",regex=True)
df_spike["Package Name"]= df_spike["Package Name"].str.replace(r'.',"",regex=True)

df_signal["Review"]= df_signal["Review"].str.replace(r'[^\w\s]',"", regex=True)
df_signal["Reviewer Name"]= df_signal["Reviewer Name"].str.replace(r'[^\w\s]',"",regex=True)
df_signal["Package Name"]= df_signal["Package Name"].str.replace(r'.',"",regex=True)

df_zoho["Review"]= df_zoho["Review"].str.replace(r'[^\w\s]',"", regex=True)
df_zoho["Reviewer Name"]= df_zoho["Reviewer Name"].str.replace(r'[^\w\s]',"",regex=True)
df_zoho["Package Name"]= df_zoho["Package Name"].str.replace(r'.',"",regex=True)

### II. Removing emojis

In [448]:
# I exported my xlsx files as Unicode (UTF-8) and encoding with ISO-8859-1.
# By doing this, emojis cannot be displayed anymore. All emojis were converted into "??" characters
# I removed all punctuation in the previous step, therefore removing all "emojis"

### III. Convert numbers into strings

In [449]:
df_wire["Reviewer Name"]= df_wire["Reviewer Name"].apply(str)
df_wire["Review"]= df_wire["Review"].apply(str)
df_wire["Rating"]= df_wire["Rating"].apply(str)

df_slack["Reviewer Name"]= df_slack["Reviewer Name"].apply(str)
df_slack["Review"]= df_slack["Review"].apply(str)
df_slack["Rating"]= df_slack["Rating"].apply(str)

df_spike["Reviewer Name"]= df_spike["Reviewer Name"].apply(str)
df_spike["Review"]= df_spike["Review"].apply(str)
df_spike["Rating"]= df_spike["Rating"].apply(str)

df_signal["Reviewer Name"]= df_signal["Reviewer Name"].apply(str)
df_signal["Review"]= df_signal["Review"].apply(str)
df_signal["Rating"]= df_signal["Rating"].apply(str)

df_zoho["Reviewer Name"]= df_zoho["Reviewer Name"].apply(str)
df_zoho["Review"]= df_zoho["Review"].apply(str)
df_zoho["Rating"]= df_zoho["Rating"].apply(str)

### IV. Remove white spaces (leading and trailing spaces)

In [450]:
df_wire= df_wire.replace(r"^ +| +$", r"", regex=True)
df_slack= df_slack.replace(r"^ +| +$", r"", regex=True)
df_spike= df_spike.replace(r"^ +| +$", r"", regex=True)
df_signal= df_signal.replace(r"^ +| +$", r"", regex=True)
df_zoho= df_zoho.replace(r"^ +| +$", r"", regex=True)

### V. Convert to lowercase

In [451]:
df_wire["Package Name"] = df_wire["Package Name"].str.lower()
df_wire["Reviewer Name"]= df_wire["Reviewer Name"].str.lower()
df_wire["Review"]= df_wire["Review"].str.lower()
df_wire["Rating"]= df_wire["Rating"].str.lower()

df_slack["Package Name"] = df_slack["Package Name"].str.lower()
df_slack["Reviewer Name"]= df_slack["Reviewer Name"].str.lower()
df_slack["Review"]= df_slack["Review"].str.lower()
df_slack["Rating"]= df_slack["Rating"].str.lower()

df_spike["Package Name"] = df_spike["Package Name"].str.lower()
df_spike["Reviewer Name"]= df_spike["Reviewer Name"].str.lower()
df_spike["Review"]= df_spike["Review"].str.lower()
df_spike["Rating"]= df_spike["Rating"].str.lower()

df_signal["Package Name"] = df_signal["Package Name"].str.lower()
df_signal["Reviewer Name"]= df_signal["Reviewer Name"].str.lower()
df_signal["Review"]= df_signal["Review"].str.lower()
df_signal["Rating"]= df_signal["Rating"].str.lower()

df_zoho["Package Name"] = df_zoho["Package Name"].str.lower()
df_zoho["Reviewer Name"]= df_zoho["Reviewer Name"].str.lower()
df_zoho["Review"]= df_zoho["Review"].str.lower()
df_zoho["Rating"]= df_zoho["Rating"].str.lower()

Unnamed: 0,Package Name,Reviewer Name,Review,Rating
0,comwire,glen w,ive used this app since 2015 at first it was a...,1
1,comwire,k h,downloaded this app for personal use and i abs...,5
2,comwire,a google user,not a bad alternative to signal needs more cus...,2
3,comwire,a google user,very clean no ads got all the major functions ...,5
4,comwire,a google user,the core messaging functionality is far too un...,1


### VI. Remove stopwords 

In [452]:
#tokenize first
df_wire['Review']= df_wire['Review'].apply(lambda row: nltk.word_tokenize(row))
#remove stop words
df_wire['Review']= df_wire['Review'].apply(lambda x: [item for item in x if item not in stop_words])

#tokenize first
df_slack['Review']= df_slack['Review'].apply(lambda row: nltk.word_tokenize(row))
#remove stop words
df_slack['Review']= df_slack['Review'].apply(lambda x: [item for item in x if item not in stop_words])

#tokenize first
df_spike['Review']= df_spike['Review'].apply(lambda row: nltk.word_tokenize(row))
#remove stop words
df_spike['Review']= df_spike['Review'].apply(lambda x: [item for item in x if item not in stop_words])

#tokenize first
df_signal['Review']= df_signal['Review'].apply(lambda row: nltk.word_tokenize(row))
#remove stop words
df_signal['Review']= df_signal['Review'].apply(lambda x: [item for item in x if item not in stop_words])

#tokenize first
df_zoho['Review']= df_zoho['Review'].apply(lambda row: nltk.word_tokenize(row))
#remove stop words
df_zoho['Review']= df_zoho['Review'].apply(lambda x: [item for item in x if item not in stop_words])

Unnamed: 0,Package Name,Reviewer Name,Review,Rating
0,comzohochat,lisa robinson,"[allowed, categorize, texts, executives, vs, d...",5
1,comzohochat,bob carter,"[looks, like, since, anyone, rated, thjs, app,...",2
2,comzohochat,rifael levine,"[app, pretty, good, issues, runs, slow, scroll...",3
3,comzohochat,ramdas das,"[user, friendly, meaning, intuitive, unsuccess...",2
4,comzohochat,a google user,"[yahoo, messenger, quit, analyzed, many, one, ...",5


### VII. Lemmatize the reviews

In [453]:
def lema(review): 
    lemmatized_list =[]

    for i in range(len(review)):
        lemmatized_list.append(lemmatizer.lemmatize(review[i]))
    return lemmatized_list
    
for row in range(len(df_zoho["Review"])):
    df_zoho['Review'][row] = lema(df_zoho["Review"][row])
    
for row in range(len(df_zoho["Review"])):
    df_wire['Review'][row] = lema(df_wire["Review"][row])

for row in range(len(df_zoho["Review"])):
    df_slack['Review'][row] = lema(df_slack["Review"][row])

for row in range(len(df_zoho["Review"])):
    df_signal['Review'][row] = lema(df_signal["Review"][row])   
    
for row in range(len(df_zoho["Review"])):
    df_spike['Review'][row] = lema(df_spike["Review"][row])

### VIII. Output 15 sample pre-processed reviews in your notebook

In [509]:
df_zoho.head(15)

Unnamed: 0,Package Name,Reviewer Name,Review,Rating
0,com.zoho.chat,Lisa Robinson,It has allowed me to categorize my texts for e...,5
1,com.zoho.chat,Bob Carter,It looks like it has been a while since anyone...,2
2,com.zoho.chat,Rifael Levine,"The app is pretty good, but there are a few is...",3
3,com.zoho.chat,Ramdas Das,"Not very user friendly, meaning that it's not ...",2
4,com.zoho.chat,A Google user,"When Yahoo messenger quit, we analyzed many an...",5
5,com.zoho.chat,Manjunath Laksh,"Simple messaging app. My favorite feature is, ...",5
6,com.zoho.chat,Ben,Good tool! Cannot reccommend it enough. The re...,4
7,com.zoho.chat,ninad kambli,initially i was not able to join the meets. no...,4
8,com.zoho.chat,Agbo Gabriel E,Cool app. Please add the break option and coun...,4
9,com.zoho.chat,HJ Molenkamp,Better than whatsapp. Reliable. Nice look and ...,4


# TASK 3: SENTIMENT ANALYSIS 

### I. Use Textblob for calculating Reviews Sentiment 

In [470]:
#copy reviews onto new df
df_textblob_wire = pd.DataFrame(columns=["Review","Polarity"]) 
df_textblob_wire["Review"] = df_wire[['Review']].copy()
df_textblob_wire.insert(0, 'Package Name', "com.wire")

df_textblob_slack = pd.DataFrame(columns=["Review","Polarity"]) 
df_textblob_slack["Review"] = df_slack[['Review']].copy()
df_textblob_slack.insert(0, 'Package Name', "com.Slack")

df_textblob_signal = pd.DataFrame(columns=["Review","Polarity"]) 
df_textblob_signal["Review"] = df_signal[['Review']].copy()
df_textblob_signal.insert(0, 'Package Name', "org.thoughtcrime.securesms")

df_textblob_spike = pd.DataFrame(columns=["Review","Polarity"]) 
df_textblob_spike["Review"] = df_spike[['Review']].copy()
df_textblob_spike.insert(0, 'Package Name', "com.pingapp.app")

df_textblob_zoho = pd.DataFrame(columns=["Review","Polarity"]) 
df_textblob_zoho["Review"] = df_zoho[['Review']].copy()
df_textblob_zoho.insert(0, 'Package Name', "com.zoho.chat")
      
def blobber(review, df,row):
    df.at[row,'Polarity'] = TextBlob(review).sentiment

#merge list into one string
def joinlist(review,row, newDF):
    for i in range(len(review)):
        temp1 = " ".join(review)
    newDF.at[row,'Review']= temp1

#wire
for row in range(251):
    joinlist(df_wire["Review"][row],row,df_textblob_wire)
for row in range(251):
     blobber(df_textblob_wire["Review"][row], df_textblob_wire,row)
        
#slack
for row in range(251):
    joinlist(df_slack["Review"][row],row,df_textblob_slack)
for row in range(251):
     blobber(df_textblob_slack["Review"][row], df_textblob_slack,row)

#signal
for row in range(251):
    joinlist(df_signal["Review"][row],row,df_textblob_signal)
for row in range(251):
     blobber(df_textblob_signal["Review"][row], df_textblob_signal,row)
    
#spike
for row in range(251):
    joinlist(df_spike["Review"][row],row,df_textblob_spike)
for row in range(251):
     blobber(df_textblob_spike["Review"][row], df_textblob_spike,row)
    
#zoho
for row in range(251):
    joinlist(df_zoho["Review"][row],row,df_textblob_zoho)
for row in range(251):
     blobber(df_textblob_zoho["Review"][row], df_textblob_zoho,row)

Unnamed: 0,Package Name,Review,Polarity
0,com.pingapp.app,finally email client everything need great per...,"(0.34232804232804237, 0.5994708994708995)"
1,com.pingapp.app,better really wanted like app load one account...,"(0.3166666666666667, 0.49999999999999994)"
2,com.pingapp.app,start cant find draft dont finish email never ...,"(0.22807192807192805, 0.5008685758685758)"
3,com.pingapp.app,want love spike great email client clean inter...,"(0.33, 0.6050000000000001)"
4,com.pingapp.app,ive waiting year app like smartphones computer...,"(0.07500000000000001, 0.6375000000000001)"
5,com.pingapp.app,ui nice take minute get used excellent cross p...,"(0.4, 0.586111111111111)"
6,com.pingapp.app,recently downloaded say almost perfect ive tri...,"(-0.05312500000000001, 0.45416666666666666)"
7,com.pingapp.app,love email client conversational customizable ...,"(0.43000000000000005, 0.6699999999999999)"
8,com.pingapp.app,ive tried countless email apps pretty unique c...,"(0.28229166666666666, 0.7479166666666667)"
9,com.pingapp.app,want like app look like lot good feature doesn...,"(-0.01666666666666668, 0.35000000000000003)"


### II. Textblob output tables

In [489]:
df_textblob_wire.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,com.wire,ive used app since 2015 first great keep relea...,"(0.10714285714285714, 0.3629251700680272)"
1,com.wire,downloaded app personal use absolutely love fa...,"(0.14404761904761906, 0.6357142857142858)"
2,com.wire,bad alternative signal need customizability po...,"(-0.14375000000000002, 0.42916666666666664)"
3,com.wire,clean ad got major function video call audio c...,"(0.2791666666666667, 0.4357142857142858)"
4,com.wire,core messaging functionality far unreliable of...,"(-0.15000000000000002, 0.4361111111111111)"
5,com.wire,wire great privacy great encryption needing ph...,"(0.45, 0.55)"
6,com.wire,ive using app couple year communicate family d...,"(0.18333333333333335, 0.4916666666666667)"
7,com.wire,seems everytime try use voice changer app woul...,"(0.19999999999999998, 0.625)"
8,com.wire,could really awesome app one flaw hard get app...,"(0.1787878787878788, 0.5257575757575758)"
9,com.wire,feature messaging app ive tried matter uninsta...,"(0.19999999999999998, 0.5)"


In [490]:
df_textblob_slack.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,com.Slack,longer functional used pretty great minor quir...,"(-0.027777777777777773, 0.538888888888889)"
1,com.Slack,update support reached later fixed 1 2 ty 3 st...,"(0.12999999999999998, 0.29000000000000004)"
2,com.Slack,fine work consolidated multiple account one ac...,"(-0.00714285714285714, 0.4197619047619048)"
3,com.Slack,mostly work live laptop last month getting err...,"(0.037272727272727256, 0.47333333333333333)"
4,com.Slack,didnt work cant view slack workplace mobile wo...,"(0.22708333333333336, 0.4979166666666667)"
5,com.Slack,issue believe related android 13 google pixel ...,"(-0.025454545454545462, 0.40181818181818174)"
6,com.Slack,hadnt used awhile opened one phone said needed...,"(0.13333333333333333, 0.26666666666666666)"
7,com.Slack,dont get fix break wasnt broken used able star...,"(0.024999999999999994, 0.25625)"
8,com.Slack,rarely leave review said app unusable aside no...,"(0.11000000000000001, 0.41)"
9,com.Slack,past week half app stopped working properly me...,"(-0.023333333333333338, 0.2333333333333333)"


In [491]:
df_textblob_signal.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,org.thoughtcrime.securesms,love app using whenever possible however thing...,"(-0.020000000000000004, 0.6625)"
1,org.thoughtcrime.securesms,great app used year discontinuation sm support...,"(0.3891233766233766, 0.627435064935065)"
2,org.thoughtcrime.securesms,used great app taking near 16 hour message del...,"(0.15833333333333333, 0.575)"
3,org.thoughtcrime.securesms,signal great app used several year part greatn...,"(0.05000000000000001, 0.7291666666666666)"
4,org.thoughtcrime.securesms,worked excellently video callsat first nobody ...,"(0.4166666666666667, 0.4777777777777778)"
5,org.thoughtcrime.securesms,great le used exceptional app messaging phone ...,"(0.09309523809523809, 0.5488095238095239)"
6,org.thoughtcrime.securesms,app perfect many user signal decided remove sm...,"(0.28824404761904765, 0.5889880952380954)"
7,org.thoughtcrime.securesms,removing support sm split messaging apps peopl...,"(0.1796875, 0.3203125)"
8,org.thoughtcrime.securesms,app great private endtoend encryptesmd convers...,"(0.3416666666666667, 0.6061011904761905)"
9,org.thoughtcrime.securesms,goto text app year highly recommended removing...,"(0.11833333333333333, 0.465)"


In [492]:
df_textblob_spike.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,com.pingapp.app,finally email client everything need great per...,"(0.34232804232804237, 0.5994708994708995)"
1,com.pingapp.app,better really wanted like app load one account...,"(0.3166666666666667, 0.49999999999999994)"
2,com.pingapp.app,start cant find draft dont finish email never ...,"(0.22807192807192805, 0.5008685758685758)"
3,com.pingapp.app,want love spike great email client clean inter...,"(0.33, 0.6050000000000001)"
4,com.pingapp.app,ive waiting year app like smartphones computer...,"(0.07500000000000001, 0.6375000000000001)"
5,com.pingapp.app,ui nice take minute get used excellent cross p...,"(0.4, 0.586111111111111)"
6,com.pingapp.app,recently downloaded say almost perfect ive tri...,"(-0.05312500000000001, 0.45416666666666666)"
7,com.pingapp.app,love email client conversational customizable ...,"(0.43000000000000005, 0.6699999999999999)"
8,com.pingapp.app,ive tried countless email apps pretty unique c...,"(0.28229166666666666, 0.7479166666666667)"
9,com.pingapp.app,want like app look like lot good feature doesn...,"(-0.01666666666666668, 0.35000000000000003)"


In [493]:
df_textblob_zoho.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,com.zoho.chat,allowed categorize text executive v driver two...,"(0.0, 0.0)"
1,com.zoho.chat,look like since anyone rated thjs app appears ...,"(0.18, 0.375)"
2,com.zoho.chat,app pretty good issue run slow scrolling chopp...,"(0.14488636363636362, 0.4854545454545455)"
3,com.zoho.chat,user friendly meaning intuitive unsuccessful i...,"(0.20694444444444446, 0.3458333333333334)"
4,com.zoho.chat,yahoo messenger quit analyzed many one two see...,"(0.3997354497354497, 0.5751322751322752)"
5,com.zoho.chat,simple messaging app favorite feature edit mes...,"(0.275, 0.5955357142857143)"
6,com.zoho.chat,good tool reccommend enough remote work functi...,"(0.20476190476190476, 0.4)"
7,com.zoho.chat,initially able join meet able join meet app cr...,"(0.5, 0.625)"
8,com.zoho.chat,cool app please add break option countdown ava...,"(0.375, 0.525)"
9,com.zoho.chat,better whatsapp reliable nice look feel could ...,"(0.3666666666666667, 0.5166666666666667)"


### III. Use Vader for calculating Reviews Sentiment

In [494]:
#copy reviews onto new df
df_vader_wire = pd.DataFrame(columns=["Review","Polarity"]) 
df_vader_wire["Review"] = df_wire[['Review']].copy()
df_vader_wire.insert(0, 'Package Name', "com.wire")

df_vader_slack = pd.DataFrame(columns=["Review","Polarity"]) 
df_vader_slack["Review"] = df_slack[['Review']].copy()
df_vader_slack.insert(0, 'Package Name', "com.Slack")

df_vader_signal = pd.DataFrame(columns=["Review","Polarity"]) 
df_vader_signal["Review"] = df_signal[['Review']].copy()
df_vader_signal.insert(0, 'Package Name', "org.thoughtcrime.securesms")

df_vader_spike = pd.DataFrame(columns=["Review","Polarity"]) 
df_vader_spike["Review"] = df_spike[['Review']].copy()
df_vader_spike.insert(0, 'Package Name', "com.pingapp.app")

df_vader_zoho = pd.DataFrame(columns=["Review","Polarity"]) 
df_vader_zoho["Review"] = df_zoho[['Review']].copy()
df_vader_zoho.insert(0, 'Package Name', "com.zoho.chat")

vader_obj = SentimentIntensityAnalyzer()

def vader_func(review, df, row):
    df.at[row,'Polarity'] = vader_obj.polarity_scores(review)
    
#wire
for row in range(251):
    joinlist(df_wire["Review"][row],row,df_vader_wire)
for row in range(251):
     vader_func(df_vader_wire["Review"][row], df_vader_wire,row)
    
#slack
for row in range(251):
    joinlist(df_slack["Review"][row],row,df_vader_slack)
for row in range(251):
     vader_func(df_vader_slack["Review"][row], df_vader_slack,row)
    
#spike
for row in range(251):
    joinlist(df_spike["Review"][row],row,df_vader_spike)
for row in range(251):
     vader_func(df_vader_spike["Review"][row], df_vader_spike,row)

#signal    
for row in range(251):
    joinlist(df_signal["Review"][row],row,df_vader_signal)
for row in range(251):
     vader_func(df_vader_signal["Review"][row], df_vader_signal,row)
    
#zoho    
for row in range(251):
    joinlist(df_zoho["Review"][row],row,df_vader_zoho)
for row in range(251):
     vader_func(df_vader_zoho["Review"][row], df_vader_zoho,row)

### IV. Vader output tables

In [495]:
df_vader_wire.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,com.wire,ive used app since 2015 first great keep relea...,"{'neg': 0.035, 'neu': 0.845, 'pos': 0.12, 'com..."
1,com.wire,downloaded app personal use absolutely love fa...,"{'neg': 0.0, 'neu': 0.674, 'pos': 0.326, 'comp..."
2,com.wire,bad alternative signal need customizability po...,"{'neg': 0.159, 'neu': 0.611, 'pos': 0.23, 'com..."
3,com.wire,clean ad got major function video call audio c...,"{'neg': 0.05, 'neu': 0.734, 'pos': 0.216, 'com..."
4,com.wire,core messaging functionality far unreliable of...,"{'neg': 0.158, 'neu': 0.717, 'pos': 0.125, 'co..."
5,com.wire,wire great privacy great encryption needing ph...,"{'neg': 0.184, 'neu': 0.589, 'pos': 0.227, 'co..."
6,com.wire,ive using app couple year communicate family d...,"{'neg': 0.089, 'neu': 0.681, 'pos': 0.23, 'com..."
7,com.wire,seems everytime try use voice changer app woul...,"{'neg': 0.11, 'neu': 0.781, 'pos': 0.109, 'com..."
8,com.wire,could really awesome app one flaw hard get app...,"{'neg': 0.111, 'neu': 0.697, 'pos': 0.192, 'co..."
9,com.wire,feature messaging app ive tried matter uninsta...,"{'neg': 0.053, 'neu': 0.894, 'pos': 0.053, 'co..."


In [496]:
df_vader_slack.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,com.Slack,longer functional used pretty great minor quir...,"{'neg': 0.107, 'neu': 0.618, 'pos': 0.275, 'co..."
1,com.Slack,update support reached later fixed 1 2 ty 3 st...,"{'neg': 0.091, 'neu': 0.764, 'pos': 0.145, 'co..."
2,com.Slack,fine work consolidated multiple account one ac...,"{'neg': 0.263, 'neu': 0.627, 'pos': 0.111, 'co..."
3,com.Slack,mostly work live laptop last month getting err...,"{'neg': 0.203, 'neu': 0.797, 'pos': 0.0, 'comp..."
4,com.Slack,didnt work cant view slack workplace mobile wo...,"{'neg': 0.215, 'neu': 0.52, 'pos': 0.264, 'com..."
5,com.Slack,issue believe related android 13 google pixel ...,"{'neg': 0.047, 'neu': 0.953, 'pos': 0.0, 'comp..."
6,com.Slack,hadnt used awhile opened one phone said needed...,"{'neg': 0.0, 'neu': 0.92, 'pos': 0.08, 'compou..."
7,com.Slack,dont get fix break wasnt broken used able star...,"{'neg': 0.192, 'neu': 0.74, 'pos': 0.068, 'com..."
8,com.Slack,rarely leave review said app unusable aside no...,"{'neg': 0.136, 'neu': 0.803, 'pos': 0.061, 'co..."
9,com.Slack,past week half app stopped working properly me...,"{'neg': 0.091, 'neu': 0.618, 'pos': 0.291, 'co..."


In [497]:
df_vader_signal.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,org.thoughtcrime.securesms,love app using whenever possible however thing...,"{'neg': 0.115, 'neu': 0.797, 'pos': 0.088, 'co..."
1,org.thoughtcrime.securesms,great app used year discontinuation sm support...,"{'neg': 0.102, 'neu': 0.605, 'pos': 0.293, 'co..."
2,org.thoughtcrime.securesms,used great app taking near 16 hour message del...,"{'neg': 0.096, 'neu': 0.832, 'pos': 0.073, 'co..."
3,org.thoughtcrime.securesms,signal great app used several year part greatn...,"{'neg': 0.063, 'neu': 0.631, 'pos': 0.305, 'co..."
4,org.thoughtcrime.securesms,worked excellently video callsat first nobody ...,"{'neg': 0.075, 'neu': 0.823, 'pos': 0.102, 'co..."
5,org.thoughtcrime.securesms,great le used exceptional app messaging phone ...,"{'neg': 0.218, 'neu': 0.637, 'pos': 0.145, 'co..."
6,org.thoughtcrime.securesms,app perfect many user signal decided remove sm...,"{'neg': 0.083, 'neu': 0.58, 'pos': 0.336, 'com..."
7,org.thoughtcrime.securesms,removing support sm split messaging apps peopl...,"{'neg': 0.115, 'neu': 0.675, 'pos': 0.209, 'co..."
8,org.thoughtcrime.securesms,app great private endtoend encryptesmd convers...,"{'neg': 0.08, 'neu': 0.554, 'pos': 0.367, 'com..."
9,org.thoughtcrime.securesms,goto text app year highly recommended removing...,"{'neg': 0.049, 'neu': 0.716, 'pos': 0.236, 'co..."


In [498]:
df_vader_spike.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,com.pingapp.app,finally email client everything need great per...,"{'neg': 0.0, 'neu': 0.649, 'pos': 0.351, 'comp..."
1,com.pingapp.app,better really wanted like app load one account...,"{'neg': 0.036, 'neu': 0.619, 'pos': 0.345, 'co..."
2,com.pingapp.app,start cant find draft dont finish email never ...,"{'neg': 0.216, 'neu': 0.601, 'pos': 0.183, 'co..."
3,com.pingapp.app,want love spike great email client clean inter...,"{'neg': 0.074, 'neu': 0.49, 'pos': 0.436, 'com..."
4,com.pingapp.app,ive waiting year app like smartphones computer...,"{'neg': 0.06, 'neu': 0.803, 'pos': 0.137, 'com..."
5,com.pingapp.app,ui nice take minute get used excellent cross p...,"{'neg': 0.051, 'neu': 0.657, 'pos': 0.292, 'co..."
6,com.pingapp.app,recently downloaded say almost perfect ive tri...,"{'neg': 0.074, 'neu': 0.786, 'pos': 0.141, 'co..."
7,com.pingapp.app,love email client conversational customizable ...,"{'neg': 0.088, 'neu': 0.557, 'pos': 0.355, 'co..."
8,com.pingapp.app,ive tried countless email apps pretty unique c...,"{'neg': 0.124, 'neu': 0.629, 'pos': 0.247, 'co..."
9,com.pingapp.app,want like app look like lot good feature doesn...,"{'neg': 0.05, 'neu': 0.656, 'pos': 0.294, 'com..."


In [499]:
df_vader_zoho.head(10)

Unnamed: 0,Package Name,Review,Polarity
0,com.zoho.chat,allowed categorize text executive v driver two...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,com.zoho.chat,look like since anyone rated thjs app appears ...,"{'neg': 0.167, 'neu': 0.542, 'pos': 0.291, 'co..."
2,com.zoho.chat,app pretty good issue run slow scrolling chopp...,"{'neg': 0.0, 'neu': 0.668, 'pos': 0.332, 'comp..."
3,com.zoho.chat,user friendly meaning intuitive unsuccessful i...,"{'neg': 0.126, 'neu': 0.707, 'pos': 0.168, 'co..."
4,com.zoho.chat,yahoo messenger quit analyzed many one two see...,"{'neg': 0.0, 'neu': 0.523, 'pos': 0.477, 'comp..."
5,com.zoho.chat,simple messaging app favorite feature edit mes...,"{'neg': 0.119, 'neu': 0.594, 'pos': 0.287, 'co..."
6,com.zoho.chat,good tool reccommend enough remote work functi...,"{'neg': 0.0, 'neu': 0.794, 'pos': 0.206, 'comp..."
7,com.zoho.chat,initially able join meet able join meet app cr...,"{'neg': 0.209, 'neu': 0.511, 'pos': 0.281, 'co..."
8,com.zoho.chat,cool app please add break option countdown ava...,"{'neg': 0.0, 'neu': 0.559, 'pos': 0.441, 'comp..."
9,com.zoho.chat,better whatsapp reliable nice look feel could ...,"{'neg': 0.0, 'neu': 0.749, 'pos': 0.251, 'comp..."


### V. How does the sentiments retrieved by Textblob and Vader compare with each other? How do you interpret the similarity/difference? Which one is the best option for review analysis of your apps Why?

Textblob is typically best used with long, more formal pieces of data such as articles and journals

Vader is typically best used with short, more informal pieces of data that tend to be noisier, such as tweets and reviews

Textblob's output consists of a polarity score between [-1,1] and a subjectivity score between [0,1]

Vader's output consists of a positive, negative, and neutral sentiment score as well as a compound score, which is the overall score

I was able to observe these differences with the sentiments retrieved in this project

Although Vader is known for being the preffered sentiment analysis tool for informal data, I preferred using textblob because of the subjectivity score. Clustering reviews based on subjectivity can be valuable for 2 main reasons:

1) High subjectivity can help to highlight the more "passionate" reviews. These usually consist of some sort of "extreme" emotion. e.g very happy, very angry, very dissapointed. This data could help me identify what the app is doing right and what the app is doing wrong. It can also help bring bugs to my attention. This is beneficial because it can help in the prioritization process
2) Low subjectivity can help to highlight the more objective reviews. Whether the reviewer has left a negative or positive review, these reviews are more likely to be credible and offer valuable insight into the app

For a review analysis, I would chose Textblob to inspect reviews that are on both extreme ends of the subjectivity score. In other words, i would analyse the reviews that were very subjective and very objective. I would do so for the reasons i listed above.








# TASK 4: TOPIC MODELING Using LDA 

### I. Among your apps (the SUD + competitor+ similar apps), choose the one with the highest number of reviews.
I chose the Slack application because it has the highest number of reviews

### II. Extract minimum of 10 LDA topics, each being described by at least 7 words 

In [501]:
texts = df_slack['Review']
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

num_topics = 10
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10, workers=2)

for idx, topic in lda_model.print_topics(-1):
    words = [word.split('*')[1].strip().strip('"') for word in topic.split(' + ')][:7]
    print(f"Topic {idx+1}: {' '.join(words)}")

Topic 1: app message notification notifications messages get dont
Topic 2: good app slack battery get service release
Topic 3: new like app slack mobile phone dont
Topic 4: nice app work excellent perfect tool great
Topic 5: app cant slack phone email use call
Topic 6: great app best ever user works bad
Topic 7: slack please app 5 mode stars add
Topic 8: slow doesnt upload working cant cool app
Topic 9: app great team love use slack awesome
Topic 10: update version app latest android crashes amazing


### III. How do these review topics (i.e. summarized user feedback) compare to the software feature clusters you extracted in Assignment 1?
In Assignment 1, i identified 3 clusters:

#### 1) Cluster 1: Communication and Team Collaboration
Cluster 1 represents the core functionality of my app. Therefore, it is not surprising to see that the majority of topics generated are aligned with cluster 1. The topics aligned with cluster 1 are: Topic 1, Topic 3, Topic 4, Topic 5, Topic 9
    



#### 2) Cluster 2: Accessibility and Availability
Although cluster 2 was not identified as part of the core functionality, some of the topics were still aligned with it as well. The topics that aligned with cluster 2 are: Topic 2, Topic 8, Topic 10
    


#### 3) Cluster 3: Privacy and Security
No topics were relevant to cluster 3. No alignemnt

On top of this, i was able to identify atleast 1 more cluster based on the topics generated, which is "bugs and unexpected behavior". The topics that align with this new cluster are: Topic 8, Topic 10

### IV. How do you explain the differences in terms of accuracy of the model and the size of data?

Large data sets are beneficial because they offer a more representative sample. In this part of the project, I didn't use very large data sets so this compromised both, the sample as well as the result. However, having *too large* of a data set can also be a problem. Extremely large data sets can lead to topics that are too vague and random, which lowers the accuracy

### V. Does any of the topics relate to the core functionalities you identified? Why?

Yes. As previously mentioned, the majority of topics related to cluster 1, which represented my core functionality.
This makes sense because Slack and Wire are similar apps, meaning they share a similar core functionality.

Furthermore, cluster 1 relates to communication and team collaboration. 50% of the topics generated included words that relate to communication and team collaboration. Examples of these words: "message", "notification", "email", "call" and "team"

# TASK 5: RECOMMENDATION

### I. Recommend two new features or enhancements (could be bug reports) for the app based on your above analysis that have not been done in the project

1) Work on making the app run faster
2) Work on bugs to prevent crashing

### II. How do you compare these recommendations, with the recommendations you provided based on persona analysis in Assignment 2? Explain similarities and differences

Surprisingly, although Slack and Wire are similar, they are different apps and I would expect them to have a different set of problems. However, this was not the case. Although my persona was made from Wire, you could easily pass them as Slack users. Their concerns/ experiences seem to be the similar.

My first persona, Sahana, complained that the app was very slow and crashed a lot

My second persona, Stephanie, complained about unresolved bugs that ruined her experience

According to the topic modelling, this is consistent with the feedback that Slack has gotten as well.


But, as i revise the personas and the recommendations I made, there are some differences. In assignment 2, I recommended that Wire revise their core functionality to include cluster 3 (privacy and security). However, my top 10 topics from topic modelling Slack reviews did not include any concerns about privacy and security. It seems that Wire users are facing issues with privacy and security on the app, while Slack users do not have that problem. Instead, Slack users are primarily concerned with Slack being too slow and buggy