In [191]:
import numpy as np
import pandas as pd


# Load the dataset

In [153]:
df = pd.read_excel('CIBC_App_Reviews.xlsx')
pd.options.display.max_columns = None

# Explore the dataset

In [164]:
# take a quick look at the data
df.head(10)

Unnamed: 0,App Name,App Store,App,Store,App ID,Review ID,Country,Version,Rating,Date,Author,Subject,Body,Translated Subject,Translated Body,Sentiment,Device,Language,OS Version,Reply URL,Topics,Custom Topics,Tags
0,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3902085253,Canada,10.10.3,1,2019-03-20,rbc for the win,Switch banks,This app is as crappy as the bank that made,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,[],"[""Banking App""]","[""Dissatisfied users""]"
1,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3901178898,Canada,10.10.3,2,2019-03-19,Leaving Wattpad,Credit Score Maintenance,The credit score calculator has been down for ...,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,[],"[""Banking App"", ""Credit Card""]","[""Credit Score"", ""Dissatisfied users""]"
2,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3893450681,Canada,10.10.3,3,2019-03-17,CanadianBatman,Good,I stopped banking with CIBC due to another ban...,,,positive,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Performance"", ""Pricing""]","[""Banking App""]","[""Satisfied users""]"
3,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3882357860,China,10.10.3,1,2019-03-15,孙三皮,怎么切换语言,怎么是法语？,How to switch languages,How is French?,negative,,Chinese (Simplified),,https://app.appbot.co/apps/30102-cibc-mobile-b...,[],[],[]
4,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3885434139,Canada,10.10.3,1,2019-03-15,allthenicknameseveraretaken,Not good at all.,I have to change my password every time I try ...,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Frequency"", ""Security & Accounts"", ""Sign Up ...",[],"[""Dissatisfied users"", ""Password issue""]"
5,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3881274434,Canada,10.10.3,1,2019-03-14,Binketh,Log out,Can’t log out. Or is very cumbersome to log ou...,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Security & Accounts""]",[],"[""Dissatisfied users"", ""Logout Issue""]"
6,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3865788153,Canada,10.10.3,5,2019-03-10,Leo 990@8888,Cash back on app,I love the app. I never faced any issues so fa...,,,positive,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Bugs"", ""Feature Requests"", ""Satisfied users""]","[""Banking App""]","[""Cash Back"", ""Satisfied users""]"
7,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3866472114,Canada,10.10.3,1,2019-03-10,Jolena Katusha,would give 0 stars,The absolute worst banking experience i’ve eve...,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Bugs"", ""Design & UX"", ""Dissatisfied users"", ...","[""Fingerprint Issues"", ""Fingerprint"", ""Banking...","[""Dissatisfied users"", ""Password issue""]"
8,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3853019071,Canada,10.10.3,1,2019-03-07,Big Bird Below The Belt,Ve r y S l o w now😴,"Very slow since last review, shield with the c...",,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Dissatisfied users"", ""Performance"", ""Pricing""]","[""E-Transfer "", ""E-Transfer"", ""Request for mon...","[""Feb 19 Update""]"
9,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3847909796,Canada,10.10.3,1,2019-03-06,Kiikikikikokik,theres always problems!,it frequently wont allow me to transfer money ...,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Complexity"", ""Design & UX"", ""Frequency"", ""Pa...","[""Request for money"", ""Credit Card""]","[""Dissatisfied users"", ""March 1 Outage Issue""]"


** Some Observations**
<br>
1) It seems both Subject and Body have texts, we would need to combine them for analysis.
<br>
2) We would need to remove emojis.
<br>
3) Not all reviews are in English so we could either translate them into English or remove them for now.
<br>
Continue our exploration...

** Quick Stats**
<br>
Get some quick stats, for example, count of ratings per rating score, rating averages, and ratings by language.

In [166]:
print('--Check total records and columns--\n')
shape = df.shape
print(shape)
print('\nThe dataset has {} rows and {} columns.\n'.format(shape[0],shape[1]))

print('--Check the total # of empty values for each variable--\n')

print(df.isnull().sum())

--Check total records and columns--

(7738, 23)

The dataset has 7738 rows and 23 columns.

--Check the total # of empty values for each variable--

App Name                 0
App Store                0
App                      0
Store                    0
App ID                   0
Review ID              390
Country                  0
Version               4646
Rating                   0
Date                     0
Author                 340
Subject               2219
Body                    29
Translated Subject    7681
Translated Body       7552
Sentiment                0
Device                7738
Language                 7
OS Version            7738
Reply URL                0
Topics                   0
Custom Topics            0
Tags                     0
dtype: int64


Subject and Body have 2,219 and 29 empty values respectively. We would need to combine them and see if we still see empty values for the combined field.
<br>


In [186]:
print('--Get the rating count per rating--\n')
rating_count = df.groupby(['Rating'])['Rating'].count()\
                                            .reset_index(name='Count')
rating_count['Percent'] = rating_count.Count/rating_count.Count.sum()
print(rating_count.sort_values('Percent',ascending=False))

print('\n--Get the rating count per language--\n')
lan_count = df.groupby(['Language']).agg({'Language' : 'count', 'Rating' : 'mean'})\
                                    .rename(columns={'Language':'Total_Reviews','Rating' : 'Rating_Avg'}).reset_index()
lan_count['Percent'] = lan_count.Total_Reviews/lan_count.Total_Reviews.sum()
print(lan_count.sort_values('Percent',ascending=False))



--Get the rating count per rating--

   Rating  Count   Percent
4       5   2581  0.333549
0       1   2442  0.315585
3       4   1046  0.135177
2       3    862  0.111398
1       2    807  0.104291

--Get the rating count per language--

                 Language  Total_Reviews  Rating_Avg   Percent
3                 English           7400    3.049865  0.957185
4                  French            293    3.399317  0.037899
1    Chinese (Simplified)             15    2.266667  0.001940
0                  Arabic              7    4.142857  0.000905
12                Spanish              4    4.000000  0.000517
7                  Korean              3    4.333333  0.000388
2   Chinese (Traditional)              2    4.000000  0.000259
5              Indonesian              2    5.000000  0.000259
6                 Italian              1    5.000000  0.000129
8                   Latin              1    4.000000  0.000129
9                   Malay              1    5.000000  0.000129
10   

1) 1/3 of reviews are either 5 or 1. Close to half of (46%) reviews are good (score 4 or 5) and approximately 40% of reviews are bad (score 1 or 2). It seems users who left comment either love the app or hate it.
<br>
2) Most of reviews (96%) are English reviews. 7400 is good enough for the analysis. I will discard reviews in other languages for now.
<br>
3) People who leave English reviews think the app is okay giving a rate of 3.05 on average.

# Data Cleansing

Here are the few steps before we can train model on this dataset.
<br>
1) Keep English reviews only.
<br>
2) Select the columns we need.
<br>
3) Remove punctuations, emojis.
<br>
4) Convert all the words into lowercase.

In [306]:
print('--1) Keep only English reviews--\n')
Eng_reviews = df[df['Language'] =='English']
print('Now the total of reviews becomes {}.\n'.format(len(Eng_reviews)))
Eng_reviews.head()

--1) Keep only English reviews--

Now the total of reviews becomes 7400.



Unnamed: 0,App Name,App Store,App,Store,App ID,Review ID,Country,Version,Rating,Date,Author,Subject,Body,Translated Subject,Translated Body,Sentiment,Device,Language,OS Version,Reply URL,Topics,Custom Topics,Tags
0,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3902085253,Canada,10.10.3,1,2019-03-20,rbc for the win,Switch banks,This app is as crappy as the bank that made,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,[],"[""Banking App""]","[""Dissatisfied users""]"
1,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3901178898,Canada,10.10.3,2,2019-03-19,Leaving Wattpad,Credit Score Maintenance,The credit score calculator has been down for ...,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,[],"[""Banking App"", ""Credit Card""]","[""Credit Score"", ""Dissatisfied users""]"
2,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3893450681,Canada,10.10.3,3,2019-03-17,CanadianBatman,Good,I stopped banking with CIBC due to another ban...,,,positive,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Performance"", ""Pricing""]","[""Banking App""]","[""Satisfied users""]"
4,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3885434139,Canada,10.10.3,1,2019-03-15,allthenicknameseveraretaken,Not good at all.,I have to change my password every time I try ...,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Frequency"", ""Security & Accounts"", ""Sign Up ...",[],"[""Dissatisfied users"", ""Password issue""]"
5,CIBC Mobile Banking,iOS,CIBC Mobile Banking,iOS,351448953,3881274434,Canada,10.10.3,1,2019-03-14,Binketh,Log out,Can’t log out. Or is very cumbersome to log ou...,,,negative,,English,,https://app.appbot.co/apps/30102-cibc-mobile-b...,"[""Security & Accounts""]",[],"[""Dissatisfied users"", ""Logout Issue""]"


In [307]:
print('--2) Select the columns for analysis--\n')
Eng_reviews = Eng_reviews[['Review ID','App Store','Version','Rating','Subject','Body','Sentiment']]
Eng_reviews.head(10)


--2) Select the columns for analysis--



Unnamed: 0,Review ID,App Store,Version,Rating,Subject,Body,Sentiment
0,3902085253,iOS,10.10.3,1,Switch banks,This app is as crappy as the bank that made,negative
1,3901178898,iOS,10.10.3,2,Credit Score Maintenance,The credit score calculator has been down for ...,negative
2,3893450681,iOS,10.10.3,3,Good,I stopped banking with CIBC due to another ban...,positive
4,3885434139,iOS,10.10.3,1,Not good at all.,I have to change my password every time I try ...,negative
5,3881274434,iOS,10.10.3,1,Log out,Can’t log out. Or is very cumbersome to log ou...,negative
6,3865788153,iOS,10.10.3,5,Cash back on app,I love the app. I never faced any issues so fa...,positive
7,3866472114,iOS,10.10.3,1,would give 0 stars,The absolute worst banking experience i’ve eve...,negative
8,3853019071,iOS,10.10.3,1,Ve r y S l o w now😴,"Very slow since last review, shield with the c...",negative
9,3847909796,iOS,10.10.3,1,theres always problems!,it frequently wont allow me to transfer money ...,negative
10,3849972209,iOS,10.10.3,2,Sign in with Face ID?,I’d like to have the option to sign into the a...,negative


In [308]:
print('--Merge Subject and Body into one column named Review--')
Eng_reviews['Review'] = Eng_reviews['Subject'].map(str) + ' ' + Eng_reviews["Body"].map(str)
Eng_reviews.head(10)


--Merge Subject and Body into one column named Review--


Unnamed: 0,Review ID,App Store,Version,Rating,Subject,Body,Sentiment,Review
0,3902085253,iOS,10.10.3,1,Switch banks,This app is as crappy as the bank that made,negative,Switch banks This app is as crappy as the bank...
1,3901178898,iOS,10.10.3,2,Credit Score Maintenance,The credit score calculator has been down for ...,negative,Credit Score Maintenance The credit score calc...
2,3893450681,iOS,10.10.3,3,Good,I stopped banking with CIBC due to another ban...,positive,Good I stopped banking with CIBC due to anothe...
4,3885434139,iOS,10.10.3,1,Not good at all.,I have to change my password every time I try ...,negative,Not good at all. I have to change my password ...
5,3881274434,iOS,10.10.3,1,Log out,Can’t log out. Or is very cumbersome to log ou...,negative,Log out Can’t log out. Or is very cumbersome t...
6,3865788153,iOS,10.10.3,5,Cash back on app,I love the app. I never faced any issues so fa...,positive,Cash back on app I love the app. I never faced...
7,3866472114,iOS,10.10.3,1,would give 0 stars,The absolute worst banking experience i’ve eve...,negative,would give 0 stars The absolute worst banking ...
8,3853019071,iOS,10.10.3,1,Ve r y S l o w now😴,"Very slow since last review, shield with the c...",negative,Ve r y S l o w now😴 Very slow...
9,3847909796,iOS,10.10.3,1,theres always problems!,it frequently wont allow me to transfer money ...,negative,theres always problems! it frequently wont all...
10,3849972209,iOS,10.10.3,2,Sign in with Face ID?,I’d like to have the option to sign into the a...,negative,Sign in with Face ID? I’d like to have the opt...


In [309]:
# Now let's check if Review has any empty value.
print('\n--Review has {} empty value(s).--\n'.format(Eng_reviews['Review'].isnull().sum()))


--Review has 0 empty value(s).--



In [320]:
print('--3) Remove punctuations, emojis--\n')

Eng_reviews['Review']=[s.encode('ascii', 'ignore').decode('ascii')\
                       for s in Eng_reviews['Review']]

print('--4) Normalize the words--\n')
Eng_reviews['Review']=Eng_reviews['Review'].str.lower()


print('--5) Remove punctuations--\n')
Eng_reviews['Review']=Eng_reviews['Review'].str.replace('[^A-Za-z ]+','')
#Eng_reviews['Review']=re.sub('[^A-Za-z ]+','',Eng_reviews['Review'])
Eng_reviews['Review'].head(10)


--3) Remove punctuations, emojis--

--4) Normalize the words--

--5) Remove punctuations--



0     switch banks this app is as crappy as the bank...
1     credit score maintenance the credit score calc...
2     good i stopped banking with cibc due to anothe...
4     not good at all i have to change my password e...
5     log out cant log out or is very cumbersome to ...
6     cash back on app i love the app i never faced ...
7     would give  stars the absolute worst banking e...
8     ve r    y    s  l   o      w    now very slow ...
9     theres always problems it frequently wont allo...
10    sign in with face id id like to have the optio...
Name: Review, dtype: object

In [323]:
print('-- Binary encode the sentiment--\n')
Eng_reviews['Sentiment_Encoded'] = [1 if x =='positive' else 0 for x in Eng_reviews.Sentiment]
Eng_reviews.head()

-- Binary encode the sentiment--



Unnamed: 0,Review ID,App Store,Version,Rating,Subject,Body,Sentiment,Review,Sentiment_encoded,Sentiment_Encoded
0,3902085253,iOS,10.10.3,1,Switch banks,This app is as crappy as the bank that made,negative,switch banks this app is as crappy as the bank...,0,0
1,3901178898,iOS,10.10.3,2,Credit Score Maintenance,The credit score calculator has been down for ...,negative,credit score maintenance the credit score calc...,0,0
2,3893450681,iOS,10.10.3,3,Good,I stopped banking with CIBC due to another ban...,positive,good i stopped banking with cibc due to anothe...,1,1
4,3885434139,iOS,10.10.3,1,Not good at all.,I have to change my password every time I try ...,negative,not good at all i have to change my password e...,0,0
5,3881274434,iOS,10.10.3,1,Log out,Can’t log out. Or is very cumbersome to log ou...,negative,log out cant log out or is very cumbersome to ...,0,0


## Data Preprocessing

Now convert the Reviews into the X array and Sentiment into the y array accordingly.

In [350]:
from sklearn.model_selection import train_test_split
X, y =(Eng_reviews['Review'].values, Eng_reviews['Sentiment_Encoded'].values)
X_train_org, X_test_org, y_train_org, y_test_org = train_test_split(X, y, test_size = 0.25, random_state = 1)
X_test_org

array(['great app and great bank fantastic app and to everyone that complains about the e transfer or system issues its going to happen unfortunately they normally resolve it pretty fast',
       'love it love love love it does exactly what it says it will do  ',
       'doesnt work on rooted phones just because i want to root my phone and have control over what i have on my phone and what i dont have on my phone doesnt mean i shouldnt be able to run your app i will be deleting it forthwith i was even going to get one of your credit cards to use the app but i wont be now',
       ...,
       'app is not responsive version  crashes as soon as you sign init is frustrating cant use mobile version of site either because the tabs are not responsive what is going on cibc why the issue persists for so long pls fix',
       'nan last update to edeposits sucks big time thank gods i have a second device that hadnt updated yet',
       'nan this app would be much better if it had the capability o

In [352]:
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

tk = Tokenizer(lower = True)
tk.fit_on_texts(X)
X_train_seq = tk.texts_to_sequences(X_train_org)
X_train_pad = pad_sequences(X_train_seq, maxlen=100, padding='post')

X_test_seq = tk.texts_to_sequences(X_test_org)
X_test_pad = pad_sequences(X_test_seq, maxlen=100, padding='post')


In [357]:
batch_size = 64
X_train1 = X_train_pad[batch_size:]
y_train1 = y_train_org[batch_size:]
X_valid = X_train_pad[:batch_size]
y_valid = y_train_org[:batch_size]

In [358]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
vocabulary_size = len(tk.word_counts.keys())+1
max_words = 100
embedding_size = 32
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(200))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [359]:
model.fit(X_train1, y_train1, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=10)

Train on 5486 samples, validate on 64 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1cba3bb1f60>

In [361]:
scores = model.evaluate(X_test_pad, y_test, verbose=0)
print('Test Accuracy:',scores[1])

Test Accuracy: 0.8416216213638718


In [362]:
y_predict = model.predict(X_test_pad)
y_predict

array([[0.8685466 ],
       [0.86854994],
       [0.16814384],
       ...,
       [0.16812989],
       [0.16812894],
       [0.16812901]], dtype=float32)

In [395]:
X_test_org=X_test_org.reshape(1850,1)
print(X_test_org.shape)
print(y_predict.shape)
final=np.concatenate([X_test_org,y_predict],axis=1)
df=pd.DataFrame(data=final, columns=['Review','Sentiment_Encoded'])


(1850, 1)
(1850, 1)


Unnamed: 0,Review,Sentiment
0,great app and great bank fantastic app and to ...,0.868547
1,love it love love love it does exactly what it...,0.86855
2,doesnt work on rooted phones just because i wa...,0.168144
3,touch id my touch id is not responding on the ...,0.168129
4,very convenient i find this app extremely conv...,0.168129
5,nan was five star the so called latest update ...,0.168129
6,nan easy to use,0.868549
7,wonderful awesomeness,0.868549
8,crashes it crashesplease fix,0.168129
9,works sorta updated never clicking find us pre...,0.130488
