# Title: Google Play App Store Sentiment Analysis of Customer Reviews

### Data Source
- Kaggle: https://www.kaggle.com/lava18/google-play-store-apps

### Abstract
Each word in the lexicon has scores for:
- polarity: negative vs. positive    (-1.0 => +1.0)
- subjectivity: objective vs. subjective (+0.0 => +1.0)

### Import Libraries

In [41]:
## Import Libraries
import pandas as pd
import numpy as np
import regex as re
import spacy
import en_core_web_sm
from string import punctuation
import category_encoders as ce

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

import warnings
warnings.filterwarnings('ignore')

### Load data

In [42]:
file = 'DATA/googleplaystore_user_reviews.csv'
df = pd.read_csv(file)

### Data wrangling

In [43]:
df.shape

(64295, 5)

In [44]:
df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [45]:
df.App.value_counts()

CBS Sports App - Scores, News, Stats & Watch Live    320
Angry Birds Classic                                  320
Bowmasters                                           320
8 Ball Pool                                          300
Helix Jump                                           300
                                                    ... 
Detector de Radares Gratis                            31
Dresses Ideas & Fashions +3000                        31
Easy Healthy Recipes                                  31
Easy Hair Style Design                                30
Drawing Clothes Fashion Ideas                         30
Name: App, Length: 1074, dtype: int64

- the dataset represents 1074 Google Play Mobile Apps with Reviews ranging from 30 - 320 per app

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


In [47]:
df.isnull().sum()

App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

In [48]:
# drop rows with null or nan values
df = df.dropna()

In [49]:
# change names of columns
df.columns = ['app', 'review', 'sentiment', 'polarity', 'subjectivity']

In [50]:
df.head()

Unnamed: 0,app,review,sentiment,polarity,subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3


In [51]:
# Remove app column
df.drop(['app', 'subjectivity'], axis='columns', inplace=True)

In [52]:
df

Unnamed: 0,review,sentiment,polarity
0,I like eat delicious food. That's I'm cooking ...,Positive,1.000000
1,This help eating healthy exercise regular basis,Positive,0.250000
3,Works great especially going grocery store,Positive,0.400000
4,Best idea us,Positive,1.000000
5,Best way,Positive,1.000000
...,...,...,...
64222,Most ads older many agents ..not much owner po...,Positive,0.173333
64223,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000
64226,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500
64227,I property business got link SMS happy perform...,Positive,0.800000


### Staging for text cleaning

In [53]:
nlp = en_core_web_sm.load()

### Prepare the text

This stage involves all the text handling and preparation concerned with the changes and modifications from the raw source text to a format that will be used for the actual processing, things like:
- handle encoding
- handle extraneous and international charaters
- handle symbols
- handle metadata and embeded information
- handle repetitions (such multiple spaces or newlines)

In [54]:
def clean_text(text):
    # reduce multiple spaces and newlines to only one
    text = re.sub(r'(\s\s+|\n\n+)', r'\1', text)
    # remove double quotes
    text = re.sub(r'"', '', text)
    return text

In [55]:
%%time
text_cleaned = []
for text in df.review:
    text_cleaned.append(clean_text(text))

Wall time: 493 ms


In [56]:
df.review = text_cleaned

### Work the text
Concern with the meaning and the substance of the content to extract actual information.

In [57]:
def convert_text(text):
    '''
    Remove StopWords, and Lemmatize etc.
    '''
    # remove punctuation
    text = text.translate(str.maketrans('', '', punctuation))
    # remove anything that is not alphanumeric or underscore
    doc = nlp(text)
    converted_text = " ".join([token.lemma_ for token in doc if not token.is_stop])
    return converted_text

In [None]:
%%time
df['short'] = df.review.apply(convert_text)

In [19]:
df.sample(10)

Unnamed: 0,review,sentiment,polarity,short
63337,"It let game :/ phone enough space brand new, S...",Positive,0.195455,let game phone space brand new Samsung s9 lo...
1771,"love app! easy straight forward, tailor fit sc...",Positive,0.539167,love app easy straight forward tailor fit sche...
62063,I gave star heres why. I played game forever i...,Positive,0.183189,give star here play game forever invest bit mo...
9422,This useless.. launched loads forever...when d...,Negative,-0.5,useless launch load foreverwhen loading tell stop
52827,"Game good,but days giving many network problem...",Negative,-0.104167,game goodbut day give network problem friend l...
61757,Very bright. Has come handy several times!,Positive,0.503333,bright come handy time
19502,"It's good, works. 50% time bugs show everythin...",Positive,0.358333,good work 50 time bug go MLB play play get job
43501,We live near grandparents aunts uncles wonderf...,Positive,0.412121,live near grandparent aunt uncle wonderful way...
56232,Really like limited options adjusting page siz...,Positive,0.125893,like limited option adjust page size non user ...
39095,Works Really good,Positive,0.7,work good


### Perform basic EDA

In [20]:
# this shows the aggregate distinct values of the target variable can be multinomial
df.sentiment.value_counts()

Positive    23998
Negative     8271
Neutral      5158
Name: sentiment, dtype: int64

- Fortunately the dataset is large or there won't be enough training examples of all categories to fit the model

In [38]:
df_neg = df.loc[(df.sentiment == 'Negative')]

In [36]:
text_neg = ''
for text in df_neg.review:
    text_neg += text + ' '

In [37]:
text_neg

''

In [35]:
cloud_neg = WordCloud().generate(str(text_neg))

ValueError: We need at least 1 word to plot a word cloud, got 0.

In [None]:
plt.figure(figsize=(16,8))
plt.imshow(cloud_neg, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
df_neut = df.loc[(df.sentiment == 'Neutral')]

In [None]:
text_neut = ''
for text in df_neut.review:
    text_neut += text + ' '

In [None]:
cloud_neut = WordCloud().generate(str(text_neut))

In [None]:
plt.figure(figsize=(16,8))
plt.imshow(cloud_neut, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
df_pos = df.loc[(df.sentiment == 'Positive')]

In [None]:
%%time
text_pos = ''
for text in df_pos.review:
    text_pos += text + ' '

In [None]:
cloud_pos = WordCloud().generate(str(text_pos))

In [None]:
plt.figure(figsize=(16,8))
plt.imshow(cloud_pos, interpolation='bilinear')
plt.axis("off")
plt.show()

In [21]:
df.dtypes

review        object
sentiment     object
polarity     float64
short         object
dtype: object

#### Map code to sentiment

Should we keep the neutral class in our data set since it only is relevent when polarity = 0?

From Wikipedia: "Even though in most statistical classification methods, the neutral class is ignored under the assumption that neutral texts lie near the boundary of the binary classifier, several researchers suggest that, as in every polarity problem, three categories must be identified. Moreover, it can be proven that specific classifiers such as the Max Entropy and SVMs can benefit from the introduction of a neutral class and improve the overall accuracy of the classification."

Source: https://en.wikipedia.org/wiki/Sentiment_analysis

In [22]:
codes = {'Negative':-1, 'Neutral':0, 'Positive':1}

In [23]:
df['sentiment'] = df.sentiment.map(codes)

In [24]:
df.dtypes

review        object
sentiment      int64
polarity     float64
short         object
dtype: object

#### Describe distribution

In [40]:
df.describe()

Unnamed: 0,sentiment,polarity
count,37427.0,37427.0
mean,0.420205,0.182171
std,0.828029,0.351318
min,-1.0,-1.0
25%,0.0,0.0
50%,1.0,0.15
75%,1.0,0.4
max,1.0,1.0


- Polarity is simply the scalar version of sentiment, and hence should only be included in the model to estimate a regressor and of a classifier
- The collinearity between sentiment and polarity is further reinforced by the positive skewness of the boxplots, where outliers towards the negative side of polarity are many.
- Subjectivity denotes the objectivity and subjectivity of text respectively
- Dataset need not be scaled as original values should be preserved to represent the nominal scales

### Selecting features and splitting data

In [None]:
# Features and Labels
X = df['short']
y = df['sentiment'] 

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [34]:
# create classifiers using GridSearchCV
params_log = {'penalty': ['l1','l2'], 
             'C': np.logspace(-4, 4, 50)}
clf_log = GridSearchCV(LogisticRegression(random_state=42, multi_class='multinomial'), params_log, cv=10)

params_SVC = {'C': [0.1,1, 10, 100], 
             'gamma': [1,0.1,0.01,0.001],
             'kernel': ['rbf', 'poly', 'sigmoid']}
clf_SVC = GridSearchCV(SVC(probability=True, random_state=42), params_SVC, cv=10)

params_SGD = {'penalty': ['l1','l2', 'elasticnet'], 
             'alpha': ['0.0001','0.001','0.01','0.1','1.0']}
clf_SGD = GridSearchCV(SGDClassifier(random_state=42), params_SGD, cv=10)

params_NB = {'alpha': ['0.0001','0.001','0.01','0.1', '1.0']}
clf_NB = GridSearchCV(MultinomialNB(), params_NB)

params_KNN = {'metric': ['euclidean', 'manhattan'],
 'n_neighbors': np.arange(1, 16)}
clf_KNN = GridSearchCV(KNeighborsClassifier(), params_KNN, cv=10)

params_dt = {'criterion': ['entropy', 'gini'], 'max_depth': np.arange(3, 15),
            'min_samples_leaf': np.arange(3, 15)}
clf_dt = GridSearchCV(DecisionTreeClassifier(), params_dt, cv=10)

In [None]:
# create classifiers using RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

param_rf = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

clf_rf = RandomizedSearchCV(RandomForestClassifier(), 
                            param_distributions = param_rf, 
                            n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)

### Feature Engineering

#### Bag-of-words

In [None]:
# create a matrix of word counts from the text
counts = CountVectorizer()

In [None]:
# do the actual counting
A = counts.fit_transform(X_train, y_train)

In [None]:
A.shape

In [None]:
# fit logistic regression
clf_log.fit(A.toarray(), y_train)