# Google Play Store Exploratory Analysis

Dataset available on https://www.kaggle.com/lava18/google-play-store-apps

In [1]:
import pandas as pd

# Loading and checking datasets

In [2]:
df_apps = pd.read_csv('datasets/googleplaystore.csv')
df_apps.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [3]:
df_apps.shape

(10841, 13)

In [4]:
df_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
df_apps.isna().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [6]:
df_reviews = pd.read_csv('datasets/googleplaystore_user_reviews.csv')
df_reviews.head(3)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,


In [7]:
df_reviews.shape

(64295, 5)

In [8]:
df_reviews.isna().sum()

App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

# Data Cleaning

## Deleting irrelevant data

In [9]:
df_apps.drop(['Last Updated', 'Current Ver'], axis=1, inplace=True)
df_apps.head(1)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,4.0.3 and up


## Dealing with missing information

### Type

* There's only 1 App without 'Type', so we'll be fixing this manually

In [10]:
df_apps[df_apps['Type'].isna()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Android Ver
9148,Command & Conquer: Rivals,FAMILY,,0,Varies with device,0,,0,Everyone 10+,Strategy,Varies with device


In [11]:
# Type according to Google Play Store (23/03/21)
type_column = df_apps.columns.get_loc('Type')
df_apps.iloc[9148, type_column] = 'Free'

In [12]:
df_apps['Type'].value_counts()

Free    10040
Paid      800
0           1
Name: Type, dtype: int64

* There's 1 App with 'Type' = 0. Let's take a look at it:

In [13]:
df_apps[df_apps['Type'] == '0']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",


* This row values have been misplaced. We'll be fixing it:

In [14]:
df_apps.iloc[10472] = df_apps.iloc[10472].replace(['1.9', 19.0, '3.0M', '1,000+', 'Free', '0', 'Everyone', 'February 11, 2018'], ['LIFESTYLE', 1.9, '19.0', '3.0M', '1,000+', 'Free', '0', 'Lifestyle'])

version_column = df_apps.columns.get_loc('Android Ver')
df_apps.iloc[10472, version_column] = '4.0 and up'

content_rating_column = df_apps.columns.get_loc('Content Rating')
df_apps.iloc[10472, content_rating_column] = 'Everyone'

In [15]:
df_apps['Type'].value_counts()

Free    10041
Paid      800
Name: Type, dtype: int64

### Android Ver

* There are 2 Apps without 'Android Ver'. We'll be filling them with the most common Android Version

In [26]:
most_common_android = df_apps['Android Ver'].value_counts().idxmax()
df_apps['Android Ver'].fillna(most_common_android, inplace=True)

### Rating

* There are a lot of 'NaN' values in this column. We'll be filling them with the mean of all other ratings.

In [35]:
ratings_mean = df_apps['Rating'].mean()
df_apps['Rating'].fillna(ratings_mean, inplace=True)

In [36]:
df_apps.isna().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Android Ver       0
dtype: int64

### Reviews

* There are a lot o 'NaN' values in the review dataset. We'll be dropping them.

In [46]:
df_reviews.dropna(inplace=True)

In [48]:
df_reviews.isna().sum()

App                       0
Translated_Review         0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

In [54]:
df_apps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.100000,159,19M,"10,000+",Free,0,Everyone,Art & Design,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.900000,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.700000,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.500000,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.300000,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.500000,38,53M,"5,000+",Free,0,Everyone,Education,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.000000,4,3.6M,100+,Free,0,Everyone,Education,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,4.191513,3,9.5M,"1,000+",Free,0,Everyone,Medical,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.500000,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,Varies with device


In [53]:
df_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          10841 non-null  float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10841 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
 9   Genres          10841 non-null  object 
 10  Android Ver     10841 non-null  object 
dtypes: float64(1), object(10)
memory usage: 931.8+ KB


In [None]:
## Change data type to numeric on 'Reviews', 'Size' and 'Installs'

## Renaming Columns