# Google App Store EDA

Name: Hashir Bhatti\
Email: bhattihashir26@gmail.com

In [168]:
# Import the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ydata_profiling as yd

In [169]:
df = pd.read_csv("./data/googleplaystore.csv")
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [170]:
df.shape

(10841, 13)

In [171]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
 9   Genres          10840 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10839 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.1+ MB


In [172]:
df.isnull().sum()

App                  0
Category             1
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       0
Genres               1
Last Updated         0
Current Ver          8
Android Ver          2
dtype: int64

In [173]:
df.describe()

Unnamed: 0,Rating,Reviews
count,9367.0,10841.0
mean,4.191513,444111.9
std,0.515735,2927629.0
min,1.0,0.0
25%,4.0,38.0
50%,4.3,2094.0
75%,4.5,54768.0
max,5.0,78158310.0


In [174]:
# # Create a ProfileReport without specifying a configuration file
# profile = yd.ProfileReport(df)

# # Generate the report and save it to an HTML file
# profile.to_file("outputs/ydata_googleplaystore.html")

In [175]:
df.sample(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4789,X Home Bar - Home Bar Gesture Pro,TOOLS,4.0,88,2.0M,"10,000+",Paid,$1.99,Everyone,Tools,"July 2, 2018",1.5,4.3 and up
6244,B y H Niños ES,BOOKS_AND_REFERENCE,4.6,53,16M,"5,000+",Free,0,Everyone,Books & Reference,"September 22, 2015",1.0.2,2.3 and up
6797,BT Panorama,FINANCE,4.3,34,Varies with device,"5,000+",Free,0,Everyone,Finance,"June 14, 2017",Varies with device,Varies with device
3251,Samsung Max - Data Savings & Privacy Protection,TOOLS,4.3,330468,Varies with device,"10,000,000+",Free,0,Everyone,Tools,"May 29, 2018",Varies with device,Varies with device
917,Nick,ENTERTAINMENT,4.2,123279,25M,"10,000,000+",Free,0,Everyone 10+,Entertainment;Music & Video,"January 24, 2018",2.0.8,4.4 and up
7849,Grenade Practice for CS:GO,FAMILY,4.4,431,17M,"10,000+",Free,0,Everyone,Entertainment,"June 13, 2016",1.1.1,4.0 and up
5875,A-Z App Store,PRODUCTIVITY,3.2,2728,4.4M,"500,000+",Free,0,Everyone,Productivity,"October 5, 2016",1.0.5,4.1 and up
5208,ai.Bot Box,PRODUCTIVITY,,0,Varies with device,"10,000+",Free,0,Everyone,Productivity,"March 1, 2017",Varies with device,Varies with device
6373,VK,SOCIAL,3.8,5793284,Varies with device,"100,000,000+",Free,0,Mature 17+,Social,"August 3, 2018",Varies with device,Varies with device
10322,FE Civil Engineering Exam Prep,FAMILY,2.8,9,21M,"1,000+",Free,0,Everyone,Education,"July 27, 2018",5.33.3669,5.0 and up


In [176]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

## Convert Variables to Numeric Data Type

There are some variables that should be of the numeric data type, but here in this dataset they are of the object data type. We'll have to convert them to numeric. Those variables are:

1. Size
2. Installs
3. Price

### 1. `Size`

In [177]:
# View the data type of the Size column
df['Size'].dtype

dtype('O')

In [178]:
# Number of unique values in the Size column
df['Size'].nunique()

461

In [179]:
# Top 10 most-appearing values in the Size column
df['Size'].value_counts().sort_values(ascending=False).head(10)

Size
Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
15M                    184
17M                    160
19M                    154
26M                    149
16M                    149
Name: count, dtype: int64

In [180]:
# Replace 'Varies with device' with null values
df['Size'] = df['Size'].replace('Varies with device', np.nan)

In [181]:
# To check whether it has been replaced successfully
df['Size'].value_counts(dropna=False).sort_values(ascending=False).head(10)

Size
NaN    1695
11M     198
12M     196
14M     194
13M     191
15M     184
17M     160
19M     154
26M     149
16M     149
Name: count, dtype: int64

In [182]:
# Unique values in the Size column
df['Size'].unique()

array(['19M', '14M', '8.7M', '25M', '2.8M', '5.6M', '29M', '33M', '3.1M',
       '28M', '12M', '20M', '21M', '37M', '2.7M', '5.5M', '17M', '39M',
       '31M', '4.2M', '7.0M', '23M', '6.0M', '6.1M', '4.6M', '9.2M',
       '5.2M', '11M', '24M', nan, '9.4M', '15M', '10M', '1.2M', '26M',
       '8.0M', '7.9M', '56M', '57M', '35M', '54M', '201k', '3.6M', '5.7M',
       '8.6M', '2.4M', '27M', '2.5M', '16M', '3.4M', '8.9M', '3.9M',
       '2.9M', '38M', '32M', '5.4M', '18M', '1.1M', '2.2M', '4.5M',
       '9.8M', '52M', '9.0M', '6.7M', '30M', '2.6M', '7.1M', '3.7M',
       '22M', '7.4M', '6.4M', '3.2M', '8.2M', '9.9M', '4.9M', '9.5M',
       '5.0M', '5.9M', '13M', '73M', '6.8M', '3.5M', '4.0M', '2.3M',
       '7.2M', '2.1M', '42M', '7.3M', '9.1M', '55M', '23k', '6.5M',
       '1.5M', '7.5M', '51M', '41M', '48M', '8.5M', '46M', '8.3M', '4.3M',
       '4.7M', '3.3M', '40M', '7.8M', '8.8M', '6.6M', '5.1M', '61M',
       '66M', '79k', '8.4M', '118k', '44M', '695k', '1.6M', '6.2M', '18k',
       

In [183]:
# Check the data types in the 'Size' column
size_data_types = df['Size'].apply(type).unique()
print(size_data_types)

[<class 'str'> <class 'float'>]


The `Size` column now has two data types because NaN values are considered a flaot data type.

The logic I have applied here is that I've first converted values in kbs to mbs. Then I converted them to a float data type and created a new column named `Size (Mb)`.

In [184]:
# Define a function to convert sizes from kb to mb
def kb_to_mb(size):
    if size is None or pd.isna(size):
        return None
    if 'k' in size:
        return float(size.replace('k', '')) / 1024
    if 'M' in size:
        return float(size.replace('M', ''))

It's always better to make a copy of the original DataFrame before making any changes to it.

In [185]:
# Make a copy of the df
cleaned_df = df.copy()

In [186]:
# Apply the conversion function to the 'Size' column
cleaned_df['Size (Mb)'] = cleaned_df['Size'].apply(kb_to_mb)

In [187]:
cleaned_df.sample(20)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size (Mb)
9602,Connect'Em Halloween,FAMILY,4.5,6673,4.4M,"500,000+",Free,0,Everyone,Puzzle,"September 23, 2016",1.0.7,2.0 and up,4.4
10509,PIP Selfie Camera Photo Editor,PHOTOGRAPHY,4.4,156322,,"10,000,000+",Free,0,Everyone,Photography,"February 1, 2018",Varies with device,Varies with device,
7429,Miami Crime Vice Town,GAME,4.1,154519,99M,"10,000,000+",Free,0,Mature 17+,Action,"May 9, 2017",1.4,2.3 and up,99.0
8339,DF Wall Plus – Droid Firewall,TOOLS,,9,6.3M,500+,Free,0,Everyone,Tools,"August 20, 2017",1.0,4.0.3 and up,6.3
6021,BD TYCOON,FAMILY,3.3,16,33M,500+,Free,0,Everyone,Simulation,"July 6, 2017",1.2,4.1 and up,33.0
3574,Cloud Print,PRODUCTIVITY,4.1,282460,,"500,000,000+",Free,0,Everyone,Productivity,"May 23, 2018",Varies with device,Varies with device,
8393,DG Monitor,BUSINESS,,1,4.5M,100+,Free,0,Everyone,Business,"October 12, 2017",1.0.0.17,4.0 and up,4.5
4636,/u/app,COMMUNICATION,4.7,573,53M,"10,000+",Free,0,Mature 17+,Communication,"July 3, 2018",4.2.4,4.1 and up,53.0
5929,Arabic Alphabet Alif Ba Ta Wooden Blocks,FAMILY,4.6,66,17M,"10,000+",Free,0,Everyone,Educational;Education,"February 25, 2018",1.0.0,2.3 and up,17.0
2368,live Point,MEDICAL,3.6,54,17M,"5,000+",Free,0,Mature 17+,Medical,"July 3, 2018",2.6.5,4.1 and up,17.0


In [188]:
# Drop the `Size` column
cleaned_df.drop('Size', axis=1, inplace=True)

### 2. `Installs`

In [189]:
# Check the data types in the 'Installs' column
installs_data_types = cleaned_df['Installs'].apply(type).unique()
print(installs_data_types)

[<class 'str'>]


In [190]:
# Define a function to convert the 'Installs' values to numeric
def convert_installs(installs):
    if isinstance(installs, str):
        return int(installs.replace(',', '').replace('+', ''))
    else:
        return installs
    
# Apply the conversion function to the 'Installs' column
cleaned_df['Installs'] = cleaned_df['Installs'].apply(convert_installs)

In [191]:
cleaned_df['Installs']

0           10000
1          500000
2         5000000
3        50000000
4          100000
           ...   
10836        5000
10837         100
10838        1000
10839        1000
10840    10000000
Name: Installs, Length: 10841, dtype: int64

In [192]:
cleaned_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size (Mb)
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8


In [193]:
# Check it has been successfully converted on numerical variable
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Installs        10841 non-null  int64  
 5   Type            10840 non-null  object 
 6   Price           10841 non-null  object 
 7   Content Rating  10841 non-null  object 
 8   Genres          10840 non-null  object 
 9   Last Updated    10841 non-null  object 
 10  Current Ver     10833 non-null  object 
 11  Android Ver     10839 non-null  object 
 12  Size (Mb)       9146 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 1.1+ MB


### 3. `Price`

In [194]:
# Check the data types in the 'Price' column
price_data_types = cleaned_df['Price'].apply(type).unique()
print(price_data_types)

[<class 'str'>]


In [195]:
cleaned_df['Price'].unique()

array(['0', '$4.99', '$3.99', '$6.99', '$1.49', '$2.99', '$7.99', '$5.99',
       '$3.49', '$1.99', '$9.99', '$7.49', '$0.99', '$9.00', '$5.49',
       '$10.00', '$24.99', '$11.99', '$79.99', '$16.99', '$14.99',
       '$1.00', '$29.99', '$12.99', '$2.49', '$10.99', '$1.50', '$19.99',
       '$15.99', '$33.99', '$74.99', '$39.99', '$3.95', '$4.49', '$1.70',
       '$8.99', '$2.00', '$3.88', '$25.99', '$399.99', '$17.99',
       '$400.00', '$3.02', '$1.76', '$4.84', '$4.77', '$1.61', '$2.50',
       '$1.59', '$6.49', '$1.29', '$5.00', '$13.99', '$299.99', '$379.99',
       '$37.99', '$18.99', '$389.99', '$19.90', '$8.49', '$1.75',
       '$14.00', '$4.85', '$46.99', '$109.99', '$154.99', '$3.08',
       '$2.59', '$4.80', '$1.96', '$19.40', '$3.90', '$4.59', '$15.46',
       '$3.04', '$4.29', '$2.60', '$3.28', '$4.60', '$28.99', '$2.95',
       '$2.90', '$1.97', '$200.00', '$89.99', '$2.56', '$30.99', '$3.61',
       '$394.99', '$1.26', '$1.20', '$1.04'], dtype=object)

In [196]:
# Define a function to convert the 'Price' values to float
def convert_price(price):
    if isinstance(price, str) and price.startswith('$'):
        return float(price.replace('$', ''))
    else:
        return None
    
# Apply the conversion function to the 'Price' column
cleaned_df['Price ($)'] = cleaned_df['Price'].apply(convert_price)

In [197]:
cleaned_df['Price ($)'].unique()

array([   nan,   4.99,   3.99,   6.99,   1.49,   2.99,   7.99,   5.99,
         3.49,   1.99,   9.99,   7.49,   0.99,   9.  ,   5.49,  10.  ,
        24.99,  11.99,  79.99,  16.99,  14.99,   1.  ,  29.99,  12.99,
         2.49,  10.99,   1.5 ,  19.99,  15.99,  33.99,  74.99,  39.99,
         3.95,   4.49,   1.7 ,   8.99,   2.  ,   3.88,  25.99, 399.99,
        17.99, 400.  ,   3.02,   1.76,   4.84,   4.77,   1.61,   2.5 ,
         1.59,   6.49,   1.29,   5.  ,  13.99, 299.99, 379.99,  37.99,
        18.99, 389.99,  19.9 ,   8.49,   1.75,  14.  ,   4.85,  46.99,
       109.99, 154.99,   3.08,   2.59,   4.8 ,   1.96,  19.4 ,   3.9 ,
         4.59,  15.46,   3.04,   4.29,   2.6 ,   3.28,   4.6 ,  28.99,
         2.95,   2.9 ,   1.97, 200.  ,  89.99,   2.56,  30.99,   3.61,
       394.99,   1.26,   1.2 ,   1.04])

In [198]:
# Drop the Price column
cleaned_df.drop("Price", axis=1, inplace=True)

In [199]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Installs        10841 non-null  int64  
 5   Type            10840 non-null  object 
 6   Content Rating  10841 non-null  object 
 7   Genres          10840 non-null  object 
 8   Last Updated    10841 non-null  object 
 9   Current Ver     10833 non-null  object 
 10  Android Ver     10839 non-null  object 
 11  Size (Mb)       9146 non-null   float64
 12  Price ($)       800 non-null    float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1.1+ MB


In [200]:
cleaned_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size (Mb),Price ($)
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0,
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0,
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7,
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0,
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8,


In [201]:
cleaned_df.to_csv("cleaned_googleplaystore.csv")