# Google APP Store EDA

In [2]:
import pandas as pd
import ydata_profiling as yd
import seaborn as sns 
import numpy as np
import matplotlib.pyplot as plt 
import sklearn as sk
import statsmodels as sum
from statsmodels.formula.api import ols
import statsmodels.api as sm 


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('./data/googleplaystore.csv')

In [4]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver', 'Unnamed: 13'],
      dtype='object')

In [5]:
df.describe()

Unnamed: 0,Rating,Reviews,Unnamed: 13
count,9367.0,10841.0,0.0
mean,4.191513,444111.9,
std,0.515735,2927629.0,
min,1.0,0.0,
25%,4.0,38.0,
50%,4.3,2094.0,
75%,4.5,54768.0,
max,5.0,78158310.0,


In [6]:
# run automatic eda ydata profiling report
# profile = yd.ProfileReport(df)
# profile.to_file('./data/googleplaystore.html')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
 9   Genres          10840 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10839 non-null  object 
 13  Unnamed: 13     0 non-null      float64
dtypes: float64(2), int64(1), object(11)
memory usage: 1.2+ MB


In [8]:
# look at the dataset
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Unnamed: 13
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,


In [9]:
# take sample from big dataset
df.sample(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Unnamed: 13
1187,Debonairs Pizza,FOOD_AND_DRINK,3.8,3320,8.4M,"500,000+",Free,0,Everyone,Food & Drink,"August 4, 2018",1.9.117,4.0.3 and up,
3619,Baby Monitor,PARENTING,4.4,5343,5.2M,"1,000,000+",Free,0,Everyone,Parenting,"March 30, 2018",2.1.1,4.1 and up,
8282,D.C. Driving/Walking Tours,TRAVEL_AND_LOCAL,,0,32M,50+,Paid,$4.99,Everyone,Travel & Local,"July 27, 2018",3.8.0,4.4 and up,


In [10]:
df.nlargest(10, 'Rating')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Unnamed: 13
329,Hojiboy Tojiboyev Life Hacks,COMICS,5.0,15,37M,"1,000+",Free,0,Everyone,Comics,"June 26, 2018",2.0,4.0.3 and up,
612,American Girls Mobile Numbers,DATING,5.0,5,4.4M,"1,000+",Free,0,Mature 17+,Dating,"July 17, 2018",3.0,4.0.3 and up,
615,Awake Dating,DATING,5.0,2,70M,100+,Free,0,Mature 17+,Dating,"July 24, 2018",2.2.9,4.4 and up,
633,Spine- The dating app,DATING,5.0,5,9.3M,500+,Free,0,Teen,Dating,"July 14, 2018",4.0,4.0.3 and up,
636,Girls Live Talk - Free Text and Video Chat,DATING,5.0,6,5.0M,100+,Free,0,Mature 17+,Dating,"August 1, 2018",8.2,4.0.3 and up,
640,Online Girls Chat Group,DATING,5.0,5,5.0M,100+,Free,0,Mature 17+,Dating,"August 2, 2018",8.2,4.0.3 and up,
654,Speeding Joyride & Car Meet App,DATING,5.0,3,25M,100+,Free,0,Mature 17+,Dating,"July 20, 2018",1.2.9,4.1 and up,
1028,SUMMER SONIC app,EVENTS,5.0,4,61M,500+,Free,0,Everyone,Events,"July 24, 2018",1.0,4.4 and up,
1030,Prosperity,EVENTS,5.0,16,2.3M,100+,Free,0,Everyone,Events,"July 9, 2018",1.14,2.0 and up,
1038,Mindvalley U Tallinn 2018,EVENTS,5.0,1,21M,100+,Free,0,Everyone,Events,"July 3, 2018",1.0.5,4.4 and up,


In [11]:
df.describe()

Unnamed: 0,Rating,Reviews,Unnamed: 13
count,9367.0,10841.0,0.0
mean,4.191513,444111.9,
std,0.515735,2927629.0,
min,1.0,0.0,
25%,4.0,38.0,
50%,4.3,2094.0,
75%,4.5,54768.0,
max,5.0,78158310.0,


## Write down the all numeric verible and make them numeric
1. Size
2. intall
3. price

In [12]:
df['Size'].value_counts()

Size
Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
                      ... 
470k                     1
975k                     1
980k                     1
404k                     1
420k                     1
Name: count, Length: 461, dtype: int64

# check that How many MBs are there in one Kbs? and how to calculate?
1. convert KBs into M
2. then remove M
3. Handle `varies with Device`

In [13]:
df['Size'].isnull().sum()

np.int64(0)

In [14]:
import numpy as np

# Function to convert size to MB
def convert_size(size):
    if size == 'Varies with device':
        return np.nan
    if 'M' in size:
        return float(size.replace('M', ''))
    if 'k' in size:
        return float(size.replace('k', '')) / 1024  # Convert KB to MB
    return np.nan

df['Size'] = df['Size'].apply(convert_size)


In [15]:
df['Size'].value_counts()

Size
11.000000    198
12.000000    196
14.000000    194
13.000000    191
15.000000    184
            ... 
0.539062       1
0.864258       1
0.996094       1
0.568359       1
0.604492       1
Name: count, Length: 459, dtype: int64

In [16]:
# Replace 'Varies with device' with NaN
df['Size'] = df['Size'].replace('Varies with device', pd.NA)

In [17]:
df['Size'].isnull().sum()

np.int64(1695)

# Install
you can do BINNING

In [18]:
df['Installs'].value_counts()

Installs
1,000,000+        1579
10,000,000+       1252
100,000+          1169
10,000+           1054
1,000+             908
5,000,000+         752
100+               719
500,000+           539
50,000+            479
5,000+             477
100,000,000+       409
10+                386
500+               330
50,000,000+        289
50+                205
5+                  82
500,000,000+        72
1+                  67
1,000,000,000+      58
0+                  14
0                    1
Name: count, dtype: int64

## do it as numeric column or group that into different group following Binning method

In [19]:
# we solve this problem by binning method 
df['Installs'] = (
    df['Installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)


In [20]:
df['Installs'].dtype


dtype('int64')

In [21]:
bins = [0, 1000, 10000, 100000, 1000000, 10000000, float('inf')]
labels = ['0-1K', '1K-10K', '10K-100K', '100K-1M', '1M-10M', '10M+']

df['Install_Group'] = pd.cut(df['Installs'], bins=bins, labels=labels)
df['Install_Group'].value_counts()

Install_Group
0-1K        2697
100K-1M     2118
1M-10M      2004
10K-100K    1648
1K-10K      1531
10M+         828
Name: count, dtype: int64

In [22]:
df['Install_Group'].dtype

CategoricalDtype(categories=['0-1K', '1K-10K', '10K-100K', '100K-1M', '1M-10M', '10M+'], ordered=True, categories_dtype=object)

## How to Handle Price 

In [23]:
df['Price'].value_counts()

Price
0          10041
$0.99        148
$2.99        129
$1.99         73
$4.99         72
           ...  
$3.61          1
$394.99        1
$1.26          1
$1.20          1
$1.04          1
Name: count, Length: 92, dtype: int64

## Remove $ Sign from all columns using pandas

In [24]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver', 'Unnamed: 13', 'Install_Group'],
      dtype='object')

In [25]:
# Remove $ sign
df['Price'] = df['Price'].str.replace('$', '', regex=False)

# Optional: convert to float
df['Price'] = df['Price'].astype(float)


In [26]:
df['Type'].isnull().sum()

np.int64(1)

In [27]:
## how to fill missing values in Type column
df['Type'].fillna(df['Type'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Type'].fillna(df['Type'].mode()[0], inplace=True)


In [28]:
df['Type'].isnull().sum()

np.int64(0)

In [29]:
df.describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Price,Unnamed: 13
count,9367.0,10841.0,9146.0,10841.0,10841.0,0.0
mean,4.191513,444111.9,21.514141,15462910.0,1.027273,
std,0.515735,2927629.0,22.588679,85025570.0,15.948971,
min,1.0,0.0,0.008301,0.0,0.0,
25%,4.0,38.0,4.9,1000.0,0.0,
50%,4.3,2094.0,13.0,100000.0,0.0,
75%,4.5,54768.0,30.0,5000000.0,0.0,
max,5.0,78158310.0,100.0,1000000000.0,400.0,
