# Google Play Store EDA

In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the Google Play Store dataset from the local ./Dataset directory
# (path is relative to the notebook's working directory).
df = pd.read_csv("./Dataset/googleplaystore.csv")

In [3]:
# Optional: build an automated profiling report with ydata_profiling and save it as HTML.
# Left disabled; uncomment the lines below to run it (requires the ydata_profiling package).
# import ydata_profiling as yd
# profile= yd.ProfileReport(df)
# profile.to_file(output_file = "./outputs/ydata_goggle_EDA.html")

In [4]:
# Inspect column names, non-null counts, and dtypes.
# Every column except Rating is loaded as object (text) and needs cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10840 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10840 non-null  object 
 8   Content Rating  10841 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
# Preview the first five rows of the dataset
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [6]:
# Draw two random rows to spot-check the data.
# No random_state is passed, so a different pair is shown on every run.
df.sample(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
3333,Speedtest by Ookla,TOOLS,4.4,1028794,Varies with device,"100,000,000+",Free,0,Everyone,Tools,"July 19, 2018",Varies with device,Varies with device
8855,DT Fieldlink,BUSINESS,3.0,2,49M,500+,Free,0,Everyone,Business,"September 19, 2017",2.1.1,4.1 and up


In [7]:
# Summary statistics of the numeric columns (only Rating is numeric at this point).
# NOTE(review): the reported max of 19.0 looks out of range for a rating — likely a
# malformed row; verify before using Rating in any analysis.
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


# Columns that hold numeric information but are stored as text, and need to be converted
1. Size
2. Installs
3. Price

# Handling size column

In [8]:
import pandas as pd
import numpy as np

# Function to convert and format Size
def convert_and_format_size(size):
    """Normalize a raw app-size label to a whole-kilobyte string.

    Parameters
    ----------
    size : str or missing value
        Raw label such as ``"19M"`` (megabytes), ``"201k"`` (kilobytes) or
        ``"Varies with device"``. Labels already in this function's own
        output format (``"19456 KB"``, ``"Unknown"``) are accepted as well,
        so applying the function twice gives the same result as applying
        it once.

    Returns
    -------
    str
        ``"<n> KB"`` where megabytes are multiplied by 1024 and the value is
        rounded to zero decimals, or ``"Unknown"`` when the input is
        missing, not a string, or not a recognizable size.
    """
    # Missing values (None/NaN) and any other non-string input carry no
    # unit suffix to interpret.
    if not isinstance(size, str):
        return "Unknown"

    text = size.strip().replace(',', '')
    if text == 'Varies with device':
        return "Unknown"

    try:
        if text.endswith('KB'):
            # Already converted by an earlier run of this function.
            size_kb = float(text[:-2])
        elif text.endswith('M'):
            size_kb = float(text[:-1]) * 1024
        elif text.endswith('k'):
            size_kb = float(text[:-1])
        else:
            return "Unknown"
    except ValueError:
        # The part before the unit suffix is not a number (e.g. "abcM").
        return "Unknown"

    # float() also parses "nan"/"inf"; those are not real sizes.
    if not np.isfinite(size_kb):
        return "Unknown"
    return f"{size_kb:.0f} KB"

# Apply the conversion to every value and overwrite the original 'Size' column in place.
# The raw labels (e.g. "19M") are no longer available in df after this cell runs.
df['Size'] = df['Size'].apply(convert_and_format_size)


In [9]:
# Frequency of each converted Size label, most common first
df["Size"].value_counts()

Size
Unknown     1696
11264 KB     198
12288 KB     196
14336 KB     194
13312 KB     191
            ... 
552 KB         1
885 KB         1
1020 KB        1
582 KB         1
619 KB         1
Name: count, Length: 460, dtype: int64

In [10]:
# Count missing values in the Size column.
# This is 0 by construction: the conversion maps missing or unparseable sizes to the
# string "Unknown", so count "Unknown" entries to measure real missingness.
df["Size"].isnull().sum()

np.int64(0)

# Handling Installs Column

In [11]:
# Clean the Installs column (remove commas and plus sign)
def clean_installs(value):
    """Convert a raw install-count label such as ``"10,000+"`` to an integer.

    Parameters
    ----------
    value : str, number, or missing value
        Raw label with optional thousands separators and a trailing ``+``.
        Values that are already numeric are accepted too, so applying the
        function to an already cleaned column leaves it unchanged.

    Returns
    -------
    int or float
        The install count as ``int``, or ``np.nan`` when the value is
        missing, negative, fractional, or not a number (e.g. ``"Free"``).
    """
    if pd.isnull(value):
        return np.nan

    # Already-numeric input (e.g. the cell is re-run on a cleaned column).
    # Without this, str(10000.0) == "10000.0" fails the digit check below
    # and every cleaned value would be turned into NaN.
    if isinstance(value, (int, np.integer)) and not isinstance(value, bool):
        return int(value) if value >= 0 else np.nan
    if isinstance(value, (float, np.floating)):
        return int(value) if value >= 0 and float(value).is_integer() else np.nan

    text = str(value).replace(',', '').replace('+', '')
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, on which int() raises ValueError.
    if text.isdecimal():
        return int(text)
    return np.nan

# Convert the raw install labels to numbers (unparseable labels become NaN).
df['Installs'] = df['Installs'].apply(clean_installs)

# Define bin edges and labels.
# Intervals are right-closed: (0, 1000], (1000, 10000], ... and include_lowest
# below also admits 0 into the first bin, so e.g. exactly 10,000 installs is 'Low'.
# The top edge is open-ended (np.inf) instead of df['Installs'].max(): pd.cut needs
# strictly increasing edges, so a data-dependent top edge breaks the cell whenever
# the largest install count is 10,000,000 or less (e.g. on a filtered subset).
bins = [0, 1000, 10000, 100000, 1000000, 10000000, np.inf]
labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High', 'Extremely High']

# Create new column with categories (rows with NaN installs stay NaN)
df['Install_Category'] = pd.cut(df['Installs'], bins=bins, labels=labels, include_lowest=True)


In [12]:
# Display the first 10 rows of 'Installs' next to their assigned install category
print(df[['Installs', 'Install_Category']].head(10))
# Number of apps in each install category, most common first
print(df['Install_Category'].value_counts())


     Installs Install_Category
0     10000.0              Low
1    500000.0             High
2   5000000.0        Very High
3  50000000.0   Extremely High
4    100000.0         Moderate
5     50000.0         Moderate
6     50000.0         Moderate
7   1000000.0             High
8   1000000.0             High
9     10000.0              Low
Install_Category
Very Low          2711
High              2118
Very High         2004
Moderate          1648
Low               1531
Extremely High     828
Name: count, dtype: int64


In [13]:
# List the distinct values in the cleaned Installs column.
# Values are floats because the column contains NaN for labels that could not be parsed.
df["Installs"].unique()

array([1.e+04, 5.e+05, 5.e+06, 5.e+07, 1.e+05, 5.e+04, 1.e+06, 1.e+07,
       5.e+03, 1.e+08, 1.e+09, 1.e+03, 5.e+08, 5.e+01, 1.e+02, 5.e+02,
       1.e+01, 1.e+00, 5.e+00, 0.e+00,    nan])

In [14]:
# Frequency of each install count, most common first (NaN is excluded by default)
df["Installs"].value_counts()

Installs
1.000000e+06    1579
1.000000e+07    1252
1.000000e+05    1169
1.000000e+04    1054
1.000000e+03     907
5.000000e+06     752
1.000000e+02     719
5.000000e+05     539
5.000000e+04     479
5.000000e+03     477
1.000000e+08     409
1.000000e+01     386
5.000000e+02     330
5.000000e+07     289
5.000000e+01     205
5.000000e+00      82
5.000000e+08      72
1.000000e+00      67
1.000000e+09      58
0.000000e+00      15
Name: count, dtype: int64

# Handling Price column

In [15]:
# Strip the leading dollar sign, treat the literal label 'Free' as 0, and parse the
# result as a number; anything that still is not numeric is coerced to NaN.
df['Price'] = pd.to_numeric(
    df['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .replace('Free', '0'),
    errors='coerce',
)

In [16]:
# Frequency of each price after conversion to numbers, most common first
df["Price"].value_counts()

Price
0.00      10040
0.99        148
2.99        129
1.99         73
4.99         72
          ...  
3.61          1
394.99        1
1.26          1
1.20          1
1.04          1
Name: count, Length: 92, dtype: int64

In [17]:
#Checking missing Values in Price Column
df["Price"].isnull().sum()

np.int64(1)

In [18]:
# Display rows where the 'Price' column has missing (NaN) values.
# NOTE(review): the row shown has values that look shifted across columns (Reviews is
# "3.0M", Genres holds a date) — presumably a malformed source record; verify and
# repair or drop it before further analysis.
df[df['Price'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Install_Category
10472,Life Made WI-Fi Touchscreen Photo Frame,,19.0,3.0M,Unknown,,0,,Everyone,"February 11, 2018",1.0.19,4.0 and up,,
