# Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', 500)

In [2]:
df = pd.read_csv('data/googleplaystore.csv')

In [3]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
data = df.copy()

In [6]:
data = df.drop(columns={'Genres', 'Current Ver', 'Android Ver', 'Price'}, axis=1)

In [7]:
for col in data:
    print(col)
    print(data[col].value_counts(normalize=True))
    print('-------------------------------')

App
ROBLOX                                               0.000830
CBS Sports App - Scores, News, Stats & Watch Live    0.000738
ESPN                                                 0.000646
Duolingo: Learn Languages Free                       0.000646
Candy Crush Saga                                     0.000646
                                                       ...   
HTC Sense Input - ES                                 0.000092
Servers Ultimate Pack B                              0.000092
Aquarium Co-Op Podcast                               0.000092
Zowi App                                             0.000092
Yidio: TV Show & Movie Guide                         0.000092
Name: App, Length: 9660, dtype: float64
-------------------------------
Category
FAMILY                 0.181902
GAME                   0.105525
TOOLS                  0.077760
MEDICAL                0.042708
BUSINESS               0.042432
PRODUCTIVITY           0.039111
PERSONALIZATION        0.036159
COMMUNICA

**Observations**
1. **App - there seems to be a few duplicates in app. have to deal with deleting those.**
2. **Category - need to look into '1.9'.**
3. **Rating - the rating is from 1-5, but there's a rating of 19. Need to delete that one.**
4. **Reviews - might drop the '0' reviews or filter the amount of reviews.**
5. **Size - the 'varies with device' need to be replace with the average size and the "M" needs to be taken out.**
6. **Installs - need to get rid of the '+' and maybe filter a min-max scale.**
7. **Only focused on Free apps at this time.**
8. **Change last update into days.**

# Cleaning

## App Duplicates

In [8]:
# let's check to make sure we have duplicates
data[data['App'] == 'ROBLOX']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Content Rating,Last Updated
1653,ROBLOX,GAME,4.5,4447388,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"
1701,ROBLOX,GAME,4.5,4447346,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"
1748,ROBLOX,GAME,4.5,4448791,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"
1841,ROBLOX,GAME,4.5,4449882,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"
1870,ROBLOX,GAME,4.5,4449910,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"
2016,ROBLOX,FAMILY,4.5,4449910,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"
2088,ROBLOX,FAMILY,4.5,4450855,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"
2206,ROBLOX,FAMILY,4.5,4450890,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"
4527,ROBLOX,FAMILY,4.5,4443407,67M,"100,000,000+",Free,Everyone 10+,"July 31, 2018"


In [9]:
data[data['App'] == 'ESPN']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Content Rating,Last Updated
2959,ESPN,SPORTS,4.2,521138,Varies with device,"10,000,000+",Free,Everyone 10+,"July 19, 2018"
3010,ESPN,SPORTS,4.2,521138,Varies with device,"10,000,000+",Free,Everyone 10+,"July 19, 2018"
3018,ESPN,SPORTS,4.2,521138,Varies with device,"10,000,000+",Free,Everyone 10+,"July 19, 2018"
3048,ESPN,SPORTS,4.2,521140,Varies with device,"10,000,000+",Free,Everyone 10+,"July 19, 2018"
3060,ESPN,SPORTS,4.2,521140,Varies with device,"10,000,000+",Free,Everyone 10+,"July 19, 2018"
3072,ESPN,SPORTS,4.2,521140,Varies with device,"10,000,000+",Free,Everyone 10+,"July 19, 2018"
4069,ESPN,SPORTS,4.2,521081,Varies with device,"10,000,000+",Free,Everyone 10+,"July 19, 2018"


In [10]:
data[data['App'] == 'Candy Crush Saga']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Content Rating,Last Updated
1655,Candy Crush Saga,GAME,4.4,22426677,74M,"500,000,000+",Free,Everyone,"July 5, 2018"
1705,Candy Crush Saga,GAME,4.4,22428456,74M,"500,000,000+",Free,Everyone,"July 5, 2018"
1751,Candy Crush Saga,GAME,4.4,22428456,74M,"500,000,000+",Free,Everyone,"July 5, 2018"
1842,Candy Crush Saga,GAME,4.4,22429716,74M,"500,000,000+",Free,Everyone,"July 5, 2018"
1869,Candy Crush Saga,GAME,4.4,22430188,74M,"500,000,000+",Free,Everyone,"July 5, 2018"
1966,Candy Crush Saga,GAME,4.4,22430188,74M,"500,000,000+",Free,Everyone,"July 5, 2018"
3994,Candy Crush Saga,FAMILY,4.4,22419455,74M,"500,000,000+",Free,Everyone,"July 5, 2018"


**Looking at some of the duplicates it seems that the only difference is the amount of reviews and/or the category name. For now I think I'll stick with the first value and drop the rest.**

In [11]:
data = data.drop_duplicates(subset=['App'], keep='first')

## Category

In [12]:
data[data['Category'] == '1.9']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Content Rating,Last Updated
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,,1.0.19


Since this is one entry I can easily find the correct info and plug it in.
Category = Lifestyle
Rating = 1.9
Reviews = 19.0
Size = 3.0M
Installs = 1,000
Type = Free
Content Rating = Everyone

In [13]:
data.at[10472,"Category"]="LIFESTYLE"
data.at[10472, "Rating"] = 1.9
data.at[10472, "Reviews"] = "19.0"
data.at[10472, "Size"] = "3.0M"
data.at[10472, "Installs"] = "1,000+"
data.at[10472, "Type"] = "Free"
data.at[10472, "Content Rating"] = "Everyone"
data.at[10472, "Last Updated"] = "February 11, 2018"

In [14]:
data[data['App'] == 'Life Made WI-Fi Touchscreen Photo Frame']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Content Rating,Last Updated
10472,Life Made WI-Fi Touchscreen Photo Frame,LIFESTYLE,1.9,19.0,3.0M,"1,000+",Free,Everyone,"February 11, 2018"


## Rating

In [15]:
# replacing the NAN values with the average rating
data['Rating'] = data['Rating'].fillna(data['Rating'].mean())

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9660 entries, 0 to 10840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             9660 non-null   object 
 1   Category        9660 non-null   object 
 2   Rating          9660 non-null   float64
 3   Reviews         9660 non-null   object 
 4   Size            9660 non-null   object 
 5   Installs        9660 non-null   object 
 6   Type            9659 non-null   object 
 7   Content Rating  9660 non-null   object 
 8   Last Updated    9660 non-null   object 
dtypes: float64(1), object(8)
memory usage: 1.0+ MB


## Reviews

In [17]:
data = data.dropna().reset_index()

In [18]:
data.Reviews.value_counts(normalize=True)

0         0.061290
1         0.028160
2         0.022052
3         0.017600
4         0.014184
            ...   
25183     0.000104
71432     0.000104
14952     0.000104
1580      0.000104
706301    0.000104
Name: Reviews, Length: 5331, dtype: float64

In [19]:
data['Reviews'] = pd.to_numeric(data['Reviews'])

In [20]:
data = data[data['Reviews'] > 0]

## Size

In [21]:
data[data['Size'] == "Varies with device"]

Unnamed: 0,index,App,Category,Rating,Reviews,Size,Installs,Type,Content Rating,Last Updated
37,37,Floor Plan Creator,ART_AND_DESIGN,4.1,36639.0,Varies with device,"5,000,000+",Free,Everyone,"July 14, 2018"
42,42,Textgram - write on photos,ART_AND_DESIGN,4.4,295221.0,Varies with device,"10,000,000+",Free,Everyone,"July 30, 2018"
52,52,Used Cars and Trucks for Sale,AUTO_AND_VEHICLES,4.6,17057.0,Varies with device,"1,000,000+",Free,Everyone,"July 30, 2018"
67,67,Ulysse Speedometer,AUTO_AND_VEHICLES,4.3,40211.0,Varies with device,"5,000,000+",Free,Everyone,"July 30, 2018"
68,68,REPUVE,AUTO_AND_VEHICLES,3.9,356.0,Varies with device,"100,000+",Free,Everyone,"May 25, 2018"
...,...,...,...,...,...,...,...,...,...,...
9536,10713,My Earthquake Alerts - US & Worldwide Earthquakes,WEATHER,4.4,3471.0,Varies with device,"100,000+",Free,Everyone,"July 24, 2018"
9546,10725,Posta App,MAPS_AND_NAVIGATION,3.6,8.0,Varies with device,"1,000+",Free,Everyone,"September 27, 2017"
9584,10765,Chat For Strangers - Video Chat,SOCIAL,3.4,622.0,Varies with device,"100,000+",Free,Mature 17+,"May 23, 2018"
9644,10826,Frim: get new friends on local chat rooms,SOCIAL,4.0,88486.0,Varies with device,"5,000,000+",Free,Mature 17+,"March 23, 2018"


In [22]:
data['Size'].value_counts(normalize=True)

Varies with device    0.131025
12M                   0.018308
11M                   0.017977
14M                   0.017977
13M                   0.017646
                        ...   
953k                  0.000110
526k                  0.000110
691k                  0.000110
219k                  0.000110
556k                  0.000110
Name: Size, Length: 447, dtype: float64

In [23]:
# create a variable to hold a certain value
mask = data['Size'].str.startswith('V')

In [24]:
#replace that value with zero
data.loc[mask, 'Size'] = 0

In [25]:
# convert the K and M to thousands and millions using regex
data.Size = (data.Size.replace(r'[kM]+$', '', regex=True).astype(float) * \
           data.Size.str.extract(r'[\d\.]+([kM]+)', expand=False).fillna(1)
           .replace(['k','M'], [10**3, 10**6]).astype(int))

In [26]:
#to retain some data replace zeros with the column's avearge
mean_size = data['Size'].mean()
data['Size'] = data.Size.mask(data.Size == 0.0,mean_size)

## Installs

In [27]:
# remove the + sign from columns
data['Installs'] = data['Installs'].str.replace('+','').str.replace(',','')

In [28]:
data['Installs'] = pd.to_numeric(data['Installs'])

## Last Updated

In [29]:
data['Last Updated'] = pd.to_datetime(data['Last Updated'], format="%B %d, %Y", errors='coerce')

## Free Apps

In [30]:
data.Type.value_counts(normalize=True)

Free    0.925003
Paid    0.074997
Name: Type, dtype: float64

In [31]:
main_df = data.copy()

In [32]:
main_df = main_df[main_df.Type == 'Free']

# Cleaned Data

In [33]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8387 entries, 0 to 9658
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   index           8387 non-null   int64         
 1   App             8387 non-null   object        
 2   Category        8387 non-null   object        
 3   Rating          8387 non-null   float64       
 4   Reviews         8387 non-null   float64       
 5   Size            8387 non-null   float64       
 6   Installs        8387 non-null   int64         
 7   Type            8387 non-null   object        
 8   Content Rating  8387 non-null   object        
 9   Last Updated    8387 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(2), object(4)
memory usage: 720.8+ KB


In [34]:
main_df.to_csv('cleaned_data.csv')