In [1]:
import pandas as pd
import numpy as np
import json 
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn.ensemble

In [2]:
df = pd.read_csv("dataset/googleplaystore.csv")

In [3]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
df.shape

(10841, 13)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [6]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [7]:
def value_to_float(x):
    # Convert the string feature into float/Integer
    #If there is 'K' or 'M' in the string, convert it to the corresponding number (1000 or 1000000)
    
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 10**3
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 10**6
        return 1000000.0
    
    # If the string cannot be converted, return 0 instead
    try:
        parsed_val = float(x)
    except ValueError:
        parsed_val = 0.0
    return parsed_val

In [10]:
df = df.loc[df["Rating"].notnull()]
df = df.loc[df["Rating"] <= 5]

df["Price"] = df["Price"].apply(lambda x: x.replace('$', ''))
df["Installs"] = df["Installs"].apply(lambda x: x.replace('+', ''))
df['Installs'] = df['Installs'].apply(lambda a: str(a).replace(',', '') if ',' in str(a) else a)
df['Installs'] = df['Installs'].apply(lambda a : int(a))
df['Reviews'] = df['Reviews'].apply(lambda a : int(a))

genres = df["Genres"].value_counts().head().index

str_cols = ["Size", "Price"]

for col in str_cols:
    df[[col]] = df[[col]].fillna(value="")
    df[col] = df[col].apply(value_to_float)

In [11]:
df.dropna(inplace=True)

In [12]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

In [13]:
df.shape

(9360, 13)

In [14]:
df.describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Price
count,9360.0,9360.0,9360.0,9360.0,9360.0
mean,4.191838,514376.7,18941240.0,17908750.0,0.961279
std,0.515263,3145023.0,23028130.0,91266370.0,15.82164
min,1.0,1.0,0.0,1.0,0.0
25%,4.0,186.75,2400000.0,10000.0,0.0
50%,4.3,5955.0,9500000.0,500000.0,0.0
75%,4.5,81627.5,27000000.0,5000000.0,0.0
max,5.0,78158310.0,100000000.0,1000000000.0,400.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9360 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             9360 non-null   object 
 1   Category        9360 non-null   object 
 2   Rating          9360 non-null   float64
 3   Reviews         9360 non-null   int64  
 4   Size            9360 non-null   float64
 5   Installs        9360 non-null   int64  
 6   Type            9360 non-null   object 
 7   Price           9360 non-null   float64
 8   Content Rating  9360 non-null   object 
 9   Genres          9360 non-null   object 
 10  Last Updated    9360 non-null   object 
 11  Current Ver     9360 non-null   object 
 12  Android Ver     9360 non-null   object 
dtypes: float64(3), int64(2), object(8)
memory usage: 1023.8+ KB


In [20]:
# There are 7 records where Reviews are greater than Installs 
df[df['Reviews'] > df['Installs']].shape

(7, 13)

In [21]:
# Dropping 7 records that have greater Reviews than Installs
df.drop(df[df['Reviews'] > df['Installs']].index,inplace=True)
df[df['Reviews'] > df['Installs']].shape

(0, 13)

In [30]:
# Dropping rows that have Price > 200
df.drop(df[df['Price'] > 200].index,inplace=True)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9338 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             9338 non-null   object 
 1   Category        9338 non-null   object 
 2   Rating          9338 non-null   float64
 3   Reviews         9338 non-null   int64  
 4   Size            9338 non-null   float64
 5   Installs        9338 non-null   int64  
 6   Type            9338 non-null   object 
 7   Price           9338 non-null   float64
 8   Content Rating  9338 non-null   object 
 9   Genres          9338 non-null   object 
 10  Last Updated    9338 non-null   object 
 11  Current Ver     9338 non-null   object 
 12  Android Ver     9338 non-null   object 
dtypes: float64(3), int64(2), object(8)
memory usage: 1021.3+ KB


In [34]:
for i in genres:
    print(i)

Tools
Entertainment
Education
Action
Productivity


In [7]:
df = df[df['Category'] != '1.9']
value_counts = df['Category'].value_counts()

In [8]:
category = pd.get_dummies(df['Category'], prefix='Category')
df = pd.concat([df, category], axis=1)


In [9]:
df.drop('Category', axis=1, inplace=True)

   Rating  Size Price Content Rating      Last Updated         Current Ver   
0     4.1   19M     0       Everyone   January 7, 2018               1.0.0  \
1     3.9   14M     0       Everyone  January 15, 2018               2.0.0   
2     4.7  8.7M     0       Everyone    August 1, 2018               1.2.4   
3     4.5   25M     0           Teen      June 8, 2018  Varies with device   
4     4.3  2.8M     0       Everyone     June 20, 2018                 1.1   

    Android Ver  Category_ART_AND_DESIGN  Category_AUTO_AND_VEHICLES   
0  4.0.3 and up                     True                       False  \
1  4.0.3 and up                     True                       False   
2  4.0.3 and up                     True                       False   
3    4.2 and up                     True                       False   
4    4.4 and up                     True                       False   

   Category_BEAUTY  ...  Category_PERSONALIZATION  Category_PHOTOGRAPHY   
0            False  ...

In [12]:
df['Rating'].value_counts()

Rating
4.4    1109
4.3    1076
4.5    1038
4.2     952
4.6     823
4.1     708
4.0     568
4.7     499
3.9     386
3.8     303
5.0     274
3.7     239
4.8     234
3.6     174
3.5     163
3.4     128
3.3     102
4.9      87
3.0      83
3.1      69
3.2      64
2.9      45
2.8      42
2.7      25
2.6      25
2.5      21
2.3      20
2.4      19
1.0      16
2.2      14
1.9      13
2.0      12
1.7       8
1.8       8
2.1       8
1.6       4
1.4       3
1.5       3
1.2       1
Name: count, dtype: int64

In [18]:
print(df['Content Rating'].value_counts())

Content Rating
Everyone           7420
Teen               1084
Mature 17+          461
Everyone 10+        397
Adults only 18+       3
Unrated               1
Name: count, dtype: int64


In [19]:
df = df[df['Content Rating'] != 'Unrated']


In [20]:
content_rating = pd.get_dummies(df['Content Rating'], prefix='Content Rating')

df = pd.concat([df, content_rating], axis=1)

df.drop('Content Rating', axis=1, inplace=True)


In [21]:
df.head(6)

Unnamed: 0,Rating,Size,Price,Last Updated,Current Ver,Android Ver,Category_ART_AND_DESIGN,Category_AUTO_AND_VEHICLES,Category_BEAUTY,Category_BOOKS_AND_REFERENCE,...,Category_SPORTS,Category_TOOLS,Category_TRAVEL_AND_LOCAL,Category_VIDEO_PLAYERS,Category_WEATHER,Content Rating_Adults only 18+,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen
0,4.1,19000000.0,0.0,"January 7, 2018",1.0.0,4.0.3 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,3.9,14000000.0,0.0,"January 15, 2018",2.0.0,4.0.3 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,4.7,8700000.0,0.0,"August 1, 2018",1.2.4,4.0.3 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,4.5,25000000.0,0.0,"June 8, 2018",Varies with device,4.2 and up,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,4.3,2800000.0,0.0,"June 20, 2018",1.1,4.4 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
5,4.4,5600000.0,0.0,"March 26, 2017",1.0,2.3 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [22]:
num_missing = df['Last Updated'].isna().sum()

print("Number of missing values in 'Last Updated':", num_missing)

Number of missing values in 'Last Updated': 0


In [23]:
print(df['Last Updated'].value_counts())

Last Updated
August 3, 2018      319
August 2, 2018      284
July 31, 2018       279
August 1, 2018      275
July 30, 2018       199
                   ... 
April 17, 2014        1
April 11, 2016        1
October 27, 2015      1
August 31, 2015       1
March 23, 2014        1
Name: count, Length: 1299, dtype: int64


In [24]:
df['Last Updated'] = pd.to_datetime(df['Last Updated'])


In [25]:
print(df['Current Ver'].value_counts())

Current Ver
Varies with device    1415
1.0                    458
1.1                    195
1.2                    126
2.0                    119
                      ... 
2.9.10                   1
3.18.5                   1
1.3.A.2.9                1
9.9.1.1910               1
0.3.4                    1
Name: count, Length: 2638, dtype: int64


In [26]:
df.head()

Unnamed: 0,Rating,Size,Price,Last Updated,Current Ver,Android Ver,Category_ART_AND_DESIGN,Category_AUTO_AND_VEHICLES,Category_BEAUTY,Category_BOOKS_AND_REFERENCE,...,Category_SPORTS,Category_TOOLS,Category_TRAVEL_AND_LOCAL,Category_VIDEO_PLAYERS,Category_WEATHER,Content Rating_Adults only 18+,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen
0,4.1,19000000.0,0.0,2018-01-07,1.0.0,4.0.3 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,3.9,14000000.0,0.0,2018-01-15,2.0.0,4.0.3 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,4.7,8700000.0,0.0,2018-08-01,1.2.4,4.0.3 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,4.5,25000000.0,0.0,2018-06-08,Varies with device,4.2 and up,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,4.3,2800000.0,0.0,2018-06-20,1.1,4.4 and up,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
