In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Google Play Store CSV into the working DataFrame.
google = pd.read_csv(filepath_or_buffer='../src/Data/googleplaystore.csv')

In [4]:
# Show column dtypes and non-null counts of the freshly loaded frame.
google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
App               10841 non-null object
Category          10840 non-null object
Rating            9367 non-null float64
Reviews           10841 non-null int64
Size              10841 non-null object
Installs          10841 non-null object
Type              10840 non-null object
Price             10841 non-null object
Content Rating    10841 non-null object
Genres            10840 non-null object
Last Updated      10841 non-null object
Current Ver       10833 non-null object
Android Ver       10839 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 1.1+ MB


- Category
    - ART_AND_DESIGN
    - AUTO_AND_VEHICLES
    - BEAUTY
    - BOOKS_AND_REFERENCE
    - BUSINESS
    - COMICS
    - COMMUNICATION
    - DATING
    - EDUCATION
    - ENTERTAINMENT
    - EVENTS
    - FAMILY
    - FINANCE
    - FOOD_AND_DRINK
    - GAME
    - HEALTH_AND_FITNESS
    - HOUSE_AND_HOME
    - LIBRARIES_AND_DEMO
    - LIFESTYLE
    - MAPS_AND_NAVIGATION
    - MEDICAL
    - NEWS_AND_MAGAZINES
    - PARENTING
    - PERSONALIZATION
    - PHOTOGRAPHY
    - PRODUCTIVITY
    - SHOPPING
    - SOCIAL
    - SPORTS
    - TOOLS
    - TRAVEL_AND_LOCAL
    - VIDEO_PLAYERS
    - WEATHER
- Rating
- Reviews
- Size (in kilobytes)
- Installs

## Functions used for Data Cleaning

### Cleaning Category Variable

In [5]:
# Store Category as a pandas categorical rather than plain strings.
google['Category'] = google['Category'].astype(dtype='category')

### Cleaning Size Variable

In [6]:
def size_fix(data):
    '''
    Convert one row's textual app size into a number of kilobytes.

    Parameters
    ----------
    data : row-like object exposing a ``Size`` attribute, e.g. the row that
        ``DataFrame.apply(..., axis=1)`` passes in.

    Returns
    -------
    float or None
        A value ending in ``M`` is scaled by 1000; a value ending in ``k`` is
        returned as-is.  Anything else (no recognised unit suffix, an empty
        string, or a missing / non-string entry) yields ``None``.
    '''
    raw = data.Size
    # A missing value arrives as NaN (a float) and an empty string has no last
    # character to inspect; neither can carry a unit suffix.
    if not isinstance(raw, str) or not raw:
        return None
    unit = raw[-1]
    if unit == 'M':
        return float(raw[:-1]) * 1000
    if unit == 'k':
        return float(raw[:-1])
    # No recognised unit suffix.
    return None

In [7]:
# Convert every row's size text with size_fix, then force a float64 column.
google['Size'] = google.apply(size_fix, axis='columns').astype('float64')

### Cleaning Installs Variable

In [8]:
# List the distinct raw Installs labels before converting them to integers.
google['Installs'].astype(dtype='category').cat.categories

Index(['0', '0+', '1+', '1,000+', '1,000,000+', '1,000,000,000+', '10+',
       '10,000+', '10,000,000+', '100+', '100,000+', '100,000,000+', '5+',
       '5,000+', '5,000,000+', '50+', '50,000+', '50,000,000+', '500+',
       '500,000+', '500,000,000+'],
      dtype='object')

In [9]:
def installs_fix(data):
    """Turn one row's install-count label into an integer.

    ``data`` is a row-like object with an ``Installs`` attribute.  The exact
    label ``'0'`` is parsed as-is; for every other label the final character
    is dropped and thousands separators (commas) are removed before parsing,
    e.g. ``'1,000+'`` -> ``1000``.
    """
    label = data.Installs
    # Only the literal '0' lacks the one-character suffix that is cut off.
    digits = label if label == '0' else label[:-1].replace(',', '')
    return int(digits)

In [10]:
# Replace the Installs labels with their integer counts, one row at a time.
google['Installs'] = google.apply(installs_fix, axis='columns')

### Cleaning Type Variable

In [11]:
# Store Type as a pandas categorical rather than plain strings.
google['Type'] = google['Type'].astype(dtype='category')

### Cleaning Price Variable

In [12]:
def price_fix(data):
    """Turn one row's price text into a float.

    ``data`` is a row-like object with a ``Price`` attribute.  The exact text
    ``'0'`` is parsed directly; any other value has its first character
    removed before parsing (presumably a leading currency symbol - confirm
    against the raw data).
    """
    price_text = data.Price
    if price_text != '0':
        # Skip the one-character prefix in front of the number.
        price_text = price_text[1:]
    return float(price_text)

In [13]:
# Replace the Price text with its float value, one row at a time.
google['Price'] = google.apply(price_fix, axis='columns')

### Cleaning Content Rating Variable

In [14]:
# Store Content Rating as a pandas categorical rather than plain strings.
google['Content Rating'] = google['Content Rating'].astype(dtype='category')

### Cleaning Genres Variable

In [15]:
# Hand-patch the Genres value of the row at position 10472 (presumably the
# single row whose Genres is missing - confirm against the raw data).  The
# column is located by name instead of the hard-coded position 9.
google.iloc[10472, google.columns.get_loc('Genres')] = 'Art & Design'

In [16]:
import re  # left in place: other cells may rely on `re` being imported here

# Keep only the primary genre, i.e. the text before the first ';'.  The
# vectorised string accessor replaces a row-wise apply with a regex split.
google['Genres'] = google['Genres'].str.split(';').str[0]

In [17]:
# Store Genres as a pandas categorical rather than plain strings.
google['Genres'] = google['Genres'].astype(dtype='category')

### Cleaning Last Updated Variable

In [18]:
# Parse 'Last Updated' text (day-abbreviated month-two-digit year) as datetimes.
google['Last Updated'] = pd.to_datetime(arg=google['Last Updated'], format='%d-%b-%y')

### Cleaning Current Version Variable

In [19]:
# List the distinct raw 'Current Ver' strings to see which formats occur.
google['Current Ver'].astype(dtype='category').cat.categories

Index(['0.0.0.2', '0.0.1', '0.0.10', '0.0.2', '0.0.3', '0.0.4', '0.0.42',
       '0.0.44', '0.0.5', '0.0.52',
       ...
       'v6.1', 'v6.6.0', 'v6.7.14', 'v67_slk3.0_20180115_01',
       'v7.0.02.3.0516.1_06_0713', 'v7.0.7.1.0625.1_06_0629',
       'v7.0.9.1.0526.1_06_0704', 'v8.0.1.8.0629.1', 'v8[1.0.10]',
       'version 0.994'],
      dtype='object', length=2783)

In [45]:
def curr_ver_fix(data):
    """Collapse a dotted version string into a single run of characters.

    Parameters
    ----------
    data : mapping-like row; only ``data['Current Ver']`` is read.

    Returns
    -------
    str or None
        The version text with every '.' removed (``'1.2.4'`` -> ``'124'``).
        Text without dots is returned unchanged.  ``None`` is returned when
        the value is missing (``None`` / NaN).
    """
    version = data['Current Ver']
    if pd.isnull(version):
        return None
    # Plain str.replace: no regular expression is needed to drop literal dots.
    return version.replace('.', '')

In [46]:
# Strip the dots from every 'Current Ver' entry, one row at a time.
google['Current Ver'] = google.apply(curr_ver_fix, axis='columns')

In [51]:
# Convert 'Current Ver' to numbers; entries that cannot be parsed become NaN.
google['Current Ver'] = pd.to_numeric(arg=google['Current Ver'], errors='coerce')

### Cleaning Android Version Variable

In [26]:
def android_fix(data):
    """Extract the leading ``major[.minor]`` Android version as a float.

    Parameters
    ----------
    data : mapping-like row; only ``data['Android Ver']`` is read.

    Returns
    -------
    float or None
        The number formed by the leading digits and, if present, the first
        ``.digits`` group (``'4.0.3 and up'`` -> ``4.0``, ``'10.12 and up'``
        -> ``10.12``).  ``None`` is returned when the value is missing, equals
        ``'Varies with device'``, or does not start with a number.
    """
    import re  # local import keeps this cell independent of cell order

    raw = data['Android Ver']
    if pd.isnull(raw) or raw == 'Varies with device':
        return None
    # Match the numeric prefix instead of slicing a fixed three characters,
    # so versions with more than one digit per component are read in full.
    match = re.match(r'\s*(\d+(?:\.\d+)?)', str(raw))
    if match is None:
        return None
    return float(match.group(1))

In [27]:
def android_na_fix(data, fill_value=None):
    """Return the row's Android version, substituting a fill value if missing.

    Parameters
    ----------
    data : mapping-like row; only ``data['Android Ver']`` is read.
    fill_value : optional
        Value returned for a missing entry.  When omitted, the mean of
        ``google['Android Ver']`` (the module-level frame) is used.  Passing
        a precomputed value, e.g.
        ``google.apply(android_na_fix, axis=1, fill_value=mean)``, avoids
        recomputing that mean for every missing row.

    Returns
    -------
    The existing value when it is not null, otherwise the fill value.
    """
    current = data['Android Ver']
    if not pd.isnull(current):
        return current
    if fill_value is not None:
        return fill_value
    # Default: fall back to the column mean of the global frame.
    return google['Android Ver'].mean()

In [28]:
# Parse every 'Android Ver' entry into a float (or a missing value).
google['Android Ver'] = google.apply(android_fix, axis='columns')

In [29]:
# Impute missing Android versions with the column mean.  The mean is computed
# once and applied in a single vectorised fill rather than once per missing row.
google['Android Ver'] = google['Android Ver'].fillna(google['Android Ver'].mean())

In [30]:
# Re-check dtypes and non-null counts after the cleaning steps.
google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
App               10841 non-null object
Category          10840 non-null category
Rating            9367 non-null float64
Reviews           10841 non-null int64
Size              9146 non-null float64
Installs          10841 non-null int64
Type              10840 non-null category
Price             10841 non-null float64
Content Rating    10841 non-null category
Genres            10841 non-null category
Last Updated      10841 non-null datetime64[ns]
Current Ver       10833 non-null object
Android Ver       10841 non-null float64
dtypes: category(4), datetime64[ns](1), float64(4), int64(2), object(2)
memory usage: 808.1+ KB


In [31]:
# Preview the first ten rows of the frame.
google.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,100,4.000000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000.0,500000,Free,0.0,Everyone,Art & Design,2018-01-15,200,4.000000
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700.0,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,124,4.000000
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.200000
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800.0,100000,Free,0.0,Everyone,Art & Design,2018-06-20,11,4.400000
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5600.0,50000,Free,0.0,Everyone,Art & Design,2017-03-26,1,2.300000
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19000.0,50000,Free,0.0,Everyone,Art & Design,2018-04-26,11,4.000000
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,29000.0,1000000,Free,0.0,Everyone,Art & Design,2018-06-14,61611,4.200000
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33000.0,1000000,Free,0.0,Everyone,Art & Design,2017-09-20,292,3.000000
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,3100.0,10000,Free,0.0,Everyone,Art & Design,2018-07-03,28,4.000000


## Applying Machine Learning Algorithms

In [None]:
import keras
import tensorflow as tf

In [None]:
import xgboost as xgb