In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns

In [None]:
# Load the raw Google Play Store listing into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
google = pd.read_csv('../src/Data/googleplaystore.csv')

In [None]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes.
google.info()

- Category
    - ART_AND_DESIGN
    - AUTO_AND_VEHICLES
    - BEAUTY
    - BOOKS_AND_REFERENCE
    - BUSINESS
    - COMICS
    - COMMUNICATION
    - DATING
    - EDUCATION
    - ENTERTAINMENT
    - EVENTS
    - FAMILY
    - FINANCE
    - FOOD_AND_DRINK
    - GAME
    - HEALTH_AND_FITNESS
    - HOUSE_AND_HOME
    - LIBRARIES_AND_DEMO
    - LIFESTYLE
    - MAPS_AND_NAVIGATION
    - MEDICAL
    - NEWS_AND_MAGAZINES
    - PARENTING
    - PERSONALIZATION
    - PHOTOGRAPHY
    - PRODUCTIVITY
    - SHOPPING
    - SOCIAL
    - SPORTS
    - TOOLS
    - TRAVEL_AND_LOCAL
    - VIDEO_PLAYERS
    - WEATHER
- Rating
- Reviews
- Size (in kilobytes)
- Installs

### Cleaning Category Variable

In [None]:
# Overwrite the cell at positional row 10472, positional column 1 with 'ART_AND_DESIGN'.
# NOTE(review): hard-coded positions - presumably column 1 is Category and this
# record carries a bad value in the raw CSV; confirm against the source file.
google.iloc[10472,1] = 'ART_AND_DESIGN'

In [None]:
# Store Category with pandas' categorical dtype.
google['Category'] = google['Category'].astype('category')

### Cleaning Size Variable

In [None]:
def size_fix(data):
    '''Convert one row's ``Size`` value into a number of kilobytes.

    Parameters
    ----------
    data : row-like
        Any object exposing a ``Size`` attribute, e.g. a row handed over by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float or None
        Text ending in ``M`` is returned as its number times 1000, text
        ending in ``k`` as its number unchanged. A value that is already
        numeric is passed through as ``float`` so the cleaning cell can be
        re-run. Missing values and text with any other ending give ``None``.

    Raises
    ------
    ValueError
        If the text before the ``M``/``k`` suffix is not a valid number.
    '''
    size = data.Size
    if not isinstance(size, str):
        # Missing (None/NaN) stays missing; an already-converted number is kept.
        return None if pd.isnull(size) else float(size)
    # endswith() is safe on the empty string, unlike indexing with [-1].
    if size.endswith('M'):
        return float(size[:-1]) * 1000
    if size.endswith('k'):
        return float(size[:-1])
    # Unrecognised text: no numeric size can be derived from it.
    return None

In [None]:
# Run size_fix over every row and keep the result as a float64 column.
google['Size'] = google.apply(size_fix, axis=1).astype('float64')

### Cleaning Installs Variable

In [None]:
def installs_fix(data):
    '''Convert one row's ``Installs`` text into an integer count.

    Parameters
    ----------
    data : row-like
        Any object exposing an ``Installs`` string attribute, e.g. a row
        handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    int
        The count with any trailing ``+`` and all ``,`` separators removed,
        so ``'10,000+'`` becomes ``10000`` and ``'0'`` becomes ``0``.

    Raises
    ------
    ValueError
        If the remaining text is not a valid integer.
    '''
    # Strip only an actual trailing '+': blindly dropping the last character
    # would silently turn a value such as '500' into 50.
    return int(data.Installs.rstrip('+').replace(',', ''))

In [None]:
# Replace the Installs text with the integer produced by installs_fix for each row.
google['Installs'] = google.apply(installs_fix, axis=1)

### Cleaning Type Variable

In [None]:
# Overwrite the cell at positional row 9148, positional column 6 with 'Free'.
# NOTE(review): hard-coded positions - presumably column 6 is Type and this
# record has no usable value in the raw CSV; confirm against the source file.
google.iloc[9148, 6] = 'Free'

In [None]:
# Store Type with pandas' categorical dtype.
google['Type'] = google['Type'].astype('category')

### Cleaning Price Variable

In [None]:
def price_fix(data):
    '''Convert one row's ``Price`` text into a float.

    Parameters
    ----------
    data : row-like
        Any object exposing a ``Price`` string attribute, e.g. a row handed
        over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        The price with any leading ``$`` removed, so ``'$4.99'`` becomes
        ``4.99`` and ``'0'`` becomes ``0.0``.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    '''
    # Strip only an actual leading '$': blindly dropping the first character
    # would silently turn a value such as '12.50' into 2.5.
    return float(data.Price.lstrip('$'))

In [None]:
# Replace the Price text with the float produced by price_fix for each row.
google['Price'] = google.apply(price_fix, axis=1)

### Cleaning Content Rating Variable

In [None]:
# Store Content Rating with pandas' categorical dtype.
google['Content Rating'] = google['Content Rating'].astype('category')

### Cleaning Genres Variable

In [None]:
# Overwrite the cell at positional row 10472, positional column 9 with 'Art & Design'.
# NOTE(review): hard-coded positions - presumably column 9 is Genres and this is
# the same record whose Category is patched to 'ART_AND_DESIGN'; confirm.
google.iloc[10472,9] = 'Art & Design'

In [None]:
# Import retained so that any other cell relying on `re` being in scope keeps working.
import re

# Keep only the primary genre, i.e. the text before the first ';'.
# The vectorised .str accessor replaces the row-wise apply and leaves a
# missing value as NaN instead of raising a TypeError.
google['Genres'] = google['Genres'].str.split(';').str[0]

In [None]:
# Store Genres with pandas' categorical dtype.
google['Genres'] = google['Genres'].astype('category')

### Cleaning Last Updated Variable

In [None]:
# Parse 'Last Updated' into datetimes. The explicit format expects
# day-of-month, abbreviated month name and two-digit year separated by
# hyphens (for example 07-Jan-18).
google['Last Updated'] = pd.to_datetime(google['Last Updated'], format = '%d-%b-%y')

### Cleaning Android Version Variable

In [None]:
def android_fix(data):
    '''Extract the leading ``major.minor`` Android version of a row as a float.

    Parameters
    ----------
    data : row-like
        Any mapping-style object with an ``'Android Ver'`` key, e.g. a row
        handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float or None
        ``None`` when the value is missing or equals ``'Varies with device'``;
        otherwise the number at the start of the text, limited to major and
        minor parts (``'4.0.3 and up'`` gives ``4.0``).

    Raises
    ------
    ValueError
        If the text does not start with a number.
    '''
    version = data['Android Ver']
    if pd.isnull(version) or version == 'Varies with device':
        return None
    # Match digits with an optional '.digits' part rather than slicing a fixed
    # three characters, so a two-digit major such as '10.1' parses correctly.
    found = re.match(r'\d+(?:\.\d+)?', version)
    if found is None:
        raise ValueError('could not parse Android version: {!r}'.format(version))
    return float(found.group())

In [None]:
def android_na_fix(data):
    '''Return the row's ``Android Ver``, or the column mean when it is missing.

    The replacement is computed from the module-level ``google`` frame each
    time a missing value is encountered.
    '''
    version = data['Android Ver']
    return google['Android Ver'].mean() if pd.isnull(version) else version

In [None]:
# Replace the Android Ver text with the value android_fix returns for each row.
google['Android Ver'] = google.apply(android_fix, axis = 1 )

In [None]:
# Fill missing Android versions with the column mean.
# fillna computes the mean once and works on the whole column, instead of
# re-evaluating it for every missing row through a row-wise apply.
google['Android Ver'] = google['Android Ver'].fillna(google['Android Ver'].mean())

In [None]:
# Remove the 'Current Ver' column.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps the
# cell safe to re-run once the column has already been dropped.
google = google.drop(columns='Current Ver', errors='ignore')

### Cleaning NAs

In [None]:
def rating_na_fix(data):
    '''Return the row's ``Rating``, or the column mean when it is missing.

    The replacement is computed from the module-level ``google`` frame each
    time a missing value is encountered.
    '''
    rating_value = data.Rating
    return google.Rating.mean() if pd.isnull(rating_value) else rating_value

In [None]:
def size_na_fix(data):
    '''Return the row's ``Size``, or the column mean when it is missing.

    The replacement is computed from the module-level ``google`` frame each
    time a missing value is encountered.
    '''
    size_value = data.Size
    return google.Size.mean() if pd.isnull(size_value) else size_value

In [None]:
# Fill missing ratings with the column mean.
# fillna computes the mean once and works on the whole column, instead of
# re-evaluating it for every missing row through a row-wise apply.
google['Rating'] = google['Rating'].fillna(google['Rating'].mean())

In [None]:
# Fill missing sizes with the column mean.
# fillna computes the mean once and works on the whole column, instead of
# re-evaluating it for every missing row through a row-wise apply.
google['Size'] = google['Size'].fillna(google['Size'].mean())

In [None]:
# Re-inspect dtypes and non-null counts after the cleaning and NA-filling steps.
google.info()

### Coding Categorical Variables

In [None]:
# Remove the 'Genres' column; it is not label-encoded below.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps the
# cell safe to re-run once the column has already been dropped.
google = google.drop(columns='Genres', errors='ignore')

In [None]:
from sklearn import preprocessing

In [None]:
# Label-encode Category: fit the encoder on the fixed list of category names,
# then replace the column with the encoder's output.
cat = preprocessing.LabelEncoder().fit([
    'ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY', 'BOOKS_AND_REFERENCE',
    'BUSINESS', 'COMICS', 'COMMUNICATION', 'DATING', 'EDUCATION',
    'ENTERTAINMENT', 'EVENTS', 'FAMILY', 'FINANCE', 'FOOD_AND_DRINK',
    'GAME', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME', 'LIBRARIES_AND_DEMO',
    'LIFESTYLE', 'MAPS_AND_NAVIGATION', 'MEDICAL', 'NEWS_AND_MAGAZINES',
    'PARENTING', 'PERSONALIZATION', 'PHOTOGRAPHY', 'PRODUCTIVITY',
    'SHOPPING', 'SOCIAL', 'SPORTS', 'TOOLS', 'TRAVEL_AND_LOCAL',
    'VIDEO_PLAYERS', 'WEATHER',
])

google['Category'] = cat.transform(google['Category'])

In [None]:
# Display the distinct labels of the categorical 'Content Rating' column.
google['Content Rating'].cat.categories

In [None]:
# Label-encode Content Rating: fit the encoder on the fixed list of labels,
# then replace the column with the encoder's output.
rating = preprocessing.LabelEncoder().fit([
    'Adults only 18+', 'Everyone', 'Everyone 10+', 'Mature 17+', 'Teen', 'Unrated',
])

google['Content Rating'] = rating.transform(google['Content Rating'])

In [None]:
# Display the distinct labels of the categorical Type column.
google.Type.cat.categories

In [None]:
# Label-encode Type: fit the encoder on the two known labels, then replace the
# column with the encoder's output.
type_app = preprocessing.LabelEncoder().fit(['Free', 'Paid'])

google['Type'] = type_app.transform(google['Type'])

### Getting Data ready for Machine Learning input

In [None]:
# Final check of column dtypes and non-null counts after encoding.
google.info()

In [None]:
# Preview the first rows of the cleaned, encoded frame.
google.head()

In [None]:
# Write the cleaned frame to CSV. The row index is written as well, because
# index=False is not passed.
# NOTE(review): this relative path ('Data/...') differs from the '../src/Data/...'
# location the raw file is read from - confirm the intended output directory.
google.to_csv('Data/google_clean.csv')