In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw Google Play Store export into the working DataFrame.
google = pd.read_csv(filepath_or_buffer='../src/Data/googleplaystore.csv')

In [3]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
App               10841 non-null object
Category          10840 non-null object
Rating            9367 non-null float64
Reviews           10841 non-null int64
Size              10841 non-null object
Installs          10841 non-null object
Type              10840 non-null object
Price             10841 non-null object
Content Rating    10841 non-null object
Genres            10840 non-null object
Last Updated      10841 non-null object
Current Ver       10833 non-null object
Android Ver       10839 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 1.1+ MB


- Category
    - ART_AND_DESIGN
    - AUTO_AND_VEHICLES
    - BEAUTY
    - BOOKS_AND_REFERENCE
    - BUSINESS
    - COMICS
    - COMMUNICATION
    - DATING
    - EDUCATION
    - ENTERTAINMENT
    - EVENTS
    - FAMILY
    - FINANCE
    - FOOD_AND_DRINK
    - GAME
    - HEALTH_AND_FITNESS
    - HOUSE_AND_HOME
    - LIBRARIES_AND_DEMO
    - LIFESTYLE
    - MAPS_AND_NAVIGATION
    - MEDICAL
    - NEWS_AND_MAGAZINES
    - PARENTING
    - PERSONALIZATION
    - PHOTOGRAPHY
    - PRODUCTIVITY
    - SHOPPING
    - SOCIAL
    - SPORTS
    - TOOLS
    - TRAVEL_AND_LOCAL
    - VIDEO_PLAYERS
    - WEATHER
- Rating
- Reviews
- Size (in kilobytes)
- Installs

### Cleaning Category Variable

In [4]:
# Patch the Category of row 10472 by label instead of by column position
# (the frame has a default RangeIndex, so label 10472 is also position 10472).
google.loc[10472, 'Category'] = 'ART_AND_DESIGN'

In [5]:
# Store Category as a pandas categorical.
google['Category'] = google['Category'].astype('category')

### Cleaning Size Variable

In [6]:
def size_fix(data):
    """Convert one row's ``Size`` string into a size in kilobytes.

    Parameters
    ----------
    data : row-like object exposing a ``Size`` attribute
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        ``'<number>M'`` is scaled by 1000, ``'<number>k'`` is returned as is.
        Anything else (for example ``'Varies with device'``, an empty string
        or a missing value) yields ``nan`` so the caller can impute it later.
    """
    size = data.Size
    # Non-string values (e.g. NaN) and empty strings cannot carry a unit suffix.
    if not isinstance(size, str):
        return float('nan')
    size = size.strip()
    if not size:
        return float('nan')

    unit = size[-1]
    if unit == 'M':
        return float(size[:-1]) * 1000
    if unit == 'k':
        return float(size[:-1])
    # Unknown suffix: make the "missing" result explicit instead of an
    # implicit None fall-through.
    return float('nan')

In [7]:
# Parse every Size string row by row, then force a float64 column.
google['Size'] = google.apply(size_fix, axis=1).astype('float64')

### Cleaning Installs Variable

In [8]:
def installs_fix(data):
    """Convert one row's ``Installs`` string (e.g. ``'10,000+'``) to an int.

    Parameters
    ----------
    data : row-like object exposing an ``Installs`` attribute
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    int
        The install count with thousands separators and the trailing ``'+'``
        removed.

    Raises
    ------
    ValueError
        If the remaining text is not an integer.
    """
    # Only strip a '+' that is actually present; blindly dropping the last
    # character would turn a plain '500' into 50.
    digits = data.Installs.strip().rstrip('+').replace(',', '')
    return int(digits)

In [9]:
# Replace the Installs strings with their integer counts.
google['Installs'] = google.apply(installs_fix, axis=1)

### Cleaning Type Variable

In [10]:
# Patch the Type of row 9148 by label instead of by column position
# (the frame has a default RangeIndex, so label 9148 is also position 9148).
google.loc[9148, 'Type'] = 'Free'

In [11]:
# Store Type as a pandas categorical.
google['Type'] = google['Type'].astype('category')

### Cleaning Price Variable

In [12]:
def price_fix(data):
    """Convert one row's ``Price`` string (e.g. ``'$4.99'`` or ``'0'``) to a float.

    Parameters
    ----------
    data : row-like object exposing a ``Price`` attribute
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        The price with any leading ``'$'`` removed.

    Raises
    ------
    ValueError
        If the remaining text is not a number.
    """
    # Only strip a '$' that is actually present; blindly dropping the first
    # character would turn a plain '4.99' into 0.99.
    amount = data.Price.strip().lstrip('$')
    return float(amount)

In [13]:
# Replace the Price strings with float amounts.
google['Price'] = google.apply(price_fix, axis=1)

### Cleaning Content Rating Variable

In [14]:
# Store Content Rating as a pandas categorical.
google['Content Rating'] = google['Content Rating'].astype(dtype='category')

### Cleaning Genres Variable

In [15]:
# Patch the Genres of row 10472 by label instead of by column position
# (the frame has a default RangeIndex, so label 10472 is also position 10472).
google.loc[10472, 'Genres'] = 'Art & Design'

In [16]:
import re
# Keep only the primary genre: the text before the first ';'.
# The vectorised string accessor replaces the row-wise apply + regex split
# on a literal separator, and passes missing values through as NaN instead
# of raising.
google.Genres = google.Genres.str.split(';').str[0]

In [17]:
# Store Genres as a pandas categorical.
google['Genres'] = google['Genres'].astype('category')

### Cleaning Last Updated Variable

In [18]:
# Parse 'Last Updated' using the day-abbreviated month-two digit year layout.
google['Last Updated'] = pd.to_datetime(
    google['Last Updated'],
    format='%d-%b-%y',
)

### Cleaning Android Version Variable

In [19]:
def android_fix(data):
    """Turn one row's 'Android Ver' text into a float version number.

    Missing values and the literal 'Varies with device' map to ``None``;
    otherwise the first three characters of the text are parsed as a float.
    """
    version = data['Android Ver']
    if pd.isnull(version) or version == 'Varies with device':
        return None
    return float(version[:3])

In [20]:
def android_na_fix(data):
    """Return the row's 'Android Ver', or the column mean when it is missing.

    The mean is read from the module-level ``google`` frame at call time.
    """
    version = data['Android Ver']
    return google['Android Ver'].mean() if pd.isnull(version) else version

In [21]:
# Parse the Android version text of every row.
google['Android Ver'] = google.apply(func=android_fix, axis=1)

In [22]:
# Fill the remaining missing Android versions with the column mean.
google['Android Ver'] = google.apply(func=android_na_fix, axis=1)

In [23]:
# Remove the 'Current Ver' column from the working frame, in place.
google.drop(columns='Current Ver', inplace=True)

### Cleaning NAs

In [24]:
def rating_na_fix(data):
    """Return the row's Rating, or the column mean when it is missing.

    The mean is read from the module-level ``google`` frame at call time.
    """
    rating_value = data.Rating
    return google.Rating.mean() if pd.isnull(rating_value) else rating_value

In [25]:
def size_na_fix(data):
    """Return the row's Size, or the column mean when it is missing.

    The mean is read from the module-level ``google`` frame at call time.
    """
    size_value = data.Size
    return google.Size.mean() if pd.isnull(size_value) else size_value

In [26]:
# Fill missing ratings with the column mean.
google['Rating'] = google.apply(rating_na_fix, axis=1)

In [27]:
# Fill missing sizes with the column mean.
google['Size'] = google.apply(size_na_fix, axis=1)

In [28]:
# Re-check dtypes and non-null counts after the cleaning steps.
google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 12 columns):
App               10841 non-null object
Category          10841 non-null category
Rating            10841 non-null float64
Reviews           10841 non-null int64
Size              10841 non-null float64
Installs          10841 non-null int64
Type              10841 non-null category
Price             10841 non-null float64
Content Rating    10841 non-null category
Genres            10841 non-null category
Last Updated      10841 non-null datetime64[ns]
Android Ver       10841 non-null float64
dtypes: category(4), datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 723.4+ KB


### Coding Categorical Variables

In [29]:
# Remove the Genres column from the working frame, in place.
google.drop(columns='Genres', inplace=True)

In [30]:
from sklearn import preprocessing

In [31]:
# Fit a label encoder on the fixed list of store categories, then replace
# the Category column with the resulting integer codes.
cat = preprocessing.LabelEncoder().fit([
    'ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY', 'BOOKS_AND_REFERENCE',
    'BUSINESS', 'COMICS', 'COMMUNICATION', 'DATING', 'EDUCATION',
    'ENTERTAINMENT', 'EVENTS', 'FAMILY', 'FINANCE', 'FOOD_AND_DRINK',
    'GAME', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME', 'LIBRARIES_AND_DEMO',
    'LIFESTYLE', 'MAPS_AND_NAVIGATION', 'MEDICAL', 'NEWS_AND_MAGAZINES',
    'PARENTING', 'PERSONALIZATION', 'PHOTOGRAPHY', 'PRODUCTIVITY',
    'SHOPPING', 'SOCIAL', 'SPORTS', 'TOOLS', 'TRAVEL_AND_LOCAL',
    'VIDEO_PLAYERS', 'WEATHER',
])

google['Category'] = cat.transform(google['Category'])

In [32]:
# Show the distinct Content Rating categories.
google['Content Rating'].cat.categories

Index(['Adults only 18+', 'Everyone', 'Everyone 10+', 'Mature 17+', 'Teen',
       'Unrated'],
      dtype='object')

In [33]:
# Fit a label encoder on the six content-rating labels, then replace the
# Content Rating column with the resulting integer codes.
rating = preprocessing.LabelEncoder().fit(
    ['Adults only 18+', 'Everyone', 'Everyone 10+', 'Mature 17+', 'Teen', 'Unrated']
)

google['Content Rating'] = rating.transform(google['Content Rating'])

In [34]:
# Show the distinct Type categories.
google.Type.cat.categories

Index(['Free', 'Paid'], dtype='object')

In [35]:
# Fit a label encoder on the two Type labels, then replace the Type column
# with the resulting integer codes.
type_app = preprocessing.LabelEncoder().fit(['Free', 'Paid'])

google['Type'] = type_app.transform(google['Type'])

### Getting Data ready for Machine Learning input

In [36]:
# Confirm that every remaining column except App is numeric or datetime.
google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 11 columns):
App               10841 non-null object
Category          10841 non-null int32
Rating            10841 non-null float64
Reviews           10841 non-null int64
Size              10841 non-null float64
Installs          10841 non-null int64
Type              10841 non-null int32
Price             10841 non-null float64
Content Rating    10841 non-null int32
Last Updated      10841 non-null datetime64[ns]
Android Ver       10841 non-null float64
dtypes: datetime64[ns](1), float64(4), int32(3), int64(2), object(1)
memory usage: 804.7+ KB


In [75]:
# Preview the first rows of the cleaned, encoded frame.
# NOTE(review): the saved output shows rows 0-9 while head() returns five
# rows by default -- the output looks stale; re-run to confirm.
google.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Last Updated,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,0,4.100000,159,19000.000000,10000,0,0.0,1,2018-01-07,4.000000
1,Coloring book moana,0,3.900000,967,14000.000000,500000,0,0.0,1,2018-01-15,4.000000
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",0,4.700000,87510,8700.000000,5000000,0,0.0,1,2018-08-01,4.000000
3,Sketch - Draw & Paint,0,4.500000,215644,25000.000000,50000000,0,0.0,4,2018-06-08,4.200000
4,Pixel Draw - Number Art Coloring Book,0,4.300000,967,2800.000000,100000,0,0.0,1,2018-06-20,4.400000
5,Paper flowers instructions,0,4.400000,167,5600.000000,50000,0,0.0,1,2017-03-26,2.300000
6,Smoke Effect Photo Maker - Smoke Editor,0,3.800000,178,19000.000000,50000,0,0.0,1,2018-04-26,4.000000
7,Infinite Painter,0,4.100000,36815,29000.000000,1000000,0,0.0,1,2018-06-14,4.200000
8,Garden Coloring Book,0,4.400000,13791,33000.000000,1000000,0,0.0,1,2017-09-20,3.000000
9,Kids Paint Free - Drawing Fun,0,4.700000,121,3100.000000,10000,0,0.0,1,2018-07-03,4.000000


In [None]:
# Persist the cleaned frame as an Excel workbook.
google.to_excel(excel_writer='../Data/googleclean.xlsx')

### Splitting into Train and Test Data

In [126]:
# Feature matrix: every column except the app name, the update timestamp and
# the target itself. 'Rating' must be dropped here -- leaving it in `x` leaks
# the label into the features and makes any reported error meaningless.
x = google.drop(['App', 'Last Updated', 'Rating'], axis = 1).values
# Target vector: the app rating to be predicted.
y = google['Rating']

In [56]:
from sklearn.model_selection import train_test_split

In [134]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=123,
)

In [97]:
# Column names intended as model inputs.
predictors = [
    'Category',
    'Reviews',
    'Size',
    'Type',
    'Price',
    'Content Rating',
    'Android Ver',
]
# Column name of the value to predict.
target = ['Rating']

## Applying XGBoost

In [49]:
import keras
import tensorflow as tf

Using TensorFlow backend.


In [112]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import math

In [None]:
# Hyper-parameter grid for the XGBoost search: each key maps to the list of
# candidate values to try.
param = dict(
    max_depth=[1, 3, 5, 7],
    learning_rate=[0.05, 0.1, 0.2, 0.3],
    n_estimators=[50, 100, 150],
    booster=['gbtree', 'gblinear', 'dart'],
    min_child_weight=[1, 2, 3],
    gamma=[0, 1, 2],
    subsample=[0.5, 0.75, 1],
    colsample_bytree=[0.5, 0.75, 1],
)

In [138]:
def xgb_tune(param, train_data, predictors, n = 5):
    """Grid-search ``XGBRegressor`` hyper-parameters with cross-validation.

    Parameters
    ----------
    param : dict
        Parameter grid handed to ``GridSearchCV``.
    train_data : array-like
        Training feature matrix, passed as ``X`` to ``fit``.
    predictors : array-like
        Training target values, passed as ``y`` to ``fit`` (despite the
        name, this is the target, not the feature list).
    n : int, default 5
        Value forwarded as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object; inspect ``.cv_results_`` and
        ``.best_params_`` on it.
    """
    xgb_search = GridSearchCV(xgb.XGBRegressor(), param, cv = n)
    xgb_search.fit(train_data, predictors)

    # Fitted attributes carry a trailing underscore (`cv_results_`); the
    # former bare `xgb_search.cv_results` raised AttributeError and nothing
    # was returned, so the search outcome was lost.
    return xgb_search
    #xgb_search.best_params

In [None]:
# Run the hyper-parameter search on the training split.
xgb_tune(param, train_data=x_train, predictors=y_train)

In [None]:
# Baseline: fit an XGBRegressor with default hyper-parameters and report the
# root-mean-squared error on the held-out test split.
xgb_model = xgb.XGBRegressor()
# `eval_metric` is no longer passed to fit(): the argument is deprecated in
# newer XGBoost releases and later removed from fit(), and without an
# `eval_set` it never influenced training here.
xgb_model.fit(x_train, y_train)
# Note: despite the name, these are predictions for the test split.
train_predict = xgb_model.predict(x_test)

print(('\nModel Results: {}').format(math.sqrt(metrics.mean_squared_error(y_test, train_predict))))