# Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm
from statsmodels.formula.api import ols

pd.set_option('display.max_columns', 500)

In [2]:
df = pd.read_csv('cleaned.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10351 entries, 0 to 10350
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      10351 non-null  int64  
 1   App             10351 non-null  object 
 2   Category        10351 non-null  object 
 3   Rating          10351 non-null  float64
 4   Reviews         10351 non-null  float64
 5   Size            10351 non-null  float64
 6   Installs        10351 non-null  int64  
 7   Content Rating  10351 non-null  object 
 8   Genres          10351 non-null  object 
 9   Month           10351 non-null  float64
 10  is_free         10351 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 889.7+ KB


In [4]:
df.drop(['Unnamed: 0', 'Genres', 'App'], axis=1, inplace=True)

Clearly, the data needs a lot of preprocessing to improve the results. Here you can see the relationship between 'Rating' and the independent variables. We can see what variables are continuous(Reviews, Size, and Installs) and categorical(Category, Content Rating, and Month).

In [5]:
df['Month'] = df['Month'].astype('object')

In [6]:
cats = df.select_dtypes(include='object')

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['Content Rating'] = le.fit_transform(df['Content Rating'])

# le = preprocessing.LabelEncoder()
# df['Genres'] = le.fit_transform(df['Genres'])

# le = preprocessing.LabelEncoder()
# df['App'] = le.fit_transform(df['App'])

df = pd.concat([df, pd.get_dummies(df[['Category', 'Month']], prefix=('cat', 'mon'), drop_first=True)], axis=1)
# df = df.drop(columns=['cat_0'], axis=1)

In [8]:
df.columns = df.columns.str.replace(' ', '_').str.replace('+','').str.replace('.0','')

In [9]:
df.drop(columns=['Category', 'Month'], axis = 1, inplace=True)

In [10]:
# cats_dummies = pd.get_dummies(cats, columns=['Category', 'Content Rating', 'Genres', 'Month'], drop_first=True)
# model_df = pd.concat((df, cats_dummies), axis=1)
# model_df.drop(columns=['Category', 'Content Rating', 'Genres', 'Month'], axis = 1, inplace=True)
# model_df

In [11]:
model_df = df.copy()

In [12]:
model_df

Unnamed: 0,Rating,Reviews,Size,Installs,Content_Rating,is_free,cat_AUTO_AND_VEHICLES,cat_BEAUTY,cat_BOOKS_AND_REFERENCE,cat_BUSINESS,cat_COMICS,cat_COMMUNICATION,cat_DATING,cat_EDUCATION,cat_ENTERTAINMENT,cat_EVENTS,cat_FAMILY,cat_FINANCE,cat_FOOD_AND_DRINK,cat_GAME,cat_GAME_ACTION,cat_GAME_ADVENTURE,cat_GAME_ARCADE,cat_GAME_BOARD,cat_GAME_CARD,cat_GAME_CASINO,cat_GAME_CASUAL,cat_GAME_EDUCATIONAL,cat_GAME_MUSIC,cat_GAME_PUZZLE,cat_GAME_RACING,cat_GAME_ROLE_PLAYING,cat_GAME_SIMULATION,cat_GAME_SPORTS,cat_GAME_STRATEGY,cat_GAME_TRIVIA,cat_GAME_WORD,cat_HEALTH_AND_FITNESS,cat_HOUSE_AND_HOME,cat_LIBRARIES_AND_DEMO,cat_LIFESTYLE,cat_MAPS_AND_NAVIGATION,cat_MEDICAL,cat_MUSIC_AND_AUDIO,cat_NEWS_AND_MAGAZINES,cat_PARENTING,cat_PERSONALIZATION,cat_PHOTOGRAPHY,cat_PRODUCTIVITY,cat_SHOPPING,cat_SOCIAL,cat_SPORTS,cat_TOOLS,cat_TRAVEL_AND_LOCAL,cat_VIDEO_PLAYERS,cat_WEATHER,mon_2,mon_3,mon_4,mon_5,mon_6,mon_7,mon_8,mon_9,mon_,mon_11,mon_12
0,4.1,159.0,19.0,10000,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3.9,967.0,14.0,500000,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4.7,87510.0,8.7,5000000,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4.5,215644.0,25.0,50000000,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,4.3,967.0,2.8,100000,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10346,4.6,105363.0,31.0,10000000,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
10347,4.7,165252.0,34.0,100000000,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10348,3.9,1406630.0,34.2,100000000,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
10349,4.4,7285.0,25.0,1000000,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [13]:
model_df.columns = model_df.columns.str.replace(' ', '_').str.replace('+','').str.replace('.','_').str.replace('_0','')

# Baseline Model

In [14]:
outcome = 'Rating'
predictors = model_df.drop(['Rating'], axis=1)
pred_sum = '+'.join(predictors.columns)
f = outcome + '~' + pred_sum
model = ols(f, model_df).fit()
print(f'List of predictors: {pred_sum}')
display(model.summary())

List of predictors: Reviews+Size+Installs+Content_Rating+is_free+cat_AUTO_AND_VEHICLES+cat_BEAUTY+cat_BOOKS_AND_REFERENCE+cat_BUSINESS+cat_COMICS+cat_COMMUNICATION+cat_DATING+cat_EDUCATION+cat_ENTERTAINMENT+cat_EVENTS+cat_FAMILY+cat_FINANCE+cat_FOOD_AND_DRINK+cat_GAME+cat_GAME_ACTION+cat_GAME_ADVENTURE+cat_GAME_ARCADE+cat_GAME_BOARD+cat_GAME_CARD+cat_GAME_CASINO+cat_GAME_CASUAL+cat_GAME_EDUCATIONAL+cat_GAME_MUSIC+cat_GAME_PUZZLE+cat_GAME_RACING+cat_GAME_ROLE_PLAYING+cat_GAME_SIMULATION+cat_GAME_SPORTS+cat_GAME_STRATEGY+cat_GAME_TRIVIA+cat_GAME_WORD+cat_HEALTH_AND_FITNESS+cat_HOUSE_AND_HOME+cat_LIBRARIES_AND_DEMO+cat_LIFESTYLE+cat_MAPS_AND_NAVIGATION+cat_MEDICAL+cat_MUSIC_AND_AUDIO+cat_NEWS_AND_MAGAZINES+cat_PARENTING+cat_PERSONALIZATION+cat_PHOTOGRAPHY+cat_PRODUCTIVITY+cat_SHOPPING+cat_SOCIAL+cat_SPORTS+cat_TOOLS+cat_TRAVEL_AND_LOCAL+cat_VIDEO_PLAYERS+cat_WEATHER+mon_2+mon_3+mon_4+mon_5+mon_6+mon_7+mon_8+mon_9+mon_+mon_11+mon_12


0,1,2,3
Dep. Variable:,Rating,R-squared:,0.048
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,7.821
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,6.51e-69
Time:,19:27:42,Log-Likelihood:,-7360.3
No. Observations:,10351,AIC:,14850.0
Df Residuals:,10284,BIC:,15340.0
Df Model:,66,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.3639,0.066,66.265,0.000,4.235,4.493
Reviews,9.59e-09,3.33e-09,2.883,0.004,3.07e-09,1.61e-08
Size,-7.801e-05,5.21e-05,-1.497,0.135,-0.000,2.42e-05
Installs,9.576e-11,1.14e-10,0.843,0.399,-1.27e-10,3.18e-10
Content_Rating,0.0030,0.006,0.537,0.591,-0.008,0.014
is_free,-0.0568,0.017,-3.395,0.001,-0.090,-0.024
cat_AUTO_AND_VEHICLES,-0.1652,0.080,-2.065,0.039,-0.322,-0.008
cat_BEAUTY,-0.0776,0.091,-0.854,0.393,-0.256,0.101
cat_BOOKS_AND_REFERENCE,-0.0163,0.068,-0.238,0.812,-0.150,0.118

0,1,2,3
Omnibus:,4757.044,Durbin-Watson:,1.865
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37637.923
Skew:,-2.039,Prob(JB):,0.0
Kurtosis:,11.404,Cond. No.,5750000000.0


In [None]:
y = model_df['Rating']
X = model_df.drop('Rating', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(len(X_train), len(X_test), len(y_train), len(y_test))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
linreg = LinearRegression()

In [None]:
linreg.fit(X_train, y_train)
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

mse_train = mean_squared_error(y_train, y_hat_train)
mse_test = mean_squared_error(y_test, y_hat_test)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

print('MAE:', mean_absolute_error(y_test, y_hat_test))
print('Train Mean Squared Error:', mse_train)
print('Test Mean Squared Error:', mse_test)
print('Root Mean Squared Error Train:', rmse_train)
print('Root Mean Squared Error Test:', rmse_test)

In [None]:
import scipy.stats as stats
residuals = model.resid
fig = sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True)

In [None]:
plt.hist(residuals, bins='auto');

# Outliers

In [None]:
from numpy import mean
from numpy import std

In [None]:
sns.boxplot(x=model_df['Reviews'])

In [None]:
def outliers(col, df):
    mean = df[col].mean()
    std = df[col].std()
    cut_off = std * 3
    lower = mean - cut_off
    upper = mean + cut_off
    outliers = [x for x in df[col] if x < lower or x > upper]
    return sorted(outliers)

In [None]:
outliers('Reviews', model_df)

In [None]:
model_out = model_df[(model_df['Reviews'] <= 5783441) & (model_df['Reviews'] >= 10)]
model_out.reset_index(drop=True)

In [None]:
sns.boxplot(x=model_df['Installs'])

In [None]:
outliers('Installs', model_out)

In [None]:
model_out = model_out[(model_out['Installs'] < 500000000)]
model_out.reset_index(drop=True)

In [None]:
sns.boxplot(x=model_out['Installs'])

In [None]:
sns.boxplot(x=model_out['Size'])

In [None]:
outliers('Size', model_out)

In [None]:
model_out = model_out[(model_out['Size'] < 120.0)]
model_out.reset_index(drop=True)

In [None]:
sns.boxplot(x=model_out['Rating'])

The median looks to be around 4.3. Anything beyond 3.3 and 5 are outliers. We will use these as our markers.

In [None]:
model_out = model_out[(model_out['Rating'] > 3.3)]
model_out.reset_index(drop=True)

In [None]:
model_out.describe()

# Model - Outliers Fixed

In [None]:
outcome = 'Rating'
predictors = model_out.drop(['Rating'], axis=1)
pred_sum = '+'.join(predictors.columns)
f = outcome + '~' + pred_sum
model = ols(f, model_out).fit()
print(f'List of predictors: {pred_sum}')
display(model.summary())

In [None]:
y = model_out['Rating']
X = model_out.drop('Rating', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(len(X_train), len(X_test), len(y_train), len(y_test))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
linreg = LinearRegression()

In [None]:
linreg.fit(X_train, y_train)
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

mse_train = mean_squared_error(y_train, y_hat_train)
mse_test = mean_squared_error(y_test, y_hat_test)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

print('MAE:', mean_absolute_error(y_test, y_hat_test))
print('Train Mean Squared Error:', mse_train)
print('Test Mean Squared Error:', mse_test)
print('Root Mean Squared Error Train:', rmse_train)
print('Root Mean Squared Error Test:', rmse_test)

In [None]:
accuracy = linreg.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

In [None]:
import scipy.stats as stats
residuals = model.resid
fig = sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True)

In [None]:
plt.hist(residuals, bins='auto');

# Transform

In [None]:
df_log = model_out.copy()

In [None]:
df_log[['Rating', 'Reviews', 'Size', 'Installs']].hist(figsize=(15,12));

All the continuous variable are very skewed. Major outliers with Reviews and Installs. 

In [None]:
df_log['log_Rating'] = np.log(df_log['Rating'])
df_log[['Rating', 'log_Rating']].hist(figsize=(12,5));

df_log['log_Reviews'] = np.log(df_log['Reviews'])
df_log[['Reviews', 'log_Reviews']].hist(figsize=(12,5));

df_log['log_Size'] = np.log(df_log['Size'])
df_log[['Size', 'log_Size']].hist(figsize=(12,5));

df_log['log_Installs'] = np.log(df_log['Installs'])
df_log[['Installs', 'log_Installs']].hist(figsize=(12,5));

In [None]:
def normalize(feature):
    return (feature - feature.mean()) / feature.std()

In [None]:
# df_log_norm = df_log[['log_Rating']]
df_log_norm = df_log[['log_Rating', 'log_Installs', 'log_Size', 'log_Reviews']]

In [None]:
df_log_norm = df_log_norm.apply(normalize)
df_log_norm.hist(figsize=(10,10));

In [None]:
df_log['Rating'] = df_log_norm['log_Rating']
df_log['log_Installs'] = df_log_norm['log_Installs']
df_log['log_Size'] = df_log_norm['log_Size']
df_log['log_Reviews'] = df_log_norm['log_Reviews']

In [None]:
df_log.info()

In [None]:
df_log = df_log.drop(['Reviews', 'Installs', 'Size'], axis=1)

# Linear Regression

In [None]:
outcome = 'log_Rating'
predictors = df_log.drop(['Rating', 'log_Rating'], axis=1)
pred_sum = '+'.join(predictors.columns)
f = outcome + '~' + pred_sum
model = ols(f, df_log).fit()
print(f'List of predictors: {pred_sum}')
display(model.summary())

In [None]:
y = df_log['log_Rating']
X = df_log.drop(['Rating','log_Rating'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(len(X_train), len(X_test), len(y_train), len(y_test))

In [None]:
linreg.fit(X_train, y_train)
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

mse_train = mean_squared_error(y_train, y_hat_train)
mse_test = mean_squared_error(y_test, y_hat_test)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

print('MAE:', mean_absolute_error(y_test, y_hat_test))
print('Train Mean Squared Error:', mse_train)
print('Test Mean Squared Error:', mse_test)
print('Root Mean Squared Error Train:', rmse_train)
print('Root Mean Squared Error Test:', rmse_test)

In [None]:
accuracy = linreg.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

In [None]:
import scipy.stats as stats
residuals = model.resid
fig = sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True)

In [None]:
plt.hist(residuals, bins='auto');

# KNeighbors Regression

In [None]:
y = model_out['Rating']
X = model_out.drop('Rating', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [None]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Instantiate StandardScaler
scaler = StandardScaler()

# Transform the training and test sets
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

# Convert into a DataFrame
scaled_df_train = pd.DataFrame(scaled_data_train, columns=X.columns)
scaled_df_train.head()

In [None]:
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()

# Instantiate KNeighborsClassifier

# Fit the classifier
knn.fit(scaled_data_train, y_train)

# Predict on the test set
test_preds = knn.predict(scaled_data_test)

In [None]:
accuracy = knn.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

In [None]:
model = KNeighborsRegressor(n_neighbors=200,weights='distance')
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

In [None]:
mse_train = mean_squared_error(y_train, y_hat_train)
mse_test = mean_squared_error(y_test, y_pred)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

print('MAE:', mean_absolute_error(y_test, y_pred))
print('Train Mean Squared Error:', mse_train)
print('Test Mean Squared Error:', mse_test)
print('Root Mean Squared Error Train:', rmse_train)
print('Root Mean Squared Error Test:', rmse_test)

In [None]:
accuracy = model.score(y_test, y_pred)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

# Random Forest