# Machine Learning

In [2]:
import pandas as pd

In [4]:
# Load the cleaned app dataset and preview its first five rows.
DATA_PATH = "./input/data_cleaned_v2.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,APP_NAME,RATING,CATEGORY,COLLECTIONS,RATING_COUNT,1_STAR_RATINGS,2_STAR_RATINGS,3_STAR_RATINGS,4_STAR_RATINGS,5_STAR_RATINGS,...,SIZE,DEVELOPER,DEVELOPER_ADDRESS,COUNTRY,CONTENT_RATING,AD_SUPPORTED,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_UPDATE,DAYS_SINCE_RELEASED
0,"""Ghost Voice Catcher"" AUTO EVP",3.933333,Lifestyle,topselling_paid,68,12,1,6,7,42,...,1100000.0,BIG BEARD Studios,114 1/2 Ash st. Nashua NH 03060,Usa,Everyone,False,False,False,1820,3482
1,"""OXXO""",4.74,Puzzle,topselling_paid,975,23,0,15,94,843,...,81000000.0,Hamster On Coke Games,ul Jugosłowiańska 37F/10\r\n73-110 Stargard Sz...,Poland,Everyone,False,False,False,0,945
2,#DRIVE,4.434152,Racing,topgrossing,229679,9352,4740,15249,47804,152534,...,132000000.0,Pixel Perfect Dude S.A.,ul. Polna 28/6 lok. 4\r\n62-800 Kalisz\r\nPolska,,Everyone,True,True,False,4,1084
3,#SelfCare,4.463476,Role Playing,topselling_free,14932,865,336,1088,1353,11290,...,83000000.0,TRU LUV Inc.,"186-720 King Street West\r\nToronto, Ontario, ...",Canada,Teen,False,True,False,952,1275
4,#open Polyamorous + ENM Dating,2.55,Dating,topselling_free,708,263,163,63,70,149,...,19000000.0,#open,,,Mature 17+,False,True,False,17,1026


In [5]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17469 entries, 0 to 17468
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   APP_NAME             17469 non-null  object 
 1   RATING               17469 non-null  float64
 2   CATEGORY             17469 non-null  object 
 3   COLLECTIONS          17469 non-null  object 
 4   RATING_COUNT         17469 non-null  int64  
 5   1_STAR_RATINGS       17469 non-null  int64  
 6   2_STAR_RATINGS       17469 non-null  int64  
 7   3_STAR_RATINGS       17469 non-null  int64  
 8   4_STAR_RATINGS       17469 non-null  int64  
 9   5_STAR_RATINGS       17469 non-null  int64  
 10  REVIEW_COUNT         17469 non-null  float64
 11  INSTALLS             17469 non-null  object 
 12  MIN_INSTALLS         17469 non-null  int64  
 13  MAX_INSTALLS         17469 non-null  int64  
 14  FREE                 17469 non-null  bool   
 15  PRICE                17469 non-null 

## Model Building 

Identifying features for model building

Categorical: `CATEGORY`, `DEVELOPER`, `CONTENT_RATING`, `AD_SUPPORTED`, `IN_APP_PURCHASES`,  `EDITORS_CHOICE`, `FREE`, `COUNTRY`, `INSTALLS`

Numerical: `DAYS_SINCE_UPDATE`, `DAYS_SINCE_RELEASED`, `RATING`, `RATING_COUNT`, `SIZE`, `PRICE`, `MAX_INSTALLS`, `REVIEW_COUNT`

Features to drop
1. `APP_NAME` - every value is unique, so it is of no use for model building
2. `COLLECTIONS` - the collection the app fell into when it was scraped; it is an indicator assigned by the Google Play Store, not a feature of the app itself
3. `1_STAR_RATINGS`, `2_STAR_RATINGS`, `3_STAR_RATINGS`, `4_STAR_RATINGS`, `5_STAR_RATINGS` - these were used to check the rating count and for visualization purposes, so they are not needed as model features
4. `MIN_INSTALLS` - contains the lower bound of the number of installations; it acts as a benchmark for `INSTALLS`, telling which install category the app falls into
5. `DEVELOPER_ADDRESS` - we are using `COUNTRY` so we can drop the developer's address

In [6]:
# Columns excluded from modelling, as listed under "Features to drop".
cols_to_drop = [
    "APP_NAME",
    "COLLECTIONS",
    "1_STAR_RATINGS",
    "2_STAR_RATINGS",
    "3_STAR_RATINGS",
    "4_STAR_RATINGS",
    "5_STAR_RATINGS",
    "MIN_INSTALLS",
    "DEVELOPER_ADDRESS",
]

# `columns=` already targets the column axis, so no separate `axis` argument
# is needed. `drop` returns a new frame; `df` itself is left unchanged.
model_df = df.drop(columns=cols_to_drop)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17469 entries, 0 to 17468
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   RATING               17469 non-null  float64
 1   CATEGORY             17469 non-null  object 
 2   RATING_COUNT         17469 non-null  int64  
 3   REVIEW_COUNT         17469 non-null  float64
 4   INSTALLS             17469 non-null  object 
 5   MAX_INSTALLS         17469 non-null  int64  
 6   FREE                 17469 non-null  bool   
 7   PRICE                17469 non-null  float64
 8   SIZE                 17469 non-null  float64
 9   DEVELOPER            17469 non-null  object 
 10  COUNTRY              11582 non-null  object 
 11  CONTENT_RATING       17469 non-null  object 
 12  AD_SUPPORTED         17469 non-null  bool   
 13  IN_APP_PURCHASES     17469 non-null  bool   
 14  EDITORS_CHOICE       17469 non-null  bool   
 15  DAYS_SINCE_UPDATE    17469 non-null 

We will first encode the columns with boolean values. There are 4 such columns, so we expect one-hot encoding to generate 4 * 2 = 8 columns.

In [7]:
# Import encoder from sklearn
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the boolean flag columns: each column expands into a
# `<name>_False` and a `<name>_True` indicator column.
boolean_cols = ["FREE", "AD_SUPPORTED", "IN_APP_PURCHASES", "EDITORS_CHOICE"]
model_df_bool = model_df[boolean_cols]

ohe = OneHotEncoder()
# fit_transform learns the categories and encodes them in a single pass; the
# encoder output is converted to a dense array before building the frame.
bool_encoded = ohe.fit_transform(model_df_bool).toarray()

model_df_ohe = pd.DataFrame(
    bool_encoded,
    columns=ohe.get_feature_names_out(model_df_bool.columns),
    # Keep the source row labels so the encoded frame stays aligned with
    # model_df if the two are joined or concatenated later (a fresh default
    # index would silently misalign once rows are filtered or re-indexed).
    index=model_df_bool.index,
)

model_df_ohe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17469 entries, 0 to 17468
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   FREE_False              17469 non-null  float64
 1   FREE_True               17469 non-null  float64
 2   AD_SUPPORTED_False      17469 non-null  float64
 3   AD_SUPPORTED_True       17469 non-null  float64
 4   IN_APP_PURCHASES_False  17469 non-null  float64
 5   IN_APP_PURCHASES_True   17469 non-null  float64
 6   EDITORS_CHOICE_False    17469 non-null  float64
 7   EDITORS_CHOICE_True     17469 non-null  float64
dtypes: float64(8)
memory usage: 1.1 MB
