### Import Necessary Packages

In [1]:
import pandas as pd
import numpy as np

# For data splitting
from sklearn.model_selection import train_test_split
# Import the encoder from sklearn
from sklearn.preprocessing import OneHotEncoder

# For resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTEN

### Read Files

In [2]:
# Load the cleaned app dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="./datasets/google_app_scrap_cleaned2.csv")
df.head(5)

Unnamed: 0,APP_NAME,RATING,CATEGORY,RATING_COUNT,1_STAR_RATINGS,2_STAR_RATINGS,3_STAR_RATINGS,4_STAR_RATINGS,5_STAR_RATINGS,REVIEW_COUNT,...,COUNTRY,CONTENT_RATING,AD_SUPPORTED,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_UPDATE,DAYS_SINCE_RELEASED,INSTALLS_GROUP,PRICEBAND,SIZEBAND
0,"""Ghost Voice Catcher"" AUTO EVP",3.933333,Lifestyle,68,12,1,6,7,42,24.0,...,Usa,Everyone,False,False,False,1820.0,3482.0,Less than 100K,normal,small
1,"""OXXO""",4.74,Game,975,23,0,15,94,843,54.0,...,,Everyone,False,False,False,0.0,945.0,Less than 100K,cheap,medium
2,#DRIVE,4.434152,Game,229679,9352,4740,15249,47804,152534,6473.0,...,,Everyone,True,True,False,4.0,1084.0,Between 100K and 10M,Free,large
3,#SelfCare,4.463476,Game,14932,865,336,1088,1353,11290,1501.0,...,,Teen,False,True,False,952.0,1275.0,Between 100K and 10M,Free,medium
4,#open Polyamorous + ENM Dating,2.55,Dating,708,263,163,63,70,149,362.0,...,,Mature 17+,False,True,False,17.0,1026.0,Less than 100K,Free,small


<a id='model building'></a>
## Model Building

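First, we identify which features in the dataset are usable (`RATING_COUNT` is additionally selected below, but only so that apps can be filtered by the rating-count benchmark; it is dropped afterwards):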
1. `RATING` (num)
2. `CATEGORY` (cat)
3. `INSTALLS_GROUP` (cat)
4. `SIZEBAND` (cat)
5. `FREE` (cat)
6. `PRICEBAND` (cat)
7. `AD_SUPPORTED` (cat)
8. `CONTENT_RATING` (cat)
9. `IN_APP_PURCHASES` (cat)
10. `EDITORS_CHOICE` (cat)


In [3]:
# Columns carried into modelling. RATING_COUNT is only needed for the
# rating-count filter applied in a later cell.
feature_list = [
    "RATING_COUNT",
    "RATING",
    "CATEGORY",
    "INSTALLS_GROUP",
    "SIZEBAND",
    "FREE",
    "PRICEBAND",
    "AD_SUPPORTED",
    "CONTENT_RATING",
    "IN_APP_PURCHASES",
    "EDITORS_CHOICE",
]

# Work on an independent copy so the raw frame stays untouched.
model_df = df.loc[:, feature_list].copy()
model_df.head(5)

Unnamed: 0,RATING_COUNT,RATING,CATEGORY,INSTALLS_GROUP,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE
0,68,3.933333,Lifestyle,Less than 100K,small,False,normal,False,Everyone,False,False
1,975,4.74,Game,Less than 100K,medium,False,cheap,False,Everyone,False,False
2,229679,4.434152,Game,Between 100K and 10M,large,True,Free,True,Everyone,True,False
3,14932,4.463476,Game,Between 100K and 10M,medium,True,Free,False,Teen,True,False
4,708,2.55,Dating,Less than 100K,small,True,Free,False,Mature 17+,True,False


Handling null values

In [4]:
# Count missing values per selected column.
model_df.isna().sum()

RATING_COUNT        0
RATING              0
CATEGORY            0
INSTALLS_GROUP      0
SIZEBAND            0
FREE                0
PRICEBAND           0
AD_SUPPORTED        0
CONTENT_RATING      0
IN_APP_PURCHASES    0
EDITORS_CHOICE      0
dtype: int64

Seems like there are no null values in our dataset :)

Keep only the apps whose rating count is greater than the benchmark of 5000

In [5]:
# Minimum number of ratings an app must exceed to be kept for modelling.
benchmark = 5000

# Rebuild model_df from the untouched source frame so this cell can be re-run
# safely (the previous in-place version failed on a second run because
# RATING_COUNT had already been dropped). The chained, non-inplace calls also
# return a fresh frame instead of mutating a filtered slice.
model_df = (
    df.loc[df["RATING_COUNT"] > benchmark, feature_list]
    .drop(columns=["RATING_COUNT"])  # only needed for the filter above
    .reset_index(drop=True)          # renumber the surviving rows from 0
)
model_df.shape

(9476, 10)

### Data Preprocessing for model

From the EDA, we noticed that `RATING` is skewed towards 4 and 5, so we categorize the rating into low (less than or equal to 3.5), medium (greater than 3.5 and up to 4.6 inclusive) and high (greater than 4.6).

In [6]:
# Categorize: bucket the continuous rating into Low / Medium / High bands.
rating = model_df["RATING"]
band_masks = {
    "Low": rating <= 3.5,
    "Medium": (rating > 3.5) & (rating <= 4.6),
    "High": rating > 4.6,
}
for band, in_band in band_masks.items():
    model_df.loc[in_band, "RATING_GROUP"] = band

# The numeric rating is no longer needed once the band has been assigned.
model_df.drop(columns=["RATING"], inplace=True)
model_df["RATING_GROUP"].value_counts()

Medium    6757
High      2196
Low        523
Name: RATING_GROUP, dtype: int64

For `CATEGORY`, we noticed from the EDA that most apps fall under the "Game" category, so we collapse the category values into either "Game" or "Non Game".

In [7]:
# Keep "Game" as is and fold every other category into "Non Game".
is_game = model_df["CATEGORY"] == "Game"
model_df["CATEGORY"] = model_df["CATEGORY"].where(is_game, "Non Game")
model_df["CATEGORY"].value_counts()

Non Game    5222
Game        4254
Name: CATEGORY, dtype: int64

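For `CONTENT_RATING`, we keep just three groups, Everyone, Teen and Adults: "Mature 17+" and "Adults only 18+" are merged into "Adults", and "Everyone 10+" is merged into "Everyone".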

In [8]:
# Merge the fine-grained store labels into the three groups used for modelling.
content_rating_groups = {
    'Mature 17+': "Adults",
    'Adults only 18+': "Adults",
    'Everyone 10+': "Everyone",
}
model_df['CONTENT_RATING'] = model_df['CONTENT_RATING'].replace(content_rating_groups)
model_df["CONTENT_RATING"].value_counts()

Everyone    7118
Teen        1851
Adults       507
Name: CONTENT_RATING, dtype: int64

**Split the dataset into train and test data**

In [9]:
# Extract Response and Predictors
response = "RATING_GROUP"
y = pd.DataFrame(model_df[response])
X = pd.DataFrame(model_df.drop(response, axis = 1))

# Split the dataset into 75% train and 25% test set
# NOTE(review): the split is not stratified, so the class proportions of the
# imbalanced target in train/test are left to chance - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=20)

# Look the counts up by class label rather than by position: positional integer
# lookups on a label-indexed Series are deprecated in pandas and silently depend
# on the frequency ordering of value_counts(). .get(..., 0) also copes with a
# class that is absent from the training split.
train_counts = y_train[response].value_counts()

print("Low vs Medium vs High Rating Group distribution")
print(train_counts.get("Low", 0), ':', train_counts.get("Medium", 0), ':', train_counts.get("High", 0))

Low vs Medium vs High Rating Group distribution
394 : 5085 : 1628


In [10]:
# Put predictors and response back side by side and renumber the rows from 0.
train_df = pd.concat([X_train, y_train], axis="columns").reset_index(drop=True)
train_df.head(5)

Unnamed: 0,CATEGORY,INSTALLS_GROUP,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,RATING_GROUP
0,Game,Less than 100K,large,False,cheap,False,Teen,False,False,High
1,Game,Between 100K and 10M,medium,True,Free,True,Teen,True,False,Medium
2,Game,Between 100K and 10M,medium,True,Free,False,Everyone,True,False,Medium
3,Game,Between 100K and 10M,large,True,Free,True,Everyone,True,False,Medium
4,Non Game,Between 100K and 10M,medium,True,Free,False,Everyone,False,False,High


### Resampling RATING_GROUP

We noticed that the distribution of `RATING_GROUP` is imbalanced, so we handle it with the oversampling techniques below.
1. Random Oversampling
2. SMOTEN

In [11]:
# Class counts of the response in the training split, before any resampling.
y_train.value_counts()

RATING_GROUP
Medium          5085
High            1628
Low              394
dtype: int64

**Random Oversampling**

We will resample the data to an approximately 1:2:1 ratio of Low : Medium : High (2500 : 5085 : 2500).

In [12]:
# Target class sizes after resampling: "Medium" keeps its training count,
# while "High" and "Low" are each raised to 2500 rows.
strategy = {"Medium": 5085, "High": 2500, "Low": 2500}
over_sampler = RandomOverSampler(sampling_strategy=strategy, random_state=20)
X_over, y_over = over_sampler.fit_resample(X_train, y_train)

# Look the counts up by class label rather than by position: positional integer
# lookups on a label-indexed Series are deprecated in pandas and depend on the
# frequency ordering of value_counts(), which is ambiguous for tied counts.
over_counts = y_over["RATING_GROUP"].value_counts()

print("Low vs Medium vs High Rating Group distribution")
print(over_counts.get("Low", 0), ':', over_counts.get("Medium", 0), ':', over_counts.get("High", 0))

Low vs Medium vs High Rating Group distribution
2500 : 5085 : 2500


In [13]:
# Recombine the randomly oversampled predictors with their response column.
train_oversampled_df = pd.concat([X_over, y_over], axis="columns")
train_oversampled_df.head(5)

Unnamed: 0,CATEGORY,INSTALLS_GROUP,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,RATING_GROUP
0,Game,Less than 100K,large,False,cheap,False,Teen,False,False,High
1,Game,Between 100K and 10M,medium,True,Free,True,Teen,True,False,Medium
2,Game,Between 100K and 10M,medium,True,Free,False,Everyone,True,False,Medium
3,Game,Between 100K and 10M,large,True,Free,True,Everyone,True,False,Medium
4,Non Game,Between 100K and 10M,medium,True,Free,False,Everyone,False,False,High


**SMOTEN Oversampling** 

Since our data are purely categorical, we apply the SMOTEN oversampling technique to resample our data.

In [14]:
# Target class sizes after resampling: "Medium" keeps its training count,
# while "High" and "Low" are each raised to 2500 rows.
strategy = {"Medium": 5085, "High": 2500, "Low": 2500}
smoten_sampler = SMOTEN(sampling_strategy=strategy, random_state=20)
X_smoten, y_smoten = smoten_sampler.fit_resample(X_train, y_train)

# Look the counts up by class label rather than by position: positional integer
# lookups on a label-indexed Series are deprecated in pandas and depend on the
# frequency ordering of value_counts(), which is ambiguous for tied counts.
smoten_counts = y_smoten["RATING_GROUP"].value_counts()

print("Low vs Medium vs High Rating Group distribution")
print(smoten_counts.get("Low", 0), ':', smoten_counts.get("Medium", 0), ':', smoten_counts.get("High", 0))

Low vs Medium vs High Rating Group distribution
2500 : 5085 : 2500


In [15]:
# Recombine the SMOTEN-resampled predictors with their response column.
train_smoten_df = pd.concat([X_smoten, y_smoten], axis="columns")
train_smoten_df.head(5)

Unnamed: 0,CATEGORY,INSTALLS_GROUP,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,RATING_GROUP
0,Game,Less than 100K,large,False,cheap,False,Teen,False,False,High
1,Game,Between 100K and 10M,medium,True,Free,True,Teen,True,False,Medium
2,Game,Between 100K and 10M,medium,True,Free,False,Everyone,True,False,Medium
3,Game,Between 100K and 10M,large,True,Free,True,Everyone,True,False,Medium
4,Non Game,Between 100K and 10M,medium,True,Free,False,Everyone,False,False,High


**Combining test_df**

In [16]:
# Put the held-out predictors and response side by side and renumber the rows from 0.
test_df = pd.concat([X_test, y_test], axis="columns").reset_index(drop=True)
test_df.head(5)

Unnamed: 0,CATEGORY,INSTALLS_GROUP,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,RATING_GROUP
0,Non Game,Between 100K and 10M,small,True,Free,False,Everyone,False,False,Medium
1,Non Game,More than 10M,medium,True,Free,True,Everyone,True,False,Medium
2,Non Game,Less than 100K,large,True,Free,False,Everyone,True,False,Medium
3,Game,Between 100K and 10M,large,True,Free,False,Adults,True,False,Medium
4,Game,Between 100K and 10M,large,True,Free,True,Everyone,True,False,Medium


#### Encoding categorical values

Before we continue, our categorical variables are further divided into 2 types —
- Ordinal categorical variables (the True/False flags are grouped here as well and are encoded as 0/1) - `INSTALLS_GROUP`, `SIZEBAND`, `PRICEBAND`, `RATING_GROUP`,`FREE`, `AD_SUPPORTED`, `IN_APP_PURCHASES`, `EDITORS_CHOICE`
- Nominal categorical variable - `CATEGORY`,  `CONTENT_RATING`

In [17]:
# Columns that get one-hot encoded (no inherent order).
nominal_cat = [
    "CONTENT_RATING",
    "CATEGORY",
]

# Columns that get mapped to integer codes (ordered levels and True/False flags).
ordinal_cat = [
    "INSTALLS_GROUP",
    "SIZEBAND",
    "PRICEBAND",
    "RATING_GROUP",
    "FREE",
    "AD_SUPPORTED",
    "IN_APP_PURCHASES",
    "EDITORS_CHOICE",
]

**Function for handling Ordinal Categorical Variables**

In [18]:
def encode_ord_cat(df):
    """Return a copy of ``df`` with the ordinal and boolean columns integer-coded.

    The columns INSTALLS_GROUP, PRICEBAND, SIZEBAND, RATING_GROUP, FREE,
    AD_SUPPORTED, IN_APP_PURCHASES and EDITORS_CHOICE must all be present.
    Each is replaced by the integer code listed below; any value that has no
    entry in its mapping becomes NaN. All other columns are passed through
    unchanged, and the input frame itself is not modified.
    """
    # Shared by the four True/False flag columns.
    flag_codes = {False: 0, True: 1}

    # Column name -> {original value: integer code}, in processing order.
    codes_by_column = {
        "INSTALLS_GROUP": {"Less than 100K": 0, "Between 100K and 10M": 1, "More than 10M": 2},
        "PRICEBAND": {"Free": 0, "cheap": 1, "normal": 2, "expensive": 3},
        "SIZEBAND": {"small": 0, "medium": 1, "large": 2},
        "RATING_GROUP": {"Low": 0, "Medium": 1, "High": 2},
        "FREE": flag_codes,
        "AD_SUPPORTED": flag_codes,
        "IN_APP_PURCHASES": flag_codes,
        "EDITORS_CHOICE": flag_codes,
    }

    encoded = df.copy()
    for column, codes in codes_by_column.items():
        encoded[column] = encoded[column].map(codes)

    return encoded

**Function for handling Nominal Categorical Variables**

In [19]:
def encode_norm_cat(df):
    """One-hot encode the nominal columns of ``df``.

    The columns named in the module-level ``nominal_cat`` list are selected
    from ``df`` and expanded into one indicator column per category value,
    named ``<column>_<value>``.

    The result keeps the row index of ``df``. Previously it was returned with
    a fresh 0..n-1 index, so concatenating it column-wise with other frames
    derived from ``df`` misaligned rows whenever ``df`` had any other index.

    NOTE(review): a new encoder is fitted on every call, so the output columns
    reflect only the categories present in ``df``. Frames encoded separately
    (e.g. train and test) end up with different columns if one of them lacks
    a category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``nominal_cat``.

    Returns
    -------
    pandas.DataFrame
        Indicator columns only, indexed like ``df``.
    """
    model_df_cat = df[nominal_cat]
    ohe = OneHotEncoder()
    # The encoder returns a sparse matrix; densify it for the DataFrame.
    indicator_values = ohe.fit_transform(model_df_cat).toarray()
    model_df_cat_ohe = pd.DataFrame(indicator_values,
                                    columns=ohe.get_feature_names_out(model_df_cat.columns),
                                    index=df.index)
    return model_df_cat_ohe

**Encode all datasets**

In [20]:
# Encode the plain training split: one-hot block for the nominal columns,
# integer codes for the ordinal ones.
model_df_norm = encode_norm_cat(train_df)
model_df_ord = encode_ord_cat(train_df).loc[:, ordinal_cat]

# Combine all features (one-hot columns first), keeping the ordinal frame's row order
train_df_encoded = pd.concat([model_df_norm, model_df_ord], axis=1, sort=False)
train_df_encoded = train_df_encoded.reindex(index=model_df_ord.index)
train_df_encoded.head(5)

Unnamed: 0,CONTENT_RATING_Adults,CONTENT_RATING_Everyone,CONTENT_RATING_Teen,CATEGORY_Game,CATEGORY_Non Game,INSTALLS_GROUP,SIZEBAND,PRICEBAND,RATING_GROUP,FREE,AD_SUPPORTED,IN_APP_PURCHASES,EDITORS_CHOICE
0,0.0,0.0,1.0,1.0,0.0,0,2,1,2,0,0,0,0
1,0.0,0.0,1.0,1.0,0.0,1,1,0,1,1,1,1,0
2,0.0,1.0,0.0,1.0,0.0,1,1,0,1,1,0,1,0
3,0.0,1.0,0.0,1.0,0.0,1,2,0,1,1,1,1,0
4,0.0,1.0,0.0,0.0,1.0,1,1,0,2,1,0,0,0


In [21]:
# Encode the randomly oversampled training split the same way.
model_df_norm = encode_norm_cat(train_oversampled_df)
model_df_ord = encode_ord_cat(train_oversampled_df).loc[:, ordinal_cat]

# One-hot columns first, then the ordinal codes, keeping the ordinal frame's row order
train_df_oversampled_encoded = pd.concat([model_df_norm, model_df_ord], axis=1, sort=False)
train_df_oversampled_encoded = train_df_oversampled_encoded.reindex(index=model_df_ord.index)
train_df_oversampled_encoded.head(5)

Unnamed: 0,CONTENT_RATING_Adults,CONTENT_RATING_Everyone,CONTENT_RATING_Teen,CATEGORY_Game,CATEGORY_Non Game,INSTALLS_GROUP,SIZEBAND,PRICEBAND,RATING_GROUP,FREE,AD_SUPPORTED,IN_APP_PURCHASES,EDITORS_CHOICE
0,0.0,0.0,1.0,1.0,0.0,0,2,1,2,0,0,0,0
1,0.0,0.0,1.0,1.0,0.0,1,1,0,1,1,1,1,0
2,0.0,1.0,0.0,1.0,0.0,1,1,0,1,1,0,1,0
3,0.0,1.0,0.0,1.0,0.0,1,2,0,1,1,1,1,0
4,0.0,1.0,0.0,0.0,1.0,1,1,0,2,1,0,0,0


In [22]:
# Encode the SMOTEN-resampled training split the same way.
model_df_norm = encode_norm_cat(train_smoten_df)
model_df_ord = encode_ord_cat(train_smoten_df).loc[:, ordinal_cat]

# One-hot columns first, then the ordinal codes, keeping the ordinal frame's row order
train_df_smoten_encoded = pd.concat([model_df_norm, model_df_ord], axis=1, sort=False)
train_df_smoten_encoded = train_df_smoten_encoded.reindex(index=model_df_ord.index)
train_df_smoten_encoded.head(5)

Unnamed: 0,CONTENT_RATING_Adults,CONTENT_RATING_Everyone,CONTENT_RATING_Teen,CATEGORY_Game,CATEGORY_Non Game,INSTALLS_GROUP,SIZEBAND,PRICEBAND,RATING_GROUP,FREE,AD_SUPPORTED,IN_APP_PURCHASES,EDITORS_CHOICE
0,0.0,0.0,1.0,1.0,0.0,0,2,1,2,0,0,0,0
1,0.0,0.0,1.0,1.0,0.0,1,1,0,1,1,1,1,0
2,0.0,1.0,0.0,1.0,0.0,1,1,0,1,1,0,1,0
3,0.0,1.0,0.0,1.0,0.0,1,2,0,1,1,1,1,0
4,0.0,1.0,0.0,0.0,1.0,1,1,0,2,1,0,0,0


In [23]:
# Encode the held-out test split the same way.
model_df_norm = encode_norm_cat(test_df)
model_df_ord = encode_ord_cat(test_df).loc[:, ordinal_cat]

# One-hot columns first, then the ordinal codes, keeping the ordinal frame's row order
test_df_encoded = pd.concat([model_df_norm, model_df_ord], axis=1, sort=False)
test_df_encoded = test_df_encoded.reindex(index=model_df_ord.index)
test_df_encoded.head(5)

Unnamed: 0,CONTENT_RATING_Adults,CONTENT_RATING_Everyone,CONTENT_RATING_Teen,CATEGORY_Game,CATEGORY_Non Game,INSTALLS_GROUP,SIZEBAND,PRICEBAND,RATING_GROUP,FREE,AD_SUPPORTED,IN_APP_PURCHASES,EDITORS_CHOICE
0,0.0,1.0,0.0,0.0,1.0,1,0,0,1,1,0,0,0
1,0.0,1.0,0.0,0.0,1.0,2,1,0,1,1,1,1,0
2,0.0,1.0,0.0,0.0,1.0,0,2,0,1,1,0,1,0
3,1.0,0.0,0.0,1.0,0.0,1,2,0,1,1,0,1,0
4,0.0,1.0,0.0,1.0,0.0,1,2,0,1,1,1,1,0


### Export the files

In [24]:
# Destination path -> encoded frame, written in this order.
exports = {
    "./datasets/train.csv": train_df_encoded,
    "./datasets/train_oversampled.csv": train_df_oversampled_encoded,
    "./datasets/train_smoten.csv": train_df_smoten_encoded,
    "./datasets/test.csv": test_df_encoded,
}

# Write each split as CSV without the row index column.
for file_path, encoded_df in exports.items():
    encoded_df.to_csv(path_or_buf=file_path, index=False)