### Import Neccessary Packages

In [1]:
import pandas as pd
import numpy as np

# For data splitting
from sklearn.model_selection import train_test_split
# Import the encoder from sklearn
from sklearn.preprocessing import OneHotEncoder

# For resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTEN

### Read Files

In [2]:
df = pd.read_csv("./datasets/google_app_scrap_cleaned2.csv")
df.head()

Unnamed: 0,APP_NAME,RATING,CATEGORY,RATING_COUNT,1_STAR_RATINGS,2_STAR_RATINGS,3_STAR_RATINGS,4_STAR_RATINGS,5_STAR_RATINGS,REVIEW_COUNT,...,DAYS_SINCE_RELEASED,SIZEBAND,PRICEBAND,REVIEW_PER_DAY,REVIEW_RATE,RATING_PER_DAY,RATING_RATE,DAYS_SINCE_RELEASED_RANGE,DAYS_SINCE_UPDATE_RANGE,INSTALLS_GROUP
0,100 Mystery Buttons,4.186567,Game,28291,3323,1051,2162,2217,19538,486.0,...,218.0,very large,Free,2.229358,More than once per day,129.775229,More than 50 per day,More than 6 months,3 to 6 months,Between 100K and 10M
1,100 Years - Life Simulator,3.931035,Game,80305,14570,3003,5536,7466,49730,2821.0,...,307.0,very large,Free,9.188925,More than once per day,261.579805,More than 50 per day,More than 6 months,Within 1 month,More than 10M
2,1000 Hours Outside,2.357143,Lifestyle,134,71,16,8,9,30,75.0,...,132.0,medium,cheap,0.568182,High,1.015152,Medium,3 to 6 months,Within 1 month,Less than 100K
3,1000 Pics Quiz,4.558824,Game,2629,181,46,46,181,2175,166.0,...,363.0,medium,Free,0.4573,High,7.242424,Medium,More than 6 months,More than 6 months,Between 100K and 10M
4,10X Fire GFX Sensitivity Tool,4.753087,Tools,2217,54,54,27,109,1973,57.0,...,225.0,small,cheap,0.253333,Medium,9.853333,Medium,More than 6 months,Few days ago,Less than 100K


<a id='model building'></a>
## Model Building

First we identify the which features are usable in the dataset:

1. `RATING_COUNT` (num)
2. `RATING` (num)
3. `CATEGORY` (cat)
4. `INSTALLS_GROUP` (cat)
5. `SIZEBAND` (cat)
6. `FREE` (cat)
7. `PRICEBAND` (cat)
8. `AD_SUPPORTED` (cat)
9. `CONTENT_RATING` (cat)
10. `IN_APP_PURCHASES` (cat)
11. `EDITORS_CHOICE` (cat)
12. `DAYS_SINCE_UPDATE` (num)
13. `DAYS_SINCE_RELEASED` (num)


In [3]:
feature_list = ["RATING_COUNT", "RATING_RATE", "REVIEW_RATE", "CATEGORY", "INSTALLS", "SIZEBAND", "FREE", "PRICEBAND", "AD_SUPPORTED", 
                "CONTENT_RATING", "IN_APP_PURCHASES", "EDITORS_CHOICE", "DAYS_SINCE_UPDATE_RANGE", "DAYS_SINCE_RELEASED"]
model_df = df[feature_list].copy()
model_df.head()

Unnamed: 0,RATING_COUNT,REVIEW_RATE,RATING_RATE,CATEGORY,INSTALLS,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_UPDATE_RANGE,DAYS_SINCE_RELEASED
0,28291,More than once per day,More than 50 per day,Game,"5,000,000+",very large,True,Free,True,Everyone,False,False,3 to 6 months,218.0
1,80305,More than once per day,More than 50 per day,Game,"10,000,000+",very large,True,Free,True,Adults,True,False,Within 1 month,307.0
2,134,High,Medium,Lifestyle,"5,000+",medium,False,cheap,False,Everyone,False,False,Within 1 month,132.0
3,2629,High,Medium,Game,"100,000+",medium,True,Free,True,Everyone,True,False,More than 6 months,363.0
4,2217,Medium,Medium,Tools,"50,000+",small,False,cheap,False,Everyone,False,False,Few days ago,225.0


Handling null values

In [4]:
model_df.isna().sum()

RATING_COUNT               0
REVIEW_RATE                0
RATING_RATE                0
CATEGORY                   0
INSTALLS                   0
SIZEBAND                   0
FREE                       0
PRICEBAND                  0
AD_SUPPORTED               0
CONTENT_RATING             0
IN_APP_PURCHASES           0
EDITORS_CHOICE             0
DAYS_SINCE_UPDATE_RANGE    0
DAYS_SINCE_RELEASED        0
dtype: int64

Seems like there are no null values in our dataset :)

In [5]:
model_df = model_df.loc[model_df["DAYS_SINCE_RELEASED"] <= 365, :]
model_df.reset_index(inplace= True, drop=True)
model_df

Unnamed: 0,RATING_COUNT,REVIEW_RATE,RATING_RATE,CATEGORY,INSTALLS,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_UPDATE_RANGE,DAYS_SINCE_RELEASED
0,28291,More than once per day,More than 50 per day,Game,"5,000,000+",very large,True,Free,True,Everyone,False,False,3 to 6 months,218.0
1,80305,More than once per day,More than 50 per day,Game,"10,000,000+",very large,True,Free,True,Adults,True,False,Within 1 month,307.0
2,134,High,Medium,Lifestyle,"5,000+",medium,False,cheap,False,Everyone,False,False,Within 1 month,132.0
3,2629,High,Medium,Game,"100,000+",medium,True,Free,True,Everyone,True,False,More than 6 months,363.0
4,2217,Medium,Medium,Tools,"50,000+",small,False,cheap,False,Everyone,False,False,Few days ago,225.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2976,19,Medium,Low,Game,"1,000+",very small,True,Free,False,Teen,True,False,1 to 3 months,82.0
2977,30512,Medium,More than 50 per day,Game,"100,000+",medium,True,Free,False,Teen,True,False,3 to 6 months,317.0
2978,1708,Medium,High,Game,"50,000+",very large,True,Free,False,Teen,True,False,Few days ago,96.0
2979,24562,Medium,More than 50 per day,Game,"100,000+",very large,True,Free,False,Teen,True,False,1 to 3 months,141.0


In [6]:
model_df['INSTALLS_GROUP'] = 'More than 1M'
model_df.loc[(model_df['INSTALLS'] == '1+') | (model_df['INSTALLS'] == '10+') | (model_df['INSTALLS'] == '50+') | 
       (model_df['INSTALLS'] == '100+') | (model_df['INSTALLS'] == '500+' ) | (model_df['INSTALLS'] == '1,000+' ) | 
       (model_df['INSTALLS'] == '5,000+' ) | (model_df['INSTALLS'] == '10,000+') | (model_df['INSTALLS'] == '50,000+') |
       (model_df['INSTALLS'] == '100,000+') | (model_df['INSTALLS'] == '500,000+'), 'INSTALLS_GROUP'] = 'Less than 1M'
model_df['INSTALLS_GROUP'] .value_counts()

Less than 1M    2091
More than 1M     890
Name: INSTALLS_GROUP, dtype: int64

### Data Preprocessing for model

From eda, we noticed that it is unlikely to accurately predict the rating of the app, due to its nature such that rating is not only affected by the features of the app, but user experience and user interface plays an important role too. In this case, our group set the indicator for the performance of the app to how much it has been installed. 

For data preprosessing, we picked 10 predictors for predicting the installs group that the app is going to fall into, namely
1. `CATEGORY` (cat)
2. `SIZEBAND` (cat)
3. `FREE` (cat)
4. `PRICEBAND` (cat)
5. `AD_SUPPORTED` (cat)
6. `CONTENT_RATING` (cat)
7.  `IN_APP_PURCHASES` (cat)
8.  `EDITORS_CHOICE` (cat)
9.  `DAYS_SINCE_UPDATE` (num)
10. `DAYS_SINCE_RELEASED` (num)

Before we continue, we need to do some changes on `CATEGORY` and `CONTENT_RATING`.

For `CATEGORY`, we noticed from eda that most apps falls under "Game" category, with that we change the category values to either "Game" or "Non Game"

In [7]:
model_df.loc[model_df["CATEGORY"] != "Game", "CATEGORY"] = "Non Game"
model_df["CATEGORY"].value_counts()

Game        1529
Non Game    1452
Name: CATEGORY, dtype: int64

**Split the dataset into train and test data**

In [8]:
# Extract Response and Predictors
model_df.drop(["RATING"], axis=1, inplace=True)
response = "INSTALLS_GROUP"
y = pd.DataFrame(model_df[response])
X = pd.DataFrame(model_df.drop(response, axis = 1))

# Split the dataset into 70% train and 30% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=20)

y_train.value_counts()

KeyError: "['RATING'] not found in axis"

In [None]:
# concat training data
train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
train_df.head()

### Resampling INSTALLS_GROUP

We noticed that there is an imbalanced distribution of `INSTALLS_GROUP`, we handle them by oversampling techniques below.
1. Random Oversampling
2. SMOTEN

In [None]:
y_train.value_counts()

**Random Oversampling**

We will resample the data to equal portions

In [None]:
strategy = {"Less than 1M": 1672, 
            "More than 1M": 1672,}
over_sampler = RandomOverSampler(sampling_strategy=strategy, random_state=20)
X_over, y_over = over_sampler.fit_resample(X_train, y_train)

y_over.value_counts()

In [None]:
train_oversampled_df = pd.concat([X_over, y_over], axis=1)
train_oversampled_df.head()

**SMOTEN Oversampling** 

Since our data are purely categorical, we apply the SMOTEN oversampling technique to resample our data.

In [None]:
strategy = {"Less than 1M": 1672, 
            "More than 1M": 1672,}
smoten_sampler = SMOTEN(sampling_strategy=strategy, random_state=20)
X_smoten, y_smoten = smoten_sampler.fit_resample(X_train, y_train)

y_smoten.value_counts()

In [None]:
train_smoten_df = pd.concat([X_smoten, y_smoten], axis=1)
train_smoten_df.head()

**Combining test_df**

In [None]:
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
test_df.head()

#### Encoding categorical values

Before we continue, our categorical variables are further divided into 2 types —
- Ordinal categorical variables - `INSTALLS_GROUP`, `SIZEBAND`, `PRICEBAND`, `RATING_GROUP`,`FREE`, `AD_SUPPORTED`, `IN_APP_PURCHASES`, `EDITORS_CHOICE`
- Nominal categorical variable - `CATEGORY`,  `CONTENT_RATING`

In [None]:
nominal_cat = ["CATEGORY", "CONTENT_RATING"]
ordinal_cat = ["FREE","AD_SUPPORTED", "INSTALLS_GROUP", "PRICEBAND", "SIZEBAND", "IN_APP_PURCHASES", "DAYS_SINCE_UPDATE_RANGE", "REVIEW_RATE", "RATING_RATE"]

numerical = []

**Function for handling Ordinal Categorical Variables**

In [None]:
def encode_ord_cat(df):
    ig_mapping = {"Less than 1M": 0, "More than 1M": 1}
    pb_mapping = {"Free": 0, "cheap": 1, "normal": 2, "expensive": 3, "very expensive": 4}
    sb_mapping = {"very small": 0, "small": 1, "medium": 2, "large": 3, "very large": 4}
    # dsr_mapping = {"Less than 6 months": 0, "6 months to 1 year": 1, "1 to 3 years": 2, "3 to 5 years": 3, "5 to 7 years": 4, "More than 7 years": 5}
    dsu_mapping = {"Few days ago": 0, "Almost a week": 1, "Within 1 month": 2, "1 to 3 months": 3, "3 to 6 months": 4, "More than 6 months": 5}
    revr_mapping = {"Low": 0, "Medium": 1, "High": 2, "More than once per day": 3}
    ratr_mapping = {"Low": 0, "Medium": 1, "High": 2, "More than 50 per day": 3}
    free_mapping = {False: 0, True: 1}
    ads_mapping = {False: 0, True: 1}
    iap_mapping = {False: 0, True: 1}
    ec_mapping = {False: 0, True: 1}
    
    model_df_ord = df.copy()
    
    model_df_ord["INSTALLS_GROUP"] = model_df_ord["INSTALLS_GROUP"].map(ig_mapping)
    model_df_ord["PRICEBAND"] = model_df_ord["PRICEBAND"].map(pb_mapping)
    model_df_ord["SIZEBAND"] = model_df_ord["SIZEBAND"].map(sb_mapping)
    model_df_ord["FREE"] = model_df_ord["FREE"].map(free_mapping)
    model_df_ord["AD_SUPPORTED"] = model_df_ord["AD_SUPPORTED"].map(ads_mapping)
    model_df_ord["IN_APP_PURCHASES"] = model_df_ord["IN_APP_PURCHASES"].map(iap_mapping)
    # model_df_ord["EDITORS_CHOICE"] = model_df_ord["EDITORS_CHOICE"].map(ec_mapping)
    # model_df_ord["DAYS_SINCE_RELEASED_RANGE"] = model_df_ord["DAYS_SINCE_RELEASED_RANGE"].map(dsr_mapping)
    model_df_ord["DAYS_SINCE_UPDATE_RANGE"] = model_df_ord["DAYS_SINCE_UPDATE_RANGE"].map(dsu_mapping)
    model_df_ord["REVIEW_RATE"] = model_df_ord["REVIEW_RATE"].map(revr_mapping)
    model_df_ord["RATING_RATE"] = model_df_ord["RATING_RATE"].map(ratr_mapping)
    
    return model_df_ord

**Function for handling Nominal Categorical Variables**

In [None]:
def encode_norm_cat(df):
    ohe = OneHotEncoder()
    model_df_cat = df[nominal_cat]
    ohe.fit(model_df_cat)
    model_df_cat_ohe = pd.DataFrame(ohe.transform(model_df_cat).toarray(), 
                                    columns=ohe.get_feature_names_out(model_df_cat.columns))
    return model_df_cat_ohe

**Encode all datasets**

In [None]:
model_df_ord = encode_ord_cat(train_df)[ordinal_cat]
model_df_norm = encode_norm_cat(train_df)
model_df_num = train_df[numerical]
# Combine all features
train_df_encoded = pd.concat([ model_df_norm, model_df_ord, model_df_num], 
                        sort = False, axis = 1).reindex(index=model_df_ord.index)
train_df_encoded.head()

In [None]:
model_df_ord = encode_ord_cat(train_oversampled_df)[ordinal_cat]
model_df_norm = encode_norm_cat(train_oversampled_df)
model_df_num = train_oversampled_df[numerical]
train_df_oversampled_encoded = pd.concat([ model_df_norm, model_df_ord, model_df_num], 
                        sort = False, axis = 1).reindex(index=model_df_ord.index)
train_df_oversampled_encoded.head()

In [None]:
model_df_ord = encode_ord_cat(train_smoten_df)[ordinal_cat]
model_df_norm = encode_norm_cat(train_smoten_df)
model_df_num = train_smoten_df[numerical]
train_df_smoten_encoded = pd.concat([ model_df_norm, model_df_ord, model_df_num], 
                        sort = False, axis = 1).reindex(index=model_df_ord.index)
train_df_smoten_encoded.head()

In [None]:
model_df_ord = encode_ord_cat(test_df)[ordinal_cat]
model_df_norm = encode_norm_cat(test_df)
model_df_num = test_df[numerical]
test_df_encoded = pd.concat([ model_df_norm, model_df_ord, model_df_num], 
                        sort = False, axis = 1).reindex(index=model_df_ord.index)
test_df_encoded.head()

In [None]:
train_df_encoded.info()

### Export the files

In [None]:
file_path = "./datasets/train.csv"
train_df_encoded.to_csv(path_or_buf=file_path, index=False);

file_path = "./datasets/train_oversampled.csv"
train_df_oversampled_encoded.to_csv(path_or_buf=file_path, index=False);

file_path = "./datasets/train_smoten.csv"
train_df_smoten_encoded.to_csv(path_or_buf=file_path, index=False);

file_path = "./datasets/test.csv"
test_df_encoded.to_csv(path_or_buf=file_path, index=False);