### Import Necessary Packages

In [1575]:
import pandas as pd
import numpy as np

# For data splitting
from sklearn.model_selection import train_test_split
# Import the encoder from sklearn
from sklearn.preprocessing import OneHotEncoder

# For resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTEN

### Read Files

In [1576]:
# Read the app dataset CSV into a DataFrame and preview the first rows
df = pd.read_csv("./datasets/google_app_scrap_cleaned2.csv")
df.head()

Unnamed: 0,APP_NAME,RATING,CATEGORY,RATING_COUNT,1_STAR_RATINGS,2_STAR_RATINGS,3_STAR_RATINGS,4_STAR_RATINGS,5_STAR_RATINGS,REVIEW_COUNT,...,COUNTRY,CONTENT_RATING,AD_SUPPORTED,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_UPDATE,DAYS_SINCE_RELEASED,INSTALLS_GROUP,PRICEBAND,SIZEBAND
0,"""Ghost Voice Catcher"" AUTO EVP",3.933333,Lifestyle,68,12,1,6,7,42,24.0,...,Usa,Everyone,False,False,False,1820.0,3482.0,Less than 100K,expensive,very small
1,"""OXXO""",4.74,Game,975,23,0,15,94,843,54.0,...,,Everyone,False,False,False,0.0,945.0,Less than 100K,cheap,large
2,#DRIVE,4.434152,Game,229679,9352,4740,15249,47804,152534,6473.0,...,,Everyone,True,True,False,4.0,1084.0,Between 100K and 10M,Free,very large
3,#SelfCare,4.463476,Game,14932,865,336,1088,1353,11290,1501.0,...,,Teen,False,True,False,952.0,1275.0,Between 100K and 10M,Free,large
4,#open Polyamorous + ENM Dating,2.55,Dating,708,263,163,63,70,149,362.0,...,,Mature 17+,False,True,False,17.0,1026.0,Less than 100K,Free,small


<a id='model building'></a>
## Model Building

First, we identify which features in the dataset are usable:

1. `RATING_COUNT` (num)
2. `RATING` (num)
3. `CATEGORY` (cat)
4. `INSTALLS_GROUP` (cat)
5. `SIZEBAND` (cat)
6. `FREE` (cat)
7. `PRICEBAND` (cat)
8. `AD_SUPPORTED` (cat)
9. `CONTENT_RATING` (cat)
10. `IN_APP_PURCHASES` (cat)
11. `EDITORS_CHOICE` (cat)
12. `DAYS_SINCE_UPDATE` (num)
13. `DAYS_SINCE_RELEASED` (num)


In [1577]:
# Columns carried into the modelling frame. The raw INSTALLS label is selected here;
# the INSTALLS_GROUP column of model_df is derived from it in a later cell.
feature_list = ["RATING_COUNT", "REVIEW_COUNT", "RATING", "CATEGORY", "INSTALLS", "SIZEBAND", "FREE", "PRICEBAND", "AD_SUPPORTED", 
                "CONTENT_RATING", "IN_APP_PURCHASES", "EDITORS_CHOICE", "DAYS_SINCE_UPDATE", "DAYS_SINCE_RELEASED"]
# .copy() gives model_df its own data so later edits do not touch df
model_df = df[feature_list].copy()
model_df.head()

Unnamed: 0,RATING_COUNT,REVIEW_COUNT,RATING,CATEGORY,INSTALLS,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_UPDATE,DAYS_SINCE_RELEASED
0,68,24.0,3.933333,Lifestyle,"1,000+",very small,False,expensive,False,Everyone,False,False,1820.0,3482.0
1,975,54.0,4.74,Game,"10,000+",large,False,cheap,False,Everyone,False,False,0.0,945.0
2,229679,6473.0,4.434152,Game,"5,000,000+",very large,True,Free,True,Everyone,True,False,4.0,1084.0
3,14932,1501.0,4.463476,Game,"1,000,000+",large,True,Free,False,Teen,True,False,952.0,1275.0
4,708,362.0,2.55,Dating,"50,000+",small,True,Free,False,Mature 17+,True,False,17.0,1026.0


Handling null values

In [1578]:
model_df.isna().sum()

RATING_COUNT           0
REVIEW_COUNT           0
RATING                 0
CATEGORY               0
INSTALLS               0
SIZEBAND               0
FREE                   0
PRICEBAND              0
AD_SUPPORTED           0
CONTENT_RATING         0
IN_APP_PURCHASES       0
EDITORS_CHOICE         0
DAYS_SINCE_UPDATE      0
DAYS_SINCE_RELEASED    0
dtype: int64

Seems like there are no null values in our dataset :)

In [1579]:
# Keep only apps released within the last 365 days.
# The filtered slice is turned into an independent frame with a fresh 0..n-1 index
# (no inplace mutation), so the column assignments made in later cells do not raise
# pandas' SettingWithCopyWarning.
model_df = (
    model_df.loc[model_df["DAYS_SINCE_RELEASED"] <= 365, :]
    .reset_index(drop=True)
    .copy()
)
model_df

Unnamed: 0,RATING_COUNT,REVIEW_COUNT,RATING,CATEGORY,INSTALLS,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_UPDATE,DAYS_SINCE_RELEASED
0,28291,486.0,4.186567,Game,"5,000,000+",very large,True,Free,True,Everyone 10+,False,False,109.0,218.0
1,80305,2821.0,3.931035,Game,"10,000,000+",very large,True,Free,True,Mature 17+,True,False,19.0,307.0
2,134,75.0,2.357143,Lifestyle,"5,000+",medium,False,cheap,False,Everyone,False,False,28.0,132.0
3,2629,166.0,4.558824,Game,"100,000+",medium,True,Free,True,Everyone,True,False,189.0,363.0
4,2217,57.0,4.753087,Tools,"50,000+",small,False,cheap,False,Everyone,False,False,1.0,225.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2976,19,17.0,4.894737,Game,"1,000+",very small,True,Free,False,Teen,True,False,46.0,82.0
2977,30512,29.0,4.406593,Game,"100,000+",medium,True,Free,False,Teen,True,False,144.0,317.0
2978,1708,6.0,2.250000,Game,"50,000+",very large,True,Free,False,Teen,True,False,5.0,96.0
2979,24562,27.0,4.960000,Game,"100,000+",very large,True,Free,False,Teen,True,False,89.0,141.0


In [1580]:
model_df["INSTALLS"].value_counts()

100,000+        621
1,000,000+      547
10,000+         441
500,000+        251
1,000+          245
50,000+         231
10,000,000+     180
5,000+          159
5,000,000+      145
500+             90
100+             46
50,000,000+      13
100,000,000+      5
50+               4
10+               3
Name: INSTALLS, dtype: int64

In [1581]:
# Install buckets that fall below one million; every other bucket counts as 'More than 1M'
below_1m_buckets = ['1+', '10+', '50+', '100+', '500+', '1,000+', '5,000+',
                    '10,000+', '50,000+', '100,000+', '500,000+']
is_below_1m = model_df['INSTALLS'].isin(below_1m_buckets)

# Start from the upper group, then overwrite the rows whose bucket is in the list
model_df['INSTALLS_GROUP'] = 'More than 1M'
model_df.loc[is_below_1m, 'INSTALLS_GROUP'] = 'Less than 1M'

model_df['INSTALLS_GROUP'].value_counts()

Less than 1M    2091
More than 1M     890
Name: INSTALLS_GROUP, dtype: int64

### Data Preprocessing for model

From the EDA, we noticed that an app's rating is unlikely to be predicted accurately, because rating is affected not only by the features of the app but also by its user experience and user interface. We therefore use the number of installs as the indicator of an app's performance.

For data preprocessing, we picked 10 predictors for predicting the installs group that the app is going to fall into, namely
1. `CATEGORY` (cat)
2. `SIZEBAND` (cat)
3. `FREE` (cat)
4. `PRICEBAND` (cat)
5. `AD_SUPPORTED` (cat)
6. `CONTENT_RATING` (cat)
7.  `IN_APP_PURCHASES` (cat)
8.  `EDITORS_CHOICE` (cat)
9.  `DAYS_SINCE_UPDATE` (num)
10. `DAYS_SINCE_RELEASED` (num)

Before we continue, we need to do some changes on `CATEGORY` and `CONTENT_RATING`.

For `CATEGORY`, we noticed from the EDA that most apps fall under the "Game" category, so we change the category values to either "Game" or "Non Game".

In [1582]:
# Collapse every category other than "Game" into a single "Non Game" level
is_game = model_df["CATEGORY"] == "Game"
model_df["CATEGORY"] = model_df["CATEGORY"].where(is_game, "Non Game")
model_df["CATEGORY"].value_counts()

Game        1529
Non Game    1452
Name: CATEGORY, dtype: int64

For `CONTENT_RATING`, we keep only three groups: Everyone, Teen and Adults.

In [1583]:
# Merge the fine-grained content ratings into the broader groups in one pass;
# labels not listed here are left as they are
content_rating_groups = {
    'Mature 17+': "Adults",
    'Adults only 18+': "Adults",
    'Everyone 10+': "Everyone",
}
model_df['CONTENT_RATING'] = model_df['CONTENT_RATING'].replace(content_rating_groups)
model_df["CONTENT_RATING"].value_counts()

Everyone    2129
Teen         667
Adults       185
Name: CONTENT_RATING, dtype: int64

In [1584]:
model_df["DAYS_SINCE_RELEASED"].describe()

count    2981.000000
mean      178.271721
std       102.075656
min         3.000000
25%        91.000000
50%       173.000000
75%       263.000000
max       365.000000
Name: DAYS_SINCE_RELEASED, dtype: float64

In [1553]:
# Bucket the app's age in days into labelled ranges; rows matching none of the
# conditions fall through to the 'More than 6 months' default.
# NOTE(review): the saved output under this cell lists 'Less than 6 months' /
# '6 months to 1 year', labels this code never assigns - re-run the cell to refresh it.
released_days = model_df['DAYS_SINCE_RELEASED']
model_df['DAYS_SINCE_RELEASED_RANGE'] = np.select(
    [
        released_days <= 30,
        (released_days > 30) & (released_days <= 30*3),
        (released_days > 30*3) & (released_days <= 30*6),
    ],
    ["Within 1 month", "1 to 3 months", "3 to 6 months"],
    default='More than 6 months',
)
# The raw day count is replaced by its bucketed version
model_df = model_df.drop(columns=["DAYS_SINCE_RELEASED"])

model_df['DAYS_SINCE_RELEASED_RANGE'].value_counts()

Less than 6 months    1573
6 months to 1 year    1408
Name: DAYS_SINCE_RELEASED_RANGE, dtype: int64

In [1554]:
model_df["DAYS_SINCE_UPDATE"].describe()

count    2981.000000
mean       51.885609
std        67.241805
min         0.000000
25%         8.000000
50%        22.000000
75%        68.000000
max       364.000000
Name: DAYS_SINCE_UPDATE, dtype: float64

In [1555]:
# Bucket the days since the last update into labelled ranges; rows matching none
# of the conditions fall through to the 'More than 6 months' default.
update_days = model_df['DAYS_SINCE_UPDATE']
model_df['DAYS_SINCE_UPDATE_RANGE'] = np.select(
    [
        update_days <= 5,
        (update_days > 5) & (update_days <= 11),
        (update_days > 11) & (update_days <= 30),
        (update_days > 30) & (update_days <= 30*3),
        (update_days > 30*3) & (update_days <= 30*6),
    ],
    ["Few days ago", "Almost a week", "Within 1 month", "1 to 3 months", "3 to 6 months"],
    default='More than 6 months',
)
# The raw day count is replaced by its bucketed version
model_df = model_df.drop(columns=["DAYS_SINCE_UPDATE"])

model_df['DAYS_SINCE_UPDATE_RANGE'].value_counts()

1 to 3 months         743
Within 1 month        729
Few days ago          517
Almost a week         434
3 to 6 months         362
More than 6 months    196
Name: DAYS_SINCE_UPDATE_RANGE, dtype: int64

**Split the dataset into train and test data**

In [1556]:
# Extract Response and Predictors
model_df.drop(["RATING"], axis=1, inplace=True)
response = "INSTALLS_GROUP"
y = pd.DataFrame(model_df[response])
X = pd.DataFrame(model_df.drop(response, axis = 1))

# Split the dataset into 70% train and 30% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=20)

y_train.value_counts()

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


INSTALLS_GROUP
Less than 1M      1571
More than 1M       664
dtype: int64

In [1557]:
# concat training data
train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
train_df.head()

Unnamed: 0,RATING_COUNT,REVIEW_COUNT,CATEGORY,INSTALLS,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_RELEASED_RANGE,DAYS_SINCE_UPDATE_RANGE,INSTALLS_GROUP
0,18822,340.0,Game,"1,000,000+",very large,True,Free,True,Everyone,True,False,6 months to 1 year,3 to 6 months,More than 1M
1,96541,2904.0,Game,"5,000,000+",medium,True,Free,True,Adults,False,False,6 months to 1 year,Almost a week,More than 1M
2,20082,370.0,Game,"5,000,000+",very large,True,Free,True,Teen,False,False,6 months to 1 year,Within 1 month,More than 1M
3,1974,27.0,Non Game,"1,000,000+",small,True,Free,True,Everyone,False,False,Less than 6 months,1 to 3 months,More than 1M
4,92,10.0,Game,"1,000+",very large,False,cheap,False,Teen,False,False,6 months to 1 year,3 to 6 months,Less than 1M


### Resampling INSTALLS_GROUP

We noticed that the distribution of `INSTALLS_GROUP` is imbalanced, so we handle it with the oversampling techniques below.
1. Random Oversampling
2. SMOTEN

In [1558]:
y_train.value_counts()

INSTALLS_GROUP
Less than 1M      1571
More than 1M       664
dtype: int64

**Random Oversampling**

We will resample the data to equal portions

In [1559]:
# Ask the sampler for the same number of training rows in each class.
# NOTE(review): 1672 is above the current majority count (1571 in the output above),
# so both classes are oversampled - confirm this target is intended.
target_rows = 1672
strategy = {label: target_rows for label in ("Less than 1M", "More than 1M")}
over_sampler = RandomOverSampler(sampling_strategy=strategy, random_state=20)
X_over, y_over = over_sampler.fit_resample(X_train, y_train)

y_over.value_counts()



INSTALLS_GROUP
Less than 1M      1672
More than 1M      1672
dtype: int64

In [1560]:
# Concatenate the randomly oversampled predictors and response column-wise
train_oversampled_df = pd.concat([X_over, y_over], axis=1)
train_oversampled_df.head()

Unnamed: 0,RATING_COUNT,REVIEW_COUNT,CATEGORY,INSTALLS,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_RELEASED_RANGE,DAYS_SINCE_UPDATE_RANGE,INSTALLS_GROUP
0,18822,340.0,Game,"1,000,000+",very large,True,Free,True,Everyone,True,False,6 months to 1 year,3 to 6 months,More than 1M
1,96541,2904.0,Game,"5,000,000+",medium,True,Free,True,Adults,False,False,6 months to 1 year,Almost a week,More than 1M
2,20082,370.0,Game,"5,000,000+",very large,True,Free,True,Teen,False,False,6 months to 1 year,Within 1 month,More than 1M
3,1974,27.0,Non Game,"1,000,000+",small,True,Free,True,Everyone,False,False,Less than 6 months,1 to 3 months,More than 1M
4,92,10.0,Game,"1,000+",very large,False,cheap,False,Teen,False,False,6 months to 1 year,3 to 6 months,Less than 1M


**SMOTEN Oversampling** 

Since our predictors are mostly categorical, we apply the SMOTEN oversampling technique to resample our data. Note that SMOTEN treats every column, including the numeric counts, as categorical.

In [1561]:
# Ask the sampler for the same number of training rows in each class.
# NOTE(review): 1672 is above the current majority count (1571 in the output above),
# so both classes are oversampled - confirm this target is intended.
target_rows = 1672
strategy = {label: target_rows for label in ("Less than 1M", "More than 1M")}
smoten_sampler = SMOTEN(sampling_strategy=strategy, random_state=20)
X_smoten, y_smoten = smoten_sampler.fit_resample(X_train, y_train)

y_smoten.value_counts()



INSTALLS_GROUP
Less than 1M      1672
More than 1M      1672
dtype: int64

In [1562]:
# Concatenate the SMOTEN-resampled predictors and response column-wise
train_smoten_df = pd.concat([X_smoten, y_smoten], axis=1)
train_smoten_df.head()

Unnamed: 0,RATING_COUNT,REVIEW_COUNT,CATEGORY,INSTALLS,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_RELEASED_RANGE,DAYS_SINCE_UPDATE_RANGE,INSTALLS_GROUP
0,18822,340.0,Game,"1,000,000+",very large,True,Free,True,Everyone,True,False,6 months to 1 year,3 to 6 months,More than 1M
1,96541,2904.0,Game,"5,000,000+",medium,True,Free,True,Adults,False,False,6 months to 1 year,Almost a week,More than 1M
2,20082,370.0,Game,"5,000,000+",very large,True,Free,True,Teen,False,False,6 months to 1 year,Within 1 month,More than 1M
3,1974,27.0,Non Game,"1,000,000+",small,True,Free,True,Everyone,False,False,Less than 6 months,1 to 3 months,More than 1M
4,92,10.0,Game,"1,000+",very large,False,cheap,False,Teen,False,False,6 months to 1 year,3 to 6 months,Less than 1M


**Combining test_df**

In [1563]:
# Concatenate the held-out predictors and response column-wise, then renumber rows from 0
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
test_df.head()

Unnamed: 0,RATING_COUNT,REVIEW_COUNT,CATEGORY,INSTALLS,SIZEBAND,FREE,PRICEBAND,AD_SUPPORTED,CONTENT_RATING,IN_APP_PURCHASES,EDITORS_CHOICE,DAYS_SINCE_RELEASED_RANGE,DAYS_SINCE_UPDATE_RANGE,INSTALLS_GROUP
0,906,39.0,Game,"500,000+",very large,True,Free,False,Teen,True,False,Less than 6 months,Few days ago,Less than 1M
1,14905,127.0,Game,"1,000,000+",medium,True,Free,True,Everyone,True,False,Less than 6 months,Within 1 month,More than 1M
2,440,17.0,Non Game,"50,000+",small,True,Free,True,Everyone,False,False,Less than 6 months,3 to 6 months,Less than 1M
3,342,1.0,Game,"100,000+",medium,True,Free,True,Everyone,True,False,Less than 6 months,Almost a week,Less than 1M
4,4959,34.0,Game,"1,000,000+",medium,True,Free,True,Everyone,True,False,6 months to 1 year,Within 1 month,More than 1M


#### Encoding categorical values

Before we continue, our categorical variables are further divided into 2 types —
- Ordinal categorical variables - `INSTALLS_GROUP`, `SIZEBAND`, `PRICEBAND`, `RATING_GROUP`,`FREE`, `AD_SUPPORTED`, `IN_APP_PURCHASES`, `EDITORS_CHOICE`
- Nominal categorical variable - `CATEGORY`,  `CONTENT_RATING`

In [1564]:
# Column groups used when assembling the encoded frames.
# NOTE(review): CONTENT_RATING, EDITORS_CHOICE and DAYS_SINCE_RELEASED_RANGE appear in
# none of these lists, so they are absent from the encoded frames - confirm this is intended.

# One-hot encoded by encode_norm_cat
nominal_cat = ["CATEGORY"]
# Selected from the output of encode_ord_cat
ordinal_cat = ["FREE","AD_SUPPORTED", "INSTALLS_GROUP", "PRICEBAND", "SIZEBAND", "IN_APP_PURCHASES", "DAYS_SINCE_UPDATE_RANGE"]

# Copied into the encoded frames without transformation
numerical = ["REVIEW_COUNT", "RATING_COUNT"]

In [1565]:
model_df['INSTALLS_GROUP'] .value_counts()

Less than 1M    2091
More than 1M     890
Name: INSTALLS_GROUP, dtype: int64

**Function for handling Ordinal Categorical Variables**

In [1566]:
def encode_ord_cat(df):
    """Return a copy of ``df`` with the ordered / boolean columns integer-coded.

    The columns INSTALLS_GROUP, PRICEBAND, SIZEBAND, FREE, AD_SUPPORTED,
    IN_APP_PURCHASES and DAYS_SINCE_UPDATE_RANGE are replaced by integer codes
    looked up in the tables below. All other columns are returned untouched
    (EDITORS_CHOICE and DAYS_SINCE_RELEASED_RANGE are not encoded here).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed above; a missing column raises KeyError.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified. A value with no entry in its
        lookup table becomes NaN.
    """
    yes_no = {False: 0, True: 1}
    codebook = {
        "INSTALLS_GROUP": {"Less than 1M": 0, "More than 1M": 1},
        "PRICEBAND": {"Free": 0, "cheap": 1, "normal": 2, "expensive": 3, "very expensive": 4},
        "SIZEBAND": {"very small": 0, "small": 1, "medium": 2, "large": 3, "very large": 4},
        "FREE": yes_no,
        "AD_SUPPORTED": yes_no,
        "IN_APP_PURCHASES": yes_no,
        "DAYS_SINCE_UPDATE_RANGE": {"Few days ago": 0, "Almost a week": 1, "Within 1 month": 2,
                                    "1 to 3 months": 3, "3 to 6 months": 4, "More than 6 months": 5},
    }

    # Build every recoded column from the untouched input, then swap them in on a copy
    recoded = {column: df[column].map(levels) for column, levels in codebook.items()}
    return df.assign(**recoded)

**Function for handling Nominal Categorical Variables**

In [1567]:
def encode_norm_cat(df):
    """One-hot encode the nominal columns listed in the module-level ``nominal_cat``.

    A new ``OneHotEncoder`` is fitted on the frame passed in, so the set of
    indicator columns is determined by the categories present in ``df`` alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``nominal_cat``.

    Returns
    -------
    pandas.DataFrame
        One indicator column per category, named by
        ``OneHotEncoder.get_feature_names_out``. The result carries ``df``'s row
        index, so concatenating it column-wise with other frames built from ``df``
        stays row-aligned even when ``df`` does not have a default 0..n-1 index.
    """
    model_df_cat = df[nominal_cat]
    ohe = OneHotEncoder()
    # The encoder's default output is sparse, hence the .toarray() densification
    indicator_values = ohe.fit_transform(model_df_cat).toarray()
    return pd.DataFrame(
        indicator_values,
        columns=ohe.get_feature_names_out(model_df_cat.columns),
        index=df.index,
    )

**Encode all datasets**

In [1568]:
# Encode the un-resampled training split.
# Integer-coded ordinal columns (restricted to ordinal_cat)
model_df_ord = encode_ord_cat(train_df)[ordinal_cat]
# One-hot indicator columns for the nominal features
model_df_norm = encode_norm_cat(train_df)
# Numeric columns are passed through as they are
model_df_num = train_df[numerical]
# Combine all features
train_df_encoded = pd.concat([ model_df_norm, model_df_ord, model_df_num], 
                        sort = False, axis = 1).reindex(index=model_df_ord.index)
train_df_encoded.head()

Unnamed: 0,CATEGORY_Game,CATEGORY_Non Game,FREE,AD_SUPPORTED,INSTALLS_GROUP,PRICEBAND,SIZEBAND,IN_APP_PURCHASES,DAYS_SINCE_UPDATE_RANGE,REVIEW_COUNT,RATING_COUNT
0,1.0,0.0,1,1,1,0,4,1,4,340.0,18822
1,1.0,0.0,1,1,1,0,2,0,1,2904.0,96541
2,1.0,0.0,1,1,1,0,4,0,2,370.0,20082
3,0.0,1.0,1,1,1,0,1,0,3,27.0,1974
4,1.0,0.0,0,0,0,1,4,0,4,10.0,92


In [1569]:
# Encode the randomly oversampled training split: integer-coded ordinal columns,
# one-hot nominal columns and untouched numeric columns, combined column-wise.
model_df_ord = encode_ord_cat(train_oversampled_df)[ordinal_cat]
model_df_norm = encode_norm_cat(train_oversampled_df)
model_df_num = train_oversampled_df[numerical]
train_df_oversampled_encoded = pd.concat([ model_df_norm, model_df_ord, model_df_num], 
                        sort = False, axis = 1).reindex(index=model_df_ord.index)
train_df_oversampled_encoded.head()

Unnamed: 0,CATEGORY_Game,CATEGORY_Non Game,FREE,AD_SUPPORTED,INSTALLS_GROUP,PRICEBAND,SIZEBAND,IN_APP_PURCHASES,DAYS_SINCE_UPDATE_RANGE,REVIEW_COUNT,RATING_COUNT
0,1.0,0.0,1,1,1,0,4,1,4,340.0,18822
1,1.0,0.0,1,1,1,0,2,0,1,2904.0,96541
2,1.0,0.0,1,1,1,0,4,0,2,370.0,20082
3,0.0,1.0,1,1,1,0,1,0,3,27.0,1974
4,1.0,0.0,0,0,0,1,4,0,4,10.0,92


In [1570]:
# Encode the SMOTEN-resampled training split: integer-coded ordinal columns,
# one-hot nominal columns and untouched numeric columns, combined column-wise.
model_df_ord = encode_ord_cat(train_smoten_df)[ordinal_cat]
model_df_norm = encode_norm_cat(train_smoten_df)
model_df_num = train_smoten_df[numerical]
train_df_smoten_encoded = pd.concat([ model_df_norm, model_df_ord, model_df_num], 
                        sort = False, axis = 1).reindex(index=model_df_ord.index)
train_df_smoten_encoded.head()

Unnamed: 0,CATEGORY_Game,CATEGORY_Non Game,FREE,AD_SUPPORTED,INSTALLS_GROUP,PRICEBAND,SIZEBAND,IN_APP_PURCHASES,DAYS_SINCE_UPDATE_RANGE,REVIEW_COUNT,RATING_COUNT
0,1.0,0.0,1,1,1,0,4,1,4,340.0,18822
1,1.0,0.0,1,1,1,0,2,0,1,2904.0,96541
2,1.0,0.0,1,1,1,0,4,0,2,370.0,20082
3,0.0,1.0,1,1,1,0,1,0,3,27.0,1974
4,1.0,0.0,0,0,0,1,4,0,4,10.0,92


In [1571]:
# Encode the held-out test split the same way as the training frames.
# NOTE(review): encode_norm_cat fits a fresh OneHotEncoder on test_df itself, so the
# indicator columns only match the training frames if the same categories occur here.
model_df_ord = encode_ord_cat(test_df)[ordinal_cat]
model_df_norm = encode_norm_cat(test_df)
model_df_num = test_df[numerical]
test_df_encoded = pd.concat([ model_df_norm, model_df_ord, model_df_num], 
                        sort = False, axis = 1).reindex(index=model_df_ord.index)
test_df_encoded.head()

Unnamed: 0,CATEGORY_Game,CATEGORY_Non Game,FREE,AD_SUPPORTED,INSTALLS_GROUP,PRICEBAND,SIZEBAND,IN_APP_PURCHASES,DAYS_SINCE_UPDATE_RANGE,REVIEW_COUNT,RATING_COUNT
0,1.0,0.0,1,0,0,0,4,1,0,39.0,906
1,1.0,0.0,1,1,1,0,2,1,2,127.0,14905
2,0.0,1.0,1,1,0,0,1,0,4,17.0,440
3,1.0,0.0,1,1,0,0,2,1,1,1.0,342
4,1.0,0.0,1,1,1,0,2,1,2,34.0,4959


In [1572]:
train_df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2235 entries, 0 to 2234
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CATEGORY_Game            2235 non-null   float64
 1   CATEGORY_Non Game        2235 non-null   float64
 2   FREE                     2235 non-null   int64  
 3   AD_SUPPORTED             2235 non-null   int64  
 4   INSTALLS_GROUP           2235 non-null   int64  
 5   PRICEBAND                2235 non-null   int64  
 6   SIZEBAND                 2235 non-null   int64  
 7   IN_APP_PURCHASES         2235 non-null   int64  
 8   DAYS_SINCE_UPDATE_RANGE  2235 non-null   int64  
 9   REVIEW_COUNT             2235 non-null   float64
 10  RATING_COUNT             2235 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 192.2 KB


### Export the files

In [1573]:
# Write each encoded frame to its CSV file, without the row index
exports = {
    "./datasets/train.csv": train_df_encoded,
    "./datasets/train_oversampled.csv": train_df_oversampled_encoded,
    "./datasets/train_smoten.csv": train_df_smoten_encoded,
    "./datasets/test.csv": test_df_encoded,
}
for file_path, encoded_frame in exports.items():
    encoded_frame.to_csv(path_or_buf=file_path, index=False)