# Random Forest Model-Total Sales

## To predict the class of total sales (low, medium, high) based on different features
### Target Variable and Features
- Target variable (y) = Total_Sales_Class (low, medium, high)
- X = Genre, ESRB_Rating, Platform, Publisher, Developer_x, Country

### Machine Learning Models
- rf_model = RandomForestClassifier
- brf_model = BalancedRandomForestClassifier
- eec_model = EasyEnsembleClassifier


In [95]:
import pandas as pd
import numpy as np

from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [175]:
# Load the dataset from AWS S3 bucket
#games_df = pd.read_csv('https://video-game-dataset-uot-boot-camp-2022-group-4.s3.us-east-2.amazonaws.com/all_columns_df.csv')
games_df = pd.read_csv('Cleaned_Data/all_columns_df.csv')
games_df

Unnamed: 0,Rank,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,User_Score,Year,Country,Total_Sales
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,2006.0,Japan,82.86
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,10.0,,1985.0,Japan,40.24
2,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,2008.0,Japan,37.14
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017.0,,36.60
4,5,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,2009.0,Japan,33.09
...,...,...,...,...,...,...,...,...,...,...,...,...
19857,19858,FirePower for Microsoft Combat Flight Simulator 3,Simulation,T,PC,GMX Media,Shockwave Productions,,,2004.0,,0.01
19858,19859,Tom Clancy's Splinter Cell,Shooter,T,PC,Ubisoft,Ubisoft,,,2003.0,Europe,0.01
19859,19860,Ashita no Joe 2: The Anime Super Remix,Fighting,,PS2,Capcom,Capcom,,,2002.0,Japan,0.01
19860,19861,Tokyo Yamanote Boys for V: Main Disc,Adventure,,PSV,Rejet,Rejet,,,2017.0,,0.01


In [176]:
games_df.dtypes

Rank              int64
Name             object
Genre            object
ESRB_Rating      object
Platform         object
Publisher        object
Developer_x      object
Critic_Score    float64
User_Score      float64
Year            float64
Country          object
Total_Sales     float64
dtype: object

In [177]:
# Change 'Year' dtype to object
games_df['Year'] = pd.to_datetime(games_df['Year'], format = '%Y').dt.strftime('%Y')
games_df.dtypes

Rank              int64
Name             object
Genre            object
ESRB_Rating      object
Platform         object
Publisher        object
Developer_x      object
Critic_Score    float64
User_Score      float64
Year             object
Country          object
Total_Sales     float64
dtype: object

In [179]:
# Drop columns that won't be included in the analysis
games_df.drop(['Rank'], axis=1, inplace=True)
games_df

Unnamed: 0,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,User_Score,Year,Country,Total_Sales
0,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,2006,Japan,82.86
1,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,10.0,,1985,Japan,40.24
2,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,2008,Japan,37.14
3,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017,,36.60
4,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,2009,Japan,33.09
...,...,...,...,...,...,...,...,...,...,...,...
19857,FirePower for Microsoft Combat Flight Simulator 3,Simulation,T,PC,GMX Media,Shockwave Productions,,,2004,,0.01
19858,Tom Clancy's Splinter Cell,Shooter,T,PC,Ubisoft,Ubisoft,,,2003,Europe,0.01
19859,Ashita no Joe 2: The Anime Super Remix,Fighting,,PS2,Capcom,Capcom,,,2002,Japan,0.01
19860,Tokyo Yamanote Boys for V: Main Disc,Adventure,,PSV,Rejet,Rejet,,,2017,,0.01


In [180]:
# Sattistics of the Total_Sales column
games_df.describe()

Unnamed: 0,Critic_Score,User_Score,Total_Sales
count,4706.0,238.0,19862.0
mean,7.269911,8.465546,0.530876
std,1.420956,1.215681,1.572634
min,1.0,2.0,0.01
25%,6.5,8.0,0.05
50%,7.5,8.8,0.16
75%,8.3,9.3,0.45
max,10.0,10.0,82.86


## Bin Total_Sales and Create 'Total_Sales_Class' column

In [181]:
# Create bin for 'Total_Sales' column
bins = [0,1,10,100]
labels = ['low', 'medium', 'high']

In [182]:
# Bin 'Total_Sales' into new column
games_df['Total_Sales_Class'] = pd.cut(games_df['Total_Sales'], bins=bins, labels=labels, right=False)
games_df.head(30)

Unnamed: 0,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,User_Score,Year,Country,Total_Sales,Total_Sales_Class
0,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,2006,Japan,82.86,high
1,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,10.0,,1985,Japan,40.24,high
2,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,2008,Japan,37.14,high
3,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017,,36.6,high
4,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,2009,Japan,33.09,high
5,Pokemon Red / Green / Blue Version,Role-Playing,E,GB,Nintendo,Game Freak,9.4,,1998,Japan,31.38,high
6,New Super Mario Bros.,Platform,E,DS,Nintendo,Nintendo EAD,9.1,8.1,2006,Japan,30.8,high
7,Tetris,Puzzle,E,GB,Nintendo,Bullet Proof Software,,,1989,Japan,30.26,high
8,New Super Mario Bros. Wii,Platform,E,Wii,Nintendo,Nintendo EAD,8.6,9.2,2009,Japan,30.22,high
9,Minecraft,Misc,,PC,Mojang,Mojang AB,10.0,,2010,,30.01,high


In [183]:
games_df.Total_Sales_Class.value_counts()

low       17420
medium     2355
high         87
Name: Total_Sales_Class, dtype: int64

In [184]:
# Dropping unnecessary columns
games_df.drop(['Name','User_Score'], axis=1, inplace=True)
games_df

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,Year,Country,Total_Sales,Total_Sales_Class
0,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,2006,Japan,82.86,high
1,Platform,,NES,Nintendo,Nintendo EAD,10.0,1985,Japan,40.24,high
2,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,2008,Japan,37.14,high
3,Shooter,,PC,PUBG Corporation,PUBG Corporation,,2017,,36.60,high
4,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,2009,Japan,33.09,high
...,...,...,...,...,...,...,...,...,...,...
19857,Simulation,T,PC,GMX Media,Shockwave Productions,,2004,,0.01,low
19858,Shooter,T,PC,Ubisoft,Ubisoft,,2003,Europe,0.01,low
19859,Fighting,,PS2,Capcom,Capcom,,2002,Japan,0.01,low
19860,Adventure,,PSV,Rejet,Rejet,,2017,,0.01,low


In [185]:
games_df.count()

Genre                19862
ESRB_Rating          13925
Platform             19862
Publisher            19862
Developer_x          19860
Critic_Score          4706
Year                 19859
Country              11877
Total_Sales          19862
Total_Sales_Class    19862
dtype: int64

## Checking NaNs

In [186]:
# Check null values
games_df.count()

Genre                19862
ESRB_Rating          13925
Platform             19862
Publisher            19862
Developer_x          19860
Critic_Score          4706
Year                 19859
Country              11877
Total_Sales          19862
Total_Sales_Class    19862
dtype: int64

In [187]:
# To see the row count if drop NaN in all columns
games_df.dropna().count()

Genre                3579
ESRB_Rating          3579
Platform             3579
Publisher            3579
Developer_x          3579
Critic_Score         3579
Year                 3579
Country              3579
Total_Sales          3579
Total_Sales_Class    3579
dtype: int64

# Dataframe 1: with 'Critic_Score' column

In [194]:
# Drop all NaNs values
no_nan_df = games_df.drop(['Total_Sales'], axis=1).dropna()
print(no_nan_df.shape)
no_nan_df.head()

(3579, 9)


Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,Year,Country,Total_Sales_Class
0,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,2006,Japan,high
2,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,2008,Japan,high
4,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,2009,Japan,high
5,Role-Playing,E,GB,Nintendo,Game Freak,9.4,1998,Japan,high
6,Platform,E,DS,Nintendo,Nintendo EAD,9.1,2006,Japan,high


## Bucket data to top 10 and other bins

In [197]:
# Check unique values
no_nan_df.nunique()

Genre                 18
ESRB_Rating            5
Platform              24
Publisher             62
Developer_x          774
Critic_Score          81
Year                  28
Country               11
Total_Sales_Class      3
dtype: int64

In [198]:
# Keep top 15 of Genre
top = no_nan_df.Genre.value_counts().index[0:15]
no_nan_df.Genre = np.where(no_nan_df.Genre.isin(top), no_nan_df.Genre,'other')

In [199]:
# Keep top 15 of Platform
top = no_nan_df.Platform.value_counts().index[0:15]
no_nan_df.Platform = np.where(no_nan_df.Platform.isin(top), no_nan_df.Platform,'other')

In [200]:
# Keep top 15 of Publisher
top = no_nan_df.Publisher.value_counts().index[0:15]
no_nan_df.Publisher = np.where(no_nan_df.Publisher.isin(top), no_nan_df.Publisher, 'other')

In [201]:
# Keep top 15 of Developer_x
top = no_nan_df.Developer_x.value_counts().index[0:15]
no_nan_df.Developer_x = np.where(no_nan_df.Developer_x.isin(top), no_nan_df.Developer_x,'other')

In [202]:
# Keep top 15 of Year
top = no_nan_df.Year.value_counts().index[0:15]
no_nan_df.Year = np.where(no_nan_df.Year.isin(top), no_nan_df.Year, 'other')

In [203]:
no_nan_df

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,Year,Country,Total_Sales_Class
0,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,2006,Japan,high
2,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,2008,Japan,high
4,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,2009,Japan,high
5,Role-Playing,E,other,Nintendo,other,9.4,other,Japan,high
6,Platform,E,DS,Nintendo,Nintendo EAD,9.1,2006,Japan,high
...,...,...,...,...,...,...,...,...,...
19732,Sports,E,GBA,2K Sports,other,6.6,2006,United States,low
19767,Action,M,PC,Ubisoft,Capcom,7.1,2006,Europe,low
19792,Shooter,T,PC,Activision,other,7.0,2003,United States,low
19794,Action,E,GBA,Atlus,other,6.0,2006,Japan,low


In [204]:
# Check unique values
no_nan_df.nunique()

Genre                16
ESRB_Rating           5
Platform             16
Publisher            16
Developer_x          16
Critic_Score         81
Year                 16
Country              11
Total_Sales_Class     3
dtype: int64

## Encoding categorical variables

In [206]:
# Assign features
X = no_nan_df.drop('Total_Sales_Class', axis = 1)
X

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,Year,Country
0,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,2006,Japan
2,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,2008,Japan
4,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,2009,Japan
5,Role-Playing,E,other,Nintendo,other,9.4,other,Japan
6,Platform,E,DS,Nintendo,Nintendo EAD,9.1,2006,Japan
...,...,...,...,...,...,...,...,...
19732,Sports,E,GBA,2K Sports,other,6.6,2006,United States
19767,Action,M,PC,Ubisoft,Capcom,7.1,2006,Europe
19792,Shooter,T,PC,Activision,other,7.0,2003,United States
19794,Action,E,GBA,Atlus,other,6.0,2006,Japan


In [207]:
X.dtypes

Genre            object
ESRB_Rating      object
Platform         object
Publisher        object
Developer_x      object
Critic_Score    float64
Year             object
Country          object
dtype: object

In [208]:
# Encoding object dtype columns
X_cat = X.select_dtypes(include='object')
X_cat = list(X_cat.columns)
X_cat

['Genre',
 'ESRB_Rating',
 'Platform',
 'Publisher',
 'Developer_x',
 'Year',
 'Country']

In [209]:
from sklearn.preprocessing import OneHotEncoder

# creating instance of one-hot-encoder
enc = OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(X[X_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(X_cat)

encode_df



Unnamed: 0,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Fighting,Genre_Misc,Genre_Music,Genre_Party,Genre_Platform,Genre_Puzzle,Genre_Racing,...,Country_Europe,Country_Finland,Country_France,Country_Japan,Country_Norway,Country_Poland,Country_Russia,Country_South Korea,Country_United Kingdom,Country_United States
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3575,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3577,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [210]:
# Reset X dataframe index to merge with encode_df
X.reset_index(drop=True, inplace=True)
X

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,Year,Country
0,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,2006,Japan
1,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,2008,Japan
2,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,2009,Japan
3,Role-Playing,E,other,Nintendo,other,9.4,other,Japan
4,Platform,E,DS,Nintendo,Nintendo EAD,9.1,2006,Japan
...,...,...,...,...,...,...,...,...
3574,Sports,E,GBA,2K Sports,other,6.6,2006,United States
3575,Action,M,PC,Ubisoft,Capcom,7.1,2006,Europe
3576,Shooter,T,PC,Activision,other,7.0,2003,United States
3577,Action,E,GBA,Atlus,other,6.0,2006,Japan


In [211]:
# Merge one-hot encoded features and drop the originals
X = X.merge(encode_df, left_index=True, right_index=True)
X = X.drop(X_cat,1)
X

  X = X.drop(X_cat,1)


Unnamed: 0,Critic_Score,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Fighting,Genre_Misc,Genre_Music,Genre_Party,Genre_Platform,Genre_Puzzle,...,Country_Europe,Country_Finland,Country_France,Country_Japan,Country_Norway,Country_Poland,Country_Russia,Country_South Korea,Country_United Kingdom,Country_United States
0,7.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3574,6.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3575,7.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3576,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3577,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [212]:
# Assign the target
y = no_nan_df['Total_Sales_Class']
y.value_counts()

low       2471
medium    1044
high        64
Name: Total_Sales_Class, dtype: int64

In [213]:
X.shape

(3579, 97)

In [214]:
y.shape

(3579,)

## Spliting and scale the data

In [215]:
# Split data to training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Check the balance of the target variables.
print(f"y_train: {Counter(y_train)}")
print(f"y_test: {Counter(y_test)}")

y_train: Counter({'low': 1876, 'medium': 759, 'high': 49})
y_test: Counter({'low': 595, 'medium': 285, 'high': 15})


In [216]:
# Creating a StandardScaler instance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Random Forest Classifier Model

In [231]:
# Create a random forest classifier.
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [269]:
# Fitting the model
rf_model_1 = rf_model.fit(X_train_scaled, y_train)

In [270]:
# Making predictions using the testing data.
y_pred_rf = rf_model_1.predict(X_test_scaled)

In [271]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, rf_pred)

# Create a DataFrame from the confusion matrix.
#cm_df = pd.DataFrame(
#    cm, index=["Actual high", "Actual low"], columns=["Predicted high", "Predicted low"])

#cm_df
#cm

In [272]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, y_pred_rf)

In [273]:
# Displaying results
#print("Confusion Matrix")
#display(cm_df)
print('Model: Random Forest Classifier')
print("---------------------")
print(f"Accuracy Score : {acc_score}")
print("---------------------")
print("Classification Report")
print(classification_report(y_test, y_pred_rf))

Model: Random Forest Classifier
---------------------
Accuracy Score : 0.7675977653631285
---------------------
Classification Report
              precision    recall  f1-score   support

        high       0.33      0.13      0.19        15
         low       0.79      0.92      0.85       595
      medium       0.70      0.48      0.57       285

    accuracy                           0.77       895
   macro avg       0.61      0.51      0.54       895
weighted avg       0.75      0.77      0.75       895



## Rank the importance of features

In [335]:
# Calculate feature importance in the Random Forest model.
print("Feature Importance: rf model 1: all columns")
sorted(zip(rf_model_1.feature_importances_, X.columns), reverse=True)

Feature Importance: rf model 1: all columns


[(0.044193684132475, 'Publisher_Nintendo'),
 (0.024093815164273634, 'Critic_Score'),
 (0.02331408452774382, 'ESRB_Rating_T'),
 (0.023167113413960137, 'Publisher_other'),
 (0.022441355704036617, 'Year_other'),
 (0.021531852810864473, 'Year_2006'),
 (0.02121131888136623, 'Genre_other'),
 (0.021122356730877093, 'Platform_PS'),
 (0.0204750665301325, 'Genre_Racing'),
 (0.02002338589631102, 'Genre_Puzzle'),
 (0.01994935792891028, 'Year_2005'),
 (0.019649831282753518, 'Genre_Role-Playing'),
 (0.019486429569204735, 'ESRB_Rating_M'),
 (0.019119098129298708, 'Genre_Simulation'),
 (0.019004210884545, 'Genre_Misc'),
 (0.018669435828678325, 'Platform_NS'),
 (0.018300963522922615, 'Platform_PS2'),
 (0.017902777995300025, 'Year_2008'),
 (0.0178796012782204, 'Platform_WiiU'),
 (0.01785230921936681, 'ESRB_Rating_E'),
 (0.01779728100900375, 'Year_2009'),
 (0.01767476621527384, 'Genre_Party'),
 (0.01750212516032465, 'Developer_x_other'),
 (0.017087933784941552, 'Publisher_EA Sports'),
 (0.017062176582503

## Balanced Random Forest Classifier Model

In [274]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf_model = BalancedRandomForestClassifier(n_estimators=128, random_state = 78) 

# Fitting the model
brf_model_1 = brf_model.fit(X_train, y_train)

In [275]:
# Calculated the balanced accuracy score
y_pred_brf = brf_model_1.predict(X_test)

from sklearn.metrics import balanced_accuracy_score
brf_acc_score = balanced_accuracy_score(y_test, y_pred_brf)

In [55]:
# Display the confusion matrix
#from sklearn.metrics import confusion_matrix
#pd.DataFrame(
#    confusion_matrix(y_test, y_pred_brf),
#    index=["Actual high", "Actual low"],
#    columns=["Predicted high", "Predicted low"])

In [276]:
# Print the imbalanced classification report
print('Model: Balanced Random Forest Classifier')
print("---------------------")
print(f"Accuracy Score : {brf_acc_score}")
print("---------------------")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred_brf))

Model: Balanced Random Forest Classifier
---------------------
Accuracy Score : 0.6938915917244092
---------------------
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

       high       0.10      0.93      0.86      0.18      0.89      0.81        15
        low       0.84      0.74      0.73      0.79      0.73      0.54       595
     medium       0.50      0.41      0.81      0.45      0.58      0.32       285

avg / total       0.72      0.64      0.75      0.67      0.69      0.47       895



In [334]:
# Calculate feature importance
print("Feature Importance: brf model 1")
sorted(zip(brf_model_1.feature_importances_, X.columns), reverse=True)

Feature Importance: brf model 1


[(0.07361605882687765, 'Publisher_Nintendo'),
 (0.03991922858109818, 'ESRB_Rating_M'),
 (0.03756262178449561, 'Publisher_other'),
 (0.0311087607482961, 'Year_other'),
 (0.029896778270020227, 'ESRB_Rating_T'),
 (0.02707309587530154, 'Critic_Score'),
 (0.023744960394805253, 'Platform_WiiU'),
 (0.023251814966422476, 'Developer_x_Neversoft Entertainment'),
 (0.02314405765967697, 'Genre_Role-Playing'),
 (0.023114594743123638, 'Genre_other'),
 (0.020426000009680503, 'Genre_Racing'),
 (0.01988718030573432, 'Publisher_2K Sports'),
 (0.019406725068274764, 'Developer_x_other'),
 (0.019282843096206714, 'Platform_PS'),
 (0.01817565941752642, 'Year_2006'),
 (0.01772930951146559, 'Platform_PS3'),
 (0.016978543325082633, 'Platform_PS2'),
 (0.01692302584390739, 'Year_2009'),
 (0.016591320939371896, 'Genre_Party'),
 (0.016448677822674673, 'Genre_Simulation'),
 (0.01620909396185332, 'Platform_PC'),
 (0.016175524304636354, 'Genre_Misc'),
 (0.01602682179905282, 'Publisher_EA Sports'),
 (0.0152427632179712

## Easy Ensemble AdaBoost Classifier Model

In [277]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 

eec_model = EasyEnsembleClassifier(n_estimators=128, random_state=78)

eec_model_1 = eec_model.fit(X_train, y_train)

In [278]:
# Calculated the balanced accuracy score
y_pred_eec = eec_model_1.predict(X_test)

ecc_acc_score = balanced_accuracy_score(y_test, y_pred_eec)

In [279]:
# Display the confusion matrix
#pd.DataFrame(
#    confusion_matrix(y_test, y_pred_eec),
#    index=["Actual high_risk", "Actual low_risk"],
#    columns=["Predicted high_risk", "Predicted low_risk"])

In [280]:
# Print the imbalanced classification report
print('Model: EasyEnsembleClassifier')
print("---------------------")
print(f"Accuracy Score : {ecc_acc_score}")
print("---------------------")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred_eec))

Model: EasyEnsembleClassifier
---------------------
Accuracy Score : 0.6665880387242616
---------------------
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

       high       0.14      0.73      0.93      0.24      0.82      0.67        15
        low       0.86      0.65      0.79      0.74      0.72      0.50       595
     medium       0.47      0.62      0.68      0.54      0.65      0.42       285

avg / total       0.73      0.64      0.76      0.67      0.70      0.48       895



# Dataframe 2: No 'Critic_Score' column
row count after dropping NaNs = 9383

In [240]:
# Drop 'Critic_Score' column 
no_critic_df = games_df.drop(['Critic_Score','Total_Sales'], axis=1).dropna()
print(no_critic_df.shape)
no_critic_df.head()

(9383, 8)


Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Year,Country,Total_Sales_Class
0,Sports,E,Wii,Nintendo,Nintendo EAD,2006,Japan,high
2,Racing,E,Wii,Nintendo,Nintendo EAD,2008,Japan,high
4,Sports,E,Wii,Nintendo,Nintendo EAD,2009,Japan,high
5,Role-Playing,E,GB,Nintendo,Game Freak,1998,Japan,high
6,Platform,E,DS,Nintendo,Nintendo EAD,2006,Japan,high


In [241]:
# Check unique values
no_critic_df.nunique()

Genre                  19
ESRB_Rating             6
Platform               29
Publisher             100
Developer_x          1387
Year                   32
Country                17
Total_Sales_Class       3
dtype: int64

In [242]:
# Keep top 15 of Genre
top = no_critic_df.Genre.value_counts().index[0:15]
no_critic_df.Genre = np.where(no_critic_df.Genre.isin(top), no_critic_df.Genre,'other')

In [243]:
# Keep top 15 of Platform
top = no_critic_df.Platform.value_counts().index[0:15]
no_critic_df.Platform = np.where(no_critic_df.Platform.isin(top), no_critic_df.Platform,'other')

In [244]:
# Keep top 15 of Publisher
top = no_critic_df.Publisher.value_counts().index[0:15]
no_critic_df.Publisher = np.where(no_critic_df.Publisher.isin(top), no_critic_df.Publisher, 'other')

In [245]:
# Keep top 15 of Developer_x
top = no_critic_df.Developer_x.value_counts().index[0:15]
no_critic_df.Developer_x = np.where(no_critic_df.Developer_x.isin(top), no_critic_df.Developer_x,'other')

In [246]:
# Keep top 15 of Year
top = no_critic_df.Year.value_counts().index[0:15]
no_critic_df.Year = np.where(no_critic_df.Year.isin(top), no_critic_df.Year, 'other')

In [254]:
# Assign features
X2 = no_critic_df.drop('Total_Sales_Class', axis = 1)
X2

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Year,Country
0,Sports,E,Wii,Nintendo,Nintendo EAD,2006,Japan
2,Racing,E,Wii,Nintendo,Nintendo EAD,2008,Japan
4,Sports,E,Wii,Nintendo,Nintendo EAD,2009,Japan
5,Role-Playing,E,other,Nintendo,other,other,Japan
6,Platform,E,DS,Nintendo,Nintendo EAD,2006,Japan
...,...,...,...,...,...,...,...
19823,Adventure,E10,PC,other,other,2008,United States
19838,Strategy,T,PC,Sega,other,2006,United States
19850,Simulation,M,XOne,other,other,2017,AustriaSweden
19856,Platform,E,3DS,Nintendo,other,2011,Japan


In [255]:
# Encoding object dtype columns
X_cat2 = X2.select_dtypes(include='object')
X_cat2 = list(X_cat2.columns)
X_cat2

['Genre',
 'ESRB_Rating',
 'Platform',
 'Publisher',
 'Developer_x',
 'Year',
 'Country']

In [256]:
from sklearn.preprocessing import OneHotEncoder

# creating instance of one-hot-encoder
enc = OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df2 = pd.DataFrame(enc.fit_transform(X2[X_cat2]))

# Add the encoded variable names to the dataframe
encode_df2.columns = enc.get_feature_names(X_cat2)

encode_df2



Unnamed: 0,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Fighting,Genre_MMO,Genre_Misc,Genre_Music,Genre_Platform,Genre_Puzzle,Genre_Racing,...,Country_Italy,Country_Japan,Country_Netherlands,Country_Norway,Country_Poland,Country_Russia,Country_South Korea,Country_Spain,Country_United Kingdom,Country_United States
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9378,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [257]:
# Reset X dataframe index to merge with encode_df
X2.reset_index(drop=True, inplace=True)
X2

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Year,Country
0,Sports,E,Wii,Nintendo,Nintendo EAD,2006,Japan
1,Racing,E,Wii,Nintendo,Nintendo EAD,2008,Japan
2,Sports,E,Wii,Nintendo,Nintendo EAD,2009,Japan
3,Role-Playing,E,other,Nintendo,other,other,Japan
4,Platform,E,DS,Nintendo,Nintendo EAD,2006,Japan
...,...,...,...,...,...,...,...
9378,Adventure,E10,PC,other,other,2008,United States
9379,Strategy,T,PC,Sega,other,2006,United States
9380,Simulation,M,XOne,other,other,2017,AustriaSweden
9381,Platform,E,3DS,Nintendo,other,2011,Japan


In [258]:
# Merge one-hot encoded features and drop the originals
X2 = X2.merge(encode_df2, left_index=True, right_index=True)
X2 = X2.drop(X_cat2, 1)
X2

  X2 = X2.drop(X_cat2, 1)


Unnamed: 0,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Fighting,Genre_MMO,Genre_Misc,Genre_Music,Genre_Platform,Genre_Puzzle,Genre_Racing,...,Country_Italy,Country_Japan,Country_Netherlands,Country_Norway,Country_Poland,Country_Russia,Country_South Korea,Country_Spain,Country_United Kingdom,Country_United States
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9378,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [253]:
# Assign the target
y2 = no_critic_df['Total_Sales_Class']
y2.value_counts()

low       7591
medium    1717
high        75
Name: Total_Sales_Class, dtype: int64

In [259]:
print(X2.shape)
print(y2.shape)

(9383, 103)
(9383,)


In [260]:
# Split data to training and testing set
from sklearn.model_selection import train_test_split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=1)

# Check the balance of the target variables.
print(f"y2_train: {Counter(y2_train)}")
print(f"y2_test: {Counter(y2_test)}")

y2_train: Counter({'low': 5677, 'medium': 1303, 'high': 57})
y2_test: Counter({'low': 1914, 'medium': 414, 'high': 18})


In [261]:
# Creating a StandardScaler instance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X2_scaler = scaler.fit(X2_train)

# Scaling the data.
X2_train_scaled = X2_scaler.transform(X2_train)
X2_test_scaled = X2_scaler.transform(X2_test)

## Fit the models

In [281]:
# Fitting the model
rf_model_2 = rf_model.fit(X2_train_scaled, y2_train)

In [282]:
# Making predictions using the testing data.
y2_pred_rf = rf_model_2.predict(X2_test_scaled)

In [283]:
acc_score_2 = accuracy_score(y2_test, y2_pred_rf)

In [284]:
print('Model: Random Forest Classifier 2')
print("---------------------")
print(f"Accuracy Score : {acc_score_2}")
print("---------------------")
print("Classification Report")
print(classification_report(y2_test, y2_pred_rf))

Model: Random Forest Classifier 2
---------------------
Accuracy Score : 0.8226768968456948
---------------------
Classification Report
              precision    recall  f1-score   support

        high       0.11      0.06      0.07        18
         low       0.86      0.94      0.90      1914
      medium       0.52      0.31      0.39       414

    accuracy                           0.82      2346
   macro avg       0.50      0.44      0.45      2346
weighted avg       0.80      0.82      0.80      2346



In [332]:
# Calculate feature importance in the Random Forest model 2.
print("Feature Importance: rf model 2: no 'Critic_Score' column")
sorted(zip(rf_model_2.feature_importances_, X2.columns), reverse=True)

Feature Importance: rf model 2: no 'Critic_Score' column


[(0.044193684132475, 'Publisher_Nintendo'),
 (0.024093815164273634, 'Genre_Action'),
 (0.02331408452774382, 'ESRB_Rating_T'),
 (0.023167113413960137, 'Publisher_other'),
 (0.022441355704036617, 'Year_other'),
 (0.021531852810864473, 'Year_2008'),
 (0.02121131888136623, 'ESRB_Rating_E'),
 (0.021122356730877093, 'Platform_PS2'),
 (0.0204750665301325, 'Genre_Role-Playing'),
 (0.02002338589631102, 'Genre_Racing'),
 (0.01994935792891028, 'Year_2007'),
 (0.019649831282753518, 'Genre_Shooter'),
 (0.019486429569204735, 'ESRB_Rating_M'),
 (0.019119098129298708, 'Genre_Sports'),
 (0.019004210884545, 'Genre_Misc'),
 (0.018669435828678325, 'Platform_PC'),
 (0.018300963522922615, 'Platform_PS3'),
 (0.017902777995300025, 'Year_2010'),
 (0.0178796012782204, 'Platform_X360'),
 (0.01785230921936681, 'ESRB_Rating_E10'),
 (0.01779728100900375, 'Year_2011'),
 (0.01767476621527384, 'Genre_Platform'),
 (0.01750212516032465, 'Developer_x_other'),
 (0.017087933784941552, 'Publisher_EA Sports'),
 (0.0170621765

In [285]:
# Fitting the model
brf_model_2 = brf_model.fit(X2_train, y2_train)

In [286]:
# Calculated the balanced accuracy score
y2_pred_brf = brf_model_2.predict(X2_test)

brf_acc_score_2 = balanced_accuracy_score(y2_test, y2_pred_brf)

In [287]:
# Print the imbalanced classification report
print('Model: Balanced Random Forest Classifier 2')
print("---------------------")
print(f"Accuracy Score : {brf_acc_score_2}")
print("---------------------")
print("Classification Report")
print(classification_report_imbalanced(y2_test, y2_pred_brf))

Model: Balanced Random Forest Classifier 2
---------------------
Accuracy Score : 0.6515681553162813
---------------------
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

       high       0.06      0.83      0.90      0.11      0.87      0.75        18
        low       0.91      0.69      0.68      0.79      0.69      0.47      1914
     medium       0.28      0.43      0.76      0.34      0.57      0.32       414

avg / total       0.79      0.65      0.70      0.70      0.67      0.45      2346



In [328]:
# Calculate feature importance
print("Feature Importance: brf model 2")
sorted(zip(brf_model_2.feature_importances_, X2.columns), reverse=True)

Feature Importance: brf model 2


[(0.07361605882687765, 'Publisher_Nintendo'),
 (0.03991922858109818, 'ESRB_Rating_M'),
 (0.03756262178449561, 'Publisher_other'),
 (0.0311087607482961, 'Year_other'),
 (0.029896778270020227, 'ESRB_Rating_T'),
 (0.02707309587530154, 'Genre_Action'),
 (0.023744960394805253, 'Platform_X360'),
 (0.023251814966422476, 'Developer_x_Namco'),
 (0.02314405765967697, 'Genre_Shooter'),
 (0.023114594743123638, 'ESRB_Rating_E'),
 (0.020426000009680503, 'Genre_Role-Playing'),
 (0.01988718030573432, 'Publisher_Acclaim Entertainment'),
 (0.019406725068274764, 'Developer_x_other'),
 (0.019282843096206714, 'Platform_PS2'),
 (0.01817565941752642, 'Year_2008'),
 (0.01772930951146559, 'Platform_PS4'),
 (0.016978543325082633, 'Platform_PS3'),
 (0.01692302584390739, 'Year_2011'),
 (0.016591320939371896, 'Genre_Platform'),
 (0.016448677822674673, 'Genre_Sports'),
 (0.01620909396185332, 'Platform_PS'),
 (0.016175524304636354, 'Genre_Misc'),
 (0.01602682179905282, 'Publisher_EA Sports'),
 (0.015242763217971245,

In [289]:
eec_model_2 = eec_model.fit(X2_train, y2_train)

In [290]:
# Calculated the balanced accuracy score
y2_pred_eec = eec_model_2.predict(X2_test)

ecc_acc_score_2 = balanced_accuracy_score(y2_test, y2_pred_eec)

In [291]:
# Print the imbalanced classification report
print('Model: EasyEnsembleClassifier 2')
print("---------------------")
print(f"Accuracy Score : {ecc_acc_score_2}")
print("---------------------")
print("Classification Report")
print(classification_report_imbalanced(y2_test, y2_pred_eec))

Model: EasyEnsembleClassifier 2
---------------------
Accuracy Score : 0.6290364918550826
---------------------
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

       high       0.05      0.83      0.88      0.09      0.85      0.73        18
        low       0.90      0.62      0.68      0.74      0.65      0.42      1914
     medium       0.25      0.43      0.72      0.32      0.56      0.30       414

avg / total       0.78      0.59      0.69      0.66      0.64      0.40      2346



# Dataframe 3: No 'Critic_Score' and 'Country' columns
row count after dropping NaNs = 13922

In [292]:
# Drop 'Country' column 
no_country_critic_df = games_df.drop(['Country', 'Critic_Score', 'Total_Sales'], axis=1).dropna()
print(no_country_critic_df.shape)
no_country_critic_df.head()

(13922, 7)


Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Year,Total_Sales_Class
0,Sports,E,Wii,Nintendo,Nintendo EAD,2006,high
2,Racing,E,Wii,Nintendo,Nintendo EAD,2008,high
4,Sports,E,Wii,Nintendo,Nintendo EAD,2009,high
5,Role-Playing,E,GB,Nintendo,Game Freak,1998,high
6,Platform,E,DS,Nintendo,Nintendo EAD,2006,high


In [293]:
# Check unique values
no_country_critic_df.nunique()

Genre                  20
ESRB_Rating             6
Platform               31
Publisher             432
Developer_x          2250
Year                   33
Total_Sales_Class       3
dtype: int64

In [294]:
# Keep top 15 of Genre
top = no_country_critic_df.Genre.value_counts().index[0:15]
no_country_critic_df.Genre = np.where(no_country_critic_df.Genre.isin(top), no_country_critic_df.Genre,'other')

In [295]:
# Keep top 15 of Platform
top = no_country_critic_df.Platform.value_counts().index[0:15]
no_country_critic_df.Platform = np.where(no_country_critic_df.Platform.isin(top), no_country_critic_df.Platform,'other')

In [296]:
# Keep top 15 of Publisher
top = no_country_critic_df.Publisher.value_counts().index[0:15]
no_country_critic_df.Publisher = np.where(no_country_critic_df.Publisher.isin(top), no_country_critic_df.Publisher, 'other')

In [297]:
# Keep top 15 of Developer_x
top = no_country_critic_df.Developer_x.value_counts().index[0:15]
no_country_critic_df.Developer_x = np.where(no_country_critic_df.Developer_x.isin(top), no_country_critic_df.Developer_x,'other')

In [298]:
# Keep top 15 of Year
top = no_country_critic_df.Year.value_counts().index[0:15]
no_country_critic_df.Year = np.where(no_country_critic_df.Year.isin(top), no_country_critic_df.Year, 'other')

In [299]:
# Assign features
X3 = no_country_critic_df.drop('Total_Sales_Class', axis = 1)
X3

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Year
0,Sports,E,Wii,Nintendo,Nintendo EAD,2006
2,Racing,E,Wii,Nintendo,Nintendo EAD,2008
4,Sports,E,Wii,Nintendo,Nintendo EAD,2009
5,Role-Playing,E,other,Nintendo,other,other
6,Platform,E,DS,Nintendo,Nintendo EAD,2006
...,...,...,...,...,...,...
19850,Simulation,M,XOne,other,other,2017
19854,Adventure,E,DS,Majesco,other,2007
19856,Platform,E,3DS,Nintendo,other,2011
19857,Simulation,T,PC,other,other,2004


In [301]:
# Encoding object dtype columns
X_cat3 = X3.select_dtypes(include='object')
X_cat3 = list(X_cat3.columns)
X_cat3

['Genre', 'ESRB_Rating', 'Platform', 'Publisher', 'Developer_x', 'Year']

In [302]:
from sklearn.preprocessing import OneHotEncoder

# creating instance of one-hot-encoder
enc = OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df3 = pd.DataFrame(enc.fit_transform(X3[X_cat3]))

# Add the encoded variable names to the dataframe
encode_df3.columns = enc.get_feature_names(X_cat3)

encode_df3



Unnamed: 0,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Fighting,Genre_MMO,Genre_Misc,Genre_Music,Genre_Platform,Genre_Puzzle,Genre_Racing,...,Year_2007,Year_2008,Year_2009,Year_2010,Year_2011,Year_2014,Year_2016,Year_2017,Year_2018,Year_other
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13918,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [303]:
# Reset X dataframe index to merge with encode_df
X3.reset_index(drop=True, inplace=True)
X3

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Year
0,Sports,E,Wii,Nintendo,Nintendo EAD,2006
1,Racing,E,Wii,Nintendo,Nintendo EAD,2008
2,Sports,E,Wii,Nintendo,Nintendo EAD,2009
3,Role-Playing,E,other,Nintendo,other,other
4,Platform,E,DS,Nintendo,Nintendo EAD,2006
...,...,...,...,...,...,...
13917,Simulation,M,XOne,other,other,2017
13918,Adventure,E,DS,Majesco,other,2007
13919,Platform,E,3DS,Nintendo,other,2011
13920,Simulation,T,PC,other,other,2004


In [304]:
# Merge one-hot encoded features and drop the originals
X3 = X3.merge(encode_df3, left_index=True, right_index=True)
X3 = X3.drop(X_cat3, 1)
X3

  X3 = X3.drop(X_cat3, 1)


Unnamed: 0,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Fighting,Genre_MMO,Genre_Misc,Genre_Music,Genre_Platform,Genre_Puzzle,Genre_Racing,...,Year_2007,Year_2008,Year_2009,Year_2010,Year_2011,Year_2014,Year_2016,Year_2017,Year_2018,Year_other
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13918,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
13920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [305]:
# Assign the target
y3 = no_country_critic_df['Total_Sales_Class']
y3.value_counts()

low       11817
medium     2024
high         81
Name: Total_Sales_Class, dtype: int64

In [306]:
print(X3.shape)
print(y3.shape)

(13922, 86)
(13922,)


In [307]:
# Split data to training and testing set
from sklearn.model_selection import train_test_split
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=1)

# Check the balance of the target variables.
print(f"y3_train: {Counter(y3_train)}")
print(f"y3_test: {Counter(y3_test)}")

y3_train: Counter({'low': 8869, 'medium': 1512, 'high': 60})
y3_test: Counter({'low': 2948, 'medium': 512, 'high': 21})


In [308]:
# Creating a StandardScaler instance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X3_scaler = scaler.fit(X3_train)

# Scaling the data.
X3_train_scaled = X3_scaler.transform(X3_train)
X3_test_scaled = X3_scaler.transform(X3_test)

In [309]:
# Fitting the model
rf_model_3 = rf_model.fit(X3_train_scaled, y3_train)
# Making predictions using the testing data.
y3_pred_rf = rf_model_3.predict(X3_test_scaled)
acc_score_3 = accuracy_score(y3_test, y3_pred_rf)

In [310]:
print('Model: Random Forest Classifier 3')
print("---------------------")
print(f"Accuracy Score : {acc_score_3}")
print("---------------------")
print("Classification Report")
print(classification_report(y3_test, y3_pred_rf))

Model: Random Forest Classifier 3
---------------------
Accuracy Score : 0.8463085320310255
---------------------
Classification Report
              precision    recall  f1-score   support

        high       0.14      0.10      0.11        21
         low       0.89      0.95      0.92      2948
      medium       0.48      0.29      0.36       512

    accuracy                           0.85      3481
   macro avg       0.50      0.44      0.46      3481
weighted avg       0.82      0.85      0.83      3481



In [327]:
# Calculate feature importance
print("Feature Importance: rf model 3")
sorted(zip(rf_model_3.feature_importances_, X3.columns), reverse=True)

Feature Importance: rf model 3


[(0.044193684132475, 'Publisher_Nintendo'),
 (0.024093815164273634, 'Genre_Action'),
 (0.02331408452774382, 'ESRB_Rating_T'),
 (0.023167113413960137, 'Publisher_other'),
 (0.022441355704036617, 'Year_other'),
 (0.021531852810864473, 'Year_2008'),
 (0.02121131888136623, 'ESRB_Rating_E'),
 (0.021122356730877093, 'Platform_PS2'),
 (0.0204750665301325, 'Genre_Role-Playing'),
 (0.02002338589631102, 'Genre_Racing'),
 (0.01994935792891028, 'Year_2007'),
 (0.019649831282753518, 'Genre_Shooter'),
 (0.019486429569204735, 'ESRB_Rating_M'),
 (0.019119098129298708, 'Genre_Sports'),
 (0.019004210884545, 'Genre_Misc'),
 (0.018669435828678325, 'Platform_PC'),
 (0.018300963522922615, 'Platform_PS3'),
 (0.017902777995300025, 'Year_2010'),
 (0.0178796012782204, 'Platform_X360'),
 (0.01785230921936681, 'ESRB_Rating_E10'),
 (0.01779728100900375, 'Year_2011'),
 (0.01767476621527384, 'Genre_Platform'),
 (0.01750212516032465, 'Developer_x_other'),
 (0.017087933784941552, 'Publisher_Electronic Arts'),
 (0.0170

In [311]:
# Fitting the BalancedRandomForestClassifier model
brf_model_3 = brf_model.fit(X3_train, y3_train)
y3_pred_brf = brf_model_3.predict(X3_test)

brf_acc_score_3 = balanced_accuracy_score(y3_test, y3_pred_brf)

In [312]:
# Print the imbalanced classification report
print('Model: Balanced Random Forest Classifier 3')
print("---------------------")
print(f"Accuracy Score : {brf_acc_score_3}")
print("---------------------")
print("Classification Report")
print(classification_report_imbalanced(y3_test, y3_pred_brf))

Model: Balanced Random Forest Classifier 3
---------------------
Accuracy Score : 0.63445640439577
---------------------
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

       high       0.04      0.81      0.87      0.07      0.84      0.70        21
        low       0.92      0.68      0.69      0.78      0.69      0.47      2948
     medium       0.25      0.41      0.78      0.31      0.57      0.31       512

avg / total       0.82      0.64      0.71      0.71      0.67      0.45      3481



In [326]:
# Calculate feature importance
print("Feature Importance: brf model 3")
sorted(zip(brf_model_3.feature_importances_, X3.columns), reverse=True)

Feature Importance: brf model 3


[(0.07361605882687765, 'Publisher_Nintendo'),
 (0.03991922858109818, 'ESRB_Rating_M'),
 (0.03756262178449561, 'Publisher_other'),
 (0.0311087607482961, 'Year_other'),
 (0.029896778270020227, 'ESRB_Rating_T'),
 (0.02707309587530154, 'Genre_Action'),
 (0.023744960394805253, 'Platform_X360'),
 (0.023251814966422476, 'Developer_x_Nintendo EAD'),
 (0.02314405765967697, 'Genre_Shooter'),
 (0.023114594743123638, 'ESRB_Rating_E'),
 (0.020426000009680503, 'Genre_Role-Playing'),
 (0.01988718030573432, 'Publisher_Activision'),
 (0.019406725068274764, 'Developer_x_other'),
 (0.019282843096206714, 'Platform_PS2'),
 (0.01817565941752642, 'Year_2008'),
 (0.01772930951146559, 'Platform_PS4'),
 (0.016978543325082633, 'Platform_PS3'),
 (0.01692302584390739, 'Year_2011'),
 (0.016591320939371896, 'Genre_Platform'),
 (0.016448677822674673, 'Genre_Sports'),
 (0.01620909396185332, 'Platform_PS'),
 (0.016175524304636354, 'Genre_Misc'),
 (0.01602682179905282, 'Publisher_Electronic Arts'),
 (0.01524276321797124

In [314]:
# Fitting the EasyEnsembleClassifier model
eec_model_3 = eec_model.fit(X3_train, y3_train)
y3_pred_eec = eec_model_3.predict(X3_test)

ecc_acc_score_3 = balanced_accuracy_score(y3_test, y3_pred_eec)

In [315]:
# Print the imbalanced classification report
print('Model: EasyEnsembleClassifier 3')
print("---------------------")
print(f"Accuracy Score : {ecc_acc_score_3}")
print("---------------------")
print("Classification Report")
print(classification_report_imbalanced(y3_test, y3_pred_eec))

Model: EasyEnsembleClassifier 3
---------------------
Accuracy Score : 0.5716041091215999
---------------------
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

       high       0.03      0.57      0.87      0.05      0.70      0.48        21
        low       0.93      0.66      0.72      0.77      0.69      0.47      2948
     medium       0.27      0.49      0.77      0.35      0.61      0.36       512

avg / total       0.83      0.63      0.73      0.70      0.68      0.45      3481

