In [None]:
#Get datasets
!wget -q https://github.com/jomokojomoko/CISCASSIGNMENT/raw/main/Video_games_esrb_rating.csv
!wget -q https://github.com/jomokojomoko/CISCASSIGNMENT/raw/main/test_esrb.csv


In [None]:
#Get training and testing sets
import pandas as pd
import csv
from sklearn import preprocessing
from keras.utils import np_utils
def _read_csv_tolerant(path):
    """Read a CSV file, warning about and skipping malformed rows.

    ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed in
    pandas 2.0, where passing it raises ``TypeError``. Its documented
    replacement is ``on_bad_lines``; ``'warn'`` keeps the behaviour the old
    flag had on its own (bad rows are reported and dropped). The legacy
    keyword is retained only as a fallback for pandas older than 1.3.

    Parameters
    ----------
    path : str
        Location of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed file without the rows pandas could not parse.
    """
    try:
        return pd.read_csv(path, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know ``on_bad_lines`` and rejects it.
        return pd.read_csv(path, error_bad_lines=False)


xy_train_df = _read_csv_tolerant('Video_games_esrb_rating.csv')
test = _read_csv_tolerant('test_esrb.csv')
#Get x sets: every column except the game title and the target label
xtrain = xy_train_df.drop(columns=['title','esrb_rating'])
xtest = test.drop(columns=['title','esrb_rating'])
#Get y sets: the ESRB rating is the label to predict
ytrain = xy_train_df['esrb_rating']
ytest = test['esrb_rating']
print(ytrain)

0        E
1       ET
2        M
3       ET
4        T
        ..
1890     M
1891     T
1892     E
1893     T
1894     E
Name: esrb_rating, Length: 1895, dtype: object


In [None]:
# model training and tuning
#Importing the sklearn packages needed
import numpy as np
from scipy.stats import loguniform
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix
# Fix NumPy's global random state so the run is repeatable.
np.random.seed(0)

# Columns fed to the model. Order is kept as-is because it determines the
# order of the encoded features handed to the classifier.
# ('strong_janguage' is spelled exactly as the column is referenced here.)
categorical_features = [
    'console',
    'alcohol_reference', 'animated_blood', 'blood', 'blood_and_gore',
    'cartoon_violence', 'crude_humor', 'drug_reference', 'fantasy_violence',
    'intense_violence', 'language', 'lyrics', 'mature_humor',
    'mild_blood', 'mild_cartoon_violence', 'mild_fantasy_violence',
    'mild_language', 'mild_lyrics', 'mild_suggestive_themes', 'mild_violence',
    'no_descriptors', 'nudity', 'partial_nudity',
    'sexual_content', 'sexual_themes', 'simulated_gambling',
    'strong_janguage', 'strong_sexual_content', 'suggestive_themes',
    'use_of_alcohol', 'use_of_drugs_and_alcohol', 'violence',
]

# Per-column preprocessing: missing values become the constant 0, then each
# distinct value is one-hot encoded. Values unseen during fitting are ignored
# at prediction time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route the listed columns through the categorical preprocessing above.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)]
)

# Full model: preprocessing followed by an XGBoost multiclass classifier.
# The step is named 'regressor' even though it classifies; the name is kept
# because the hyper-parameter keys below refer to it.
# NOTE(review): the target passed to fit() holds string labels; recent xgboost
# releases appear to require integer-encoded classes -- confirm against the
# installed version before upgrading.
regr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBClassifier(objective='multi:softmax', seed=3)),
    ]
)

# Restrict the training and testing frames to the selected feature columns.
xtrain = xtrain[list(categorical_features)]
X_test = xtest[list(categorical_features)]

# Hyper-parameter candidates. The `<step>__<param>` syntax addresses a
# parameter of a pipeline step, e.g. `regressor__n_estimators` is the
# `n_estimators` argument of the XGBoost step.
param_grid = {
    # 4 options for the number of trees
    'regressor__n_estimators': [10, 20, 30, 40],
    # 4 options for the maximum tree depth
    'regressor__max_depth': [2, 4, 6, 8],
}

# Cross-validation scheme: stratified 10-fold, repeated 3 times (30 fits per
# candidate).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Hyper-parameter search scored by accuracy. n_iter=16 equals the number of
# combinations in the 4 x 4 grid above, so the randomized search ends up
# trying each combination once.
grid_search = RandomizedSearchCV(
    regr,
    param_grid,
    n_iter=16,
    cv=cv,
    verbose=3,
    n_jobs=2,
    scoring='accuracy',
)
grid_search.fit(xtrain, ytrain)

print('best score {}'.format(grid_search.best_score_))

Fitting 30 folds for each of 16 candidates, totalling 480 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    3.0s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   18.6s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   54.6s
[Parallel(n_jobs=2)]: Done 480 out of 480 | elapsed:  2.4min finished


best score 0.8487171632785668


In [None]:
#Print Confusion Matrix
# Import the metric explicitly at the top of the cell: a bare `import sklearn`
# does not by itself load the `sklearn.metrics` submodule, so the attribute
# lookup only worked because another cell had already imported it.
from sklearn.metrics import accuracy_score

# Predict with the best pipeline found by the search. The pipeline's
# ColumnTransformer is configured with the feature-column list, so the test
# frame is passed as-is.
y_pred = grid_search.best_estimator_.predict(xtest)
# Confusion matrix of the true test labels (ytest) against the predictions.
print (confusion_matrix(ytest, y_pred))
# Overall accuracy on the held-out test set.
acc=accuracy_score(ytest,y_pred)
print(acc)

[[ 87   2   0  11]
 [  4 111   0  11]
 [  0   0  60  30]
 [  3  18   6 157]]
0.83
