In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
BASE_DIR = os.getcwd()
ready_csv_path = os.path.join(BASE_DIR, "mtg-nums.csv")

In [3]:
df = pd.read_csv(ready_csv_path)

In [4]:
df.head()

Unnamed: 0,cmc,power,toughness,color,type
0,6.0,3.0,3.0,1,0
1,2.0,2.0,2.0,2,0
2,0.0,1.0,1.0,2,1
3,3.0,1.0,2.0,3,0
4,1.0,2.0,1.0,2,0


### Prepare data that we're going to feed to the classifier

#### Set efficient dtypes

In [5]:
df['cmc'] = df['cmc'].astype('int16')
df['power'] = df['power'].astype('int16')
df['toughness'] = df['toughness'].astype('int16')
df['color'] = df['color'].astype('int16')
df['type'] = df['type'].astype('int16')

#### Declare target and features

In [6]:
y = df['type']

features = ['cmc', 'power', 'toughness', 'color']
X = df[features]

#### Split the data into training and test

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
print(f"x_train: {x_train.shape}")
print(f"x_test: {x_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

x_train: (20469, 4)
x_test: (6823, 4)
y_train: (20469,)
y_test: (6823,)


#### Convert data to numpy arrays

In [9]:
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

### Random Forest Classifier

#### Using GridSearchCV to tune the parameters

In [10]:
rf = RandomForestClassifier()

# Based on first grid search run and fine-tuning n_estimators after that
n_estimators = [77]
max_features = ['auto']
max_depth = [4]
min_samples_split = [5]
min_samples_leaf = [2]
bootstrap = [True]
random_state = [1]

param_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
    "random_state": random_state
}


rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=1, n_jobs=2)

rf_grid.fit(x_train, y_train)

print(f"Train accuracy: {rf_grid.score(x_train, y_train):.3f}")
print(f"Test accuracy : {rf_grid.score(x_test, y_test):.3f}")
print(f"Best parameters:\n {rf_grid.best_params_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Train accuracy: 0.824
Test accuracy : 0.824
Best parameters:
 {'bootstrap': True, 'max_depth': 4, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 77, 'random_state': 1}


#### Fitting the model with best parameters

In [11]:
rf_model = RandomForestClassifier(n_estimators=77, max_depth=4, min_samples_split=5, min_samples_leaf=2, max_features='auto', bootstrap=True, random_state=1)
rf_model.fit(x_train, y_train)
forest_predictions = rf_model.predict(x_test)

In [12]:
score = accuracy_score(y_test, forest_predictions)
mae = mean_absolute_error(y_test, forest_predictions)
print(f"{mae:.3f} - Mean Absolute Error")
print(f"{score:.3f} - Accuracy score")

0.176 - Mean Absolute Error
0.824 - Accuracy score


#### Fitting the model with no parameters passed except random_state

In [13]:
rf_model_2 = RandomForestClassifier(random_state=1)
rf_model_2.fit(x_train, y_train)
forest_predictions_2 = rf_model_2.predict(x_test)

score = accuracy_score(y_test, forest_predictions_2)
mae = mean_absolute_error(y_test, forest_predictions_2)
print(f"{mae:.3f} - Mean Absolute Error")
print(f"{score:.3f} - Accuracy score")

0.163 - Mean Absolute Error
0.837 - Accuracy score
