## Import Dataset

In [1]:
import pandas as pd

# header=None: the file is read as having no header row, so the column
# labels are supplied explicitly through `names`.
dataset = pd.read_csv(
    'data/banknote_authentication.txt',
    names=['Variance', 'Skewness', 'Curtosis', 'Entropy', 'Class'],
    header=None,
)

# Report how many rows were loaded, then preview the first few of them.
print(len(dataset), 'records')
dataset.head()

1372 records


Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


## Data Preprocessing

### Prepare features and labels

In [2]:
# Labels are the 'Class' column; features are every remaining column.
y = dataset['Class']
X = dataset.drop(columns='Class')

### Split data into Training and Test sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for testing; the fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Report the number of rows in each partition.
print('Training data:', X_train.shape[0])
print('Test data:', X_test.shape[0])

Training data: 1097
Test data: 275


### Feature scaling

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training partition only, so that no statistics from
# the test partition leak into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the training-set scaling to both partitions.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Create and Train model

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to combine (4 x 2 x 2 = 16 candidates).
grid_params = dict(
    n_estimators=[5, 10, 20, 40],
    criterion=['gini', 'entropy'],
    bootstrap=[True, False],
)

# Base estimator; the fixed random_state makes the forests reproducible.
classifier = RandomForestClassifier(random_state=0)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation; n_jobs=-1 runs the fits on all available processors.
grid_search = GridSearchCV(classifier, grid_params, scoring='accuracy', cv=5, n_jobs=-1)

# Fit every candidate; the fitted search object is the value the cell displays.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [6]:
# Hyper-parameter combination with the highest mean cross-validated score.
best_parameters = grid_search.best_params_
print('Best parameters:\n', best_parameters)

# best_score_ is the mean accuracy over the held-out cross-validation folds
# for the best candidate. It is a validation estimate, not an accuracy
# measured on the training data, so it is labelled accordingly.
best_result = grid_search.best_score_
print('\nBest cross-validation accuracy:', best_result)

# Estimator refit with the best parameters on all the data passed to fit()
# (GridSearchCV's default refit behaviour).
model = grid_search.best_estimator_
print('\nBest model:\n', model)

Best parameters:
 {'bootstrap': True, 'criterion': 'gini', 'n_estimators': 20}

Best training accuracy: 0.9927023661270237

Best model:
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)


## Evaluate the model

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict the class of every record in the held-out test partition.
y_pred = model.predict(X_test)

# Compare the predictions with the true labels: overall accuracy, the
# confusion matrix, and the per-class precision/recall/F1 summary.
print('Accuracy score:', accuracy_score(y_true=y_test, y_pred=y_pred))
print('\nConfusion matrix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('\nClassification report:\n', classification_report(y_true=y_test, y_pred=y_pred))

Accuracy score: 0.9890909090909091

Confusion matrix:
 [[155   2]
 [  1 117]]

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       157
           1       0.98      0.99      0.99       118

    accuracy                           0.99       275
   macro avg       0.99      0.99      0.99       275
weighted avg       0.99      0.99      0.99       275

