In [21]:
# importing libraries
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score

In [22]:
# Read the csv files:
mnist_train = pd.read_csv('mnist_train.csv')
mnist_test = pd.read_csv('mnist_test.csv')

# Print data shapes:
print(mnist_train.shape)
print(mnist_test.shape)

(60000, 785)
(10000, 785)


The training set represents 14.28% of all the pictures in both train and test files.

In [23]:
# Assign variables for train set:
y_train = mnist_train.iloc[:, 0].values
x_train = mnist_train.iloc[:, 1:].values

In [24]:
# Assign variables for test set:
y_test = mnist_test.iloc[:, 0]
x_test = mnist_test.iloc[:, 1:]

In [25]:
# Splitting the train set into train and development sets by using train_test_split funciton into train 80% and development 20% set:
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size = 0.2, random_state = 5)

The aim of the three data sets:
- train data set - will be used to create/train initial Random Forest model. It will also be used later on, once the parameters are tuned for the model.
- development (test) data set - will be used to test the initial training (get the initial accuracy score for the model before parameters are tuned). It will also be used as a portion of data to test various parameters settings for the model
- test (validation) data set - will be used to test the tuned model on the unseen portion of data, and to calculate final precision, recall, f1, accuracy scores and visualise confusion matrix, to see how well the model predicted the values compared with the actual test values.  

In [26]:
# Build Random Forest classification model: 
ensemble_forest = RandomForestClassifier(max_depth = 5, n_estimators=100, random_state=5)
ensemble_forest.fit(x_train, y_train)

# Predict the model on dev set:
y_pred_dev = ensemble_forest.predict(x_dev)

# Get accuracy of the model performance on dev data:
accuracy_dev = ensemble_forest.score(x_dev, y_dev)
print("Accuracy of ensemble forest on dev data:", round(accuracy_dev,3))

Accuracy of ensemble forest on dev data: 0.858


In [27]:
# The parameter I chose to tune is max_depth. This indicates how deep the tree can be (the deeper, the more splits it has and captures more information). 
# I assessed the number of max_depths in Random Forest model looking at the accuracy score for each max_depth (range 2-10) using development data:
for x in range(2,11):
    max_depth = x
    ensemble_forest = RandomForestClassifier(max_depth = max_depth, n_estimators=100, random_state=5).fit(x_dev, y_dev)
    print(f"Forest accuracy score at {max_depth} max_depth:\t", round(ensemble_forest.score(x_dev, y_dev),3))
    

Forest accuracy score at 2 max_depth:	 0.645
Forest accuracy score at 3 max_depth:	 0.752
Forest accuracy score at 4 max_depth:	 0.811
Forest accuracy score at 5 max_depth:	 0.866
Forest accuracy score at 6 max_depth:	 0.9
Forest accuracy score at 7 max_depth:	 0.928
Forest accuracy score at 8 max_depth:	 0.952
Forest accuracy score at 9 max_depth:	 0.97
Forest accuracy score at 10 max_depth:	 0.985


max_depth of 10 produced the highest accuracy, so I will use it in my model on the test data.

In [28]:
# I also looked at the number of n_estimators in Random Forest model on the accuracy score using dev data: 
for x in (1,3,5,10,20,50,100,150,200):
    estimations = x
    ensemble_forest = RandomForestClassifier(max_depth = 10, n_estimators=estimations, random_state=5).fit(x_dev, y_dev)
    print(f"Accuracy Forest at {estimations} estimations:\t", round(ensemble_forest.score(x_dev, y_dev),3))

Accuracy Forest at 1 estimations:	 0.818
Accuracy Forest at 3 estimations:	 0.919
Accuracy Forest at 5 estimations:	 0.949
Accuracy Forest at 10 estimations:	 0.97
Accuracy Forest at 20 estimations:	 0.978
Accuracy Forest at 50 estimations:	 0.984
Accuracy Forest at 100 estimations:	 0.985
Accuracy Forest at 150 estimations:	 0.986
Accuracy Forest at 200 estimations:	 0.986


n_estimators of 100, chosen at the beginning, works ok here, so I will keep using this value in my model.

In [29]:
# Make Random Forest Model predictions on test data:
ensemble_forest = RandomForestClassifier(max_depth = 10, n_estimators=100, random_state=5)
ensemble_forest.fit(x_train, y_train)
y_pred_test = ensemble_forest.predict(x_test)

# Creating and visualizing confusion matrix for my Random Forest model on the test set:
conf_mat = confusion_matrix(y_test,y_pred_test)
conf_mat_df = pd.DataFrame(conf_mat)
conf_mat_df



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,968,1,0,0,0,2,4,1,3,1
1,0,1120,4,3,0,1,3,1,3,0
2,9,1,964,11,13,0,7,15,10,2
3,1,0,15,941,1,19,0,14,15,4
4,1,0,1,0,920,0,8,1,8,43
5,6,6,3,20,6,826,8,3,7,7
6,8,4,0,0,6,9,925,0,5,1
7,2,7,28,3,4,0,0,945,5,34
8,3,1,6,12,7,8,6,4,904,23
9,8,7,0,14,17,4,1,5,12,941


In [30]:
# Which classes the model struggled most?

# Getting the conf_mat data frame labels:
classes = conf_mat_df.columns

# f1 score per feature tells us which were most difficult to identify:
f = f1_score(y_test, y_pred_test, average = None)
print(f)
lowest_score = min(f)
hardest_class = classes[list(f).index(lowest_score)]
print('Hardest class to identify:', hardest_class)

[0.97482377 0.98159509 0.93911349 0.93445879 0.9406953  0.93810335
 0.96354167 0.9370352  0.9290853  0.91138015]
Hardest class to identify: 9


The hardest to identify was number 9. 

In [31]:
# Accuracy, precision, recall and f1 scores for the predicted model: 
accur_score = accuracy_score(y_test, y_pred_test)
print('Accuracy: ',accur_score)

prec_score = precision_score(y_test, y_pred_test, average = "micro")
print('Precision:', prec_score)

rec_score = recall_score(y_test, y_pred_test, average = "micro")
print('Recall:', rec_score)

average_f1 = f1_score(y_test, y_pred_test, average='micro')
print('f1 score:', round(average_f1,3))

Accuracy:  0.9454
Precision: 0.9454
Recall: 0.9454
f1 score: 0.945


The scores are the same becuase are based on the weighted average for each.