# Unit2-Example6 Use of Random Forest Classifier
## Subject: Machine Learning - MSc. Computational Biology 
Author: Esteban García-Cuesta, Departamento de Inteligencia Artificial, UPM (License CC-BY-NC)

This code has been developed to be used exclusively for educational purposes.

## Objectives: 
  - Learn how the Random forest Classifier model works (Part I)
  - Learn how to interpret confusion matrix results (Part II)
  - Learn how to apply Random Forest Classifiers (Parts I, II)

## TO-DO as homework
  - Try different parameters of the model.

In [3]:
#Part I Learn how the Random Forest Classifier model works

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score


#Load the data from the zoo.csv file
path ='zoo.csv'
data = pd.read_csv(path)

#Extract the X input data and the labels y
X = data.iloc[:,1:17]
y = data.iloc[:,17]

#This counters are used to obtain the accuracy "by hand"
contTrain = 0
contTest = 0

# Split train and test using sklearn.model_selection.train_test_split function
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.10, random_state=np.random.randint(100,size=1)[0])

#Convert pandas df to numerical arrays
XTrain = XTrain.values
XTest = XTest.values
yTrain = yTrain.values
yTest = yTest.values

#Define parameters of the random forest to be checked with cross-validation
param_grid = {'min_samples_leaf': [1],
              'min_samples_split': [2],}

# Create a gridsearch using the Random Forest Classifier
clf = GridSearchCV(RandomForestClassifier(bootstrap='false'), param_grid, cv=3)

# Train the classifier using training dataset
clf.fit(XTrain , yTrain)

#Obtain in the variable model the best random forest classifier
print("Best estimator found by grid search:")
print(clf.best_estimator_)
model = clf.best_estimator_

#Calculate the Cross-validation error during the training step
scores = cross_val_score(model, XTrain, yTrain, cv=10, scoring='accuracy')
print('Validation classification Accuracies: ', + scores)
print('Mean Validation Classification Accuracy: ', + np.mean(scores))


# Compute the train prediction according to the model
yhatTrain = clf.predict(XTrain)

# Check the result on the train examples
#print('Predicted value : ' + str(yhat) , ', real target : ' + str(yTrain))
for i in range(0,len(yTrain),1) :
    if (yhatTrain[i] == yTrain[i]):
        contTrain = contTrain + 1

# Compute the test prediction according to the model
yhatTest = clf.predict(XTest)

# Check the result on the test examples
#print('Predicted value : ' + str(yhat) , ', real target : ' + str(yTest))
for i in range(0,len(yTest),1) :
    if (yhatTest[i] == yTest[i]):
        contTest = contTest + 1


#Return the metric of accuracy
print('The train accuracy is: ' + str(contTrain/len(yTrain)))
print('The test accuracy is: ' + str(contTest/len(yTest)))


Best estimator found by grid search:
RandomForestClassifier(bootstrap='false')




Validation classification Accuracies:  [0.88888889 1.         0.88888889 0.77777778 1.         1.
 0.88888889 1.         0.88888889 0.88888889]
Mean Validation Classification Accuracy:  0.9222222222222223
The train accuracy is: 1.0
The test accuracy is: 1.0


In [4]:
#Part II Learn how to interpret confusion matrix results

from sklearn.metrics import classification_report,confusion_matrix

#Print train confusion matrix
print('----------------Train Confusion Matrix------------------')
print(confusion_matrix(yTrain,yhatTrain))
print(classification_report(yTrain,yhatTrain))


----------------Train Confusion Matrix------------------
[[39  0  0  0  0  0  0]
 [ 0 18  0  0  0  0  0]
 [ 0  0  4  0  0  0  0]
 [ 0  0  0 13  0  0  0]
 [ 0  0  0  0  3  0  0]
 [ 0  0  0  0  0  6  0]
 [ 0  0  0  0  0  0  7]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        39
           2       1.00      1.00      1.00        18
           3       1.00      1.00      1.00         4
           4       1.00      1.00      1.00        13
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         6
           7       1.00      1.00      1.00         7

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90



In [5]:
#Part II Learn how to interpret confusion matrix results

#Print test confusion matrix
print('----------------Test Confusion Matrix------------------')
print(confusion_matrix(yTest,yhatTest))
print(classification_report(yTest,yhatTest))


----------------Test Confusion Matrix------------------
[[2 0 0 0 0 0]
 [0 2 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 2 0]
 [0 0 0 0 0 3]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         2
           7       1.00      1.00      1.00         3

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

