# Using the readings, try and create a RandomForestClassifier for the iris dataset
# Using a 25/75 training/test split, compare the results with the original decision tree model and describe the result to the best of your ability in your PR

In [20]:
import pandas as pd
%matplotlib inline
from sklearn import datasets
from sklearn import tree
from sklearn import metrics
import numpy as np
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
iris

 'data': array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 5. ,  3.4,  1.5,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5.4,  3.7,  1.5,  0.2],
        [ 4.8,  3.4,  1.6,  0.2],
        [ 4.8,  3. ,  1.4,  0.1],
        [ 4.3,  3. ,  1.1,  0.1],
        [ 5.8,  4. ,  1.2,  0.2],
        [ 5.7,  4.4,  1.5,  0.4],
        [ 5.4,  3.9,  1.3,  0.4],
        [ 5.1,  3.5,  1.4,  0.3],
        [ 5.7,  3.8,  1.7,  0.3],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 4.6,  3.6,  1. ,  0.2],
        [ 5.1,  3.3,  1.7,  0.5],
        [ 4.8,  3.4,  1.9,  0.2],
        [ 5. ,  3. ,  1.6,  0.2],
        [ 5. ,  3.4,  1.6,  0.4],
        [ 5.2,  3.5,  1.5,  0.2],
        [ 5.2,  3.4,  1.4,  0.2],
      

In [14]:
x = iris.data[:,2:] 
y = iris.target

# RandomForestClassifier 

In [15]:
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, stratify=y,random_state=42)
forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=2, verbose=0, warm_start=False)

In [16]:
print("accuracy on training set: %f" % forest.score(x_train, y_train))
print("accuracy on test set: %f" % forest.score(x_test, y_test))

accuracy on training set: 0.981982
accuracy on test set: 0.923077


#  Original decision tree model

In [17]:
dt = tree.DecisionTreeClassifier()

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,train_size=0.75)

In [22]:
dt = dt.fit(x_train,y_train) 

In [23]:
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    y_pred=clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred),"\n")
    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred),"\n")

In [26]:
measure_performance(x_train,y_train,dt) 
# I measure the performance of my classifier with train data
#The accuracy is 1, which means is 100% accurate. 
#And my confusion matrix is not showing mistakes in the classification

Accuracy:1.000 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        42
          1       1.00      1.00      1.00        36
          2       1.00      1.00      1.00        34

avg / total       1.00      1.00      1.00       112
 

Confusion matrix
[[42  0  0]
 [ 0 36  0]
 [ 0  0 34]] 



In [27]:
measure_performance(x_test,y_test,dt)
# I measure the performance of my classifier with test data
# Accuracy of 100%

Accuracy:0.974 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         8
          1       1.00      0.93      0.96        14
          2       0.94      1.00      0.97        16

avg / total       0.98      0.97      0.97        38
 

Confusion matrix
[[ 8  0  0]
 [ 0 13  1]
 [ 0  0 16]] 



# For the RandomForestClassifier
accuracy on training set: 0.981982

accuracy on test set: 0.923077

# For the Original decision tree model
accuracy on training set: 1.000 

accuracy on test set: 0.974

My main takeaway is that random forests are a way of addressing the problem of overfitting. Decision trees tend to overfit the training data, and since random forests are made up of a number of these decision trees, they are all going to overfit the data in different ways. So what we do is averaging the results of all of the trees in our random forest to get a more accurate fit.
The accuracy of the training set for the Random Forest Classifier is of 98% (and I am not sure about the following ...) which means that the model is not overfitting. On the contrary, the accuracy of the training set for the desicion tree model is of 100%, which probably means is overfitting. The accuracy test for the decision tree model is better than the one for the random forest classifier, which confused me a little bit since I was expecting the one for the random forest classifier to be better. If the data is not overfitted, the model is more likely to be more accurate right?
