In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
%matplotlib inline

#### Importing the data set

In [2]:
df = pd.read_csv("Iris.csv")
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
y = df.Species
X = df.iloc[:,0:4].copy()

### Using RandomForest classifier

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [11]:
?RandomForestClassifier

### Using Grid Search and cross validation for finding the best parameters

In [12]:
from sklearn.model_selection import GridSearchCV

In [14]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [1, 2, 5, 10, 20],
    'max_features': [1, 2, 3],
    'min_samples_leaf': [1, 3, 4, 5],
#    'min_samples_split': [8, 10, 12],
    'n_estimators': [10, 30, 50, 100, 200]
}

In [16]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=classifier_rf, param_grid=param_grid, 
                          cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [17]:
grid_search.fit(X,y)

Fitting 4 folds for each of 300 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   37.6s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   53.0s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_

#### Extracting the best model

In [18]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=1, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1,
                       oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [19]:
best_model = grid_search.best_estimator_

#### Evaluating performance

In [20]:
y_pred = best_model.predict(X)

In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
confusion_matrix(y, y_pred)

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0,  2, 48]], dtype=int64)