In [1]:
from pathlib import Path

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

#### Set Paths

In [3]:
# Project root is the parent of the directory the notebook is run from.
proj_path = Path().absolute().parents[0]
# Raw input files live under <project root>/data/raw.
data_raw_path = proj_path / 'data' / 'raw'

#### Read in data

In [4]:
# Load the raw Iris CSV from the raw-data directory.
df = pd.read_csv(data_raw_path / 'datasets_19_420_Iris.csv')

#### Process data

In [5]:
# Separate predictors from the response by column position:
# zero-based positions 1 through 4 become the features, position 5 the response.
X, y_ = df.iloc[:, 1:5], df.iloc[:, 5]

In [6]:
# Expand the class labels into one indicator column per class.
y = pd.get_dummies(data=y_)

In [7]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

#### Build model

In [8]:
# Fit a random forest on the training split. random_state is pinned so the
# bootstrap samples and per-split feature draws -- and therefore the fitted
# trees, the OOB score and every metric reported below -- are identical on each
# Restart & Run All, in line with the seeded train/test split.
rf = RandomForestClassifier(oob_score=True, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(oob_score=True)

In [9]:
# Predict class indicators for the held-out test features.
pred = rf.predict(X=X_test)

#### Model performance (Test)

In [10]:
# Class names, in the column order of the one-hot encoded target.
list(y.columns)

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

In [11]:
# Test-set performance: per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test.values, pred, target_names=list(y.columns)))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        15
Iris-versicolor       1.00      1.00      1.00        11
 Iris-virginica       1.00      1.00      1.00        12

      micro avg       1.00      1.00      1.00        38
      macro avg       1.00      1.00      1.00        38
   weighted avg       1.00      1.00      1.00        38
    samples avg       1.00      1.00      1.00        38



In [12]:
# Confusion matrix on the test set (rows = true class, columns = predicted class).
# The targets are one-hot indicator columns, so each sample is collapsed back to
# a single class index. argmax(axis=1) always yields exactly one index per row,
# which keeps y_true and y_pred aligned even if the forest (fitted on indicator
# columns) predicts zero or several classes for a sample; np.argwhere(...).T[1]
# would emit zero or several entries for such a row and misalign the arrays.
# NOTE: a row with no predicted class falls back to class index 0.
print(
    confusion_matrix(
        y_true=y_test.values.argmax(axis=1),
        y_pred=pred.argmax(axis=1),
    )
)

[[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]


#### Model performance (Train)

In [13]:
# Training-set performance: same report, computed on the rows the forest was fitted on.
print(
    classification_report(
        y_train.values,
        rf.predict(X_train),
        target_names=list(y.columns),
    )
)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        35
Iris-versicolor       1.00      1.00      1.00        39
 Iris-virginica       1.00      1.00      1.00        38

      micro avg       1.00      1.00      1.00       112
      macro avg       1.00      1.00      1.00       112
   weighted avg       1.00      1.00      1.00       112
    samples avg       1.00      1.00      1.00       112



In [14]:
# Confusion matrix on the training set (rows = true class, columns = predicted class).
# The one-hot indicator rows are collapsed to one class index per sample with
# argmax(axis=1). Unlike np.argwhere(...).T[1], this always returns exactly one
# index per row, so y_true and y_pred stay aligned even if the forest predicts
# zero or several classes for a sample.
# NOTE: a row with no predicted class falls back to class index 0.
print(
    confusion_matrix(
        y_true=y_train.values.argmax(axis=1),
        y_pred=rf.predict(X_train).argmax(axis=1),
    )
)

[[35  0  0]
 [ 0 39  0]
 [ 0  0 38]]
