In [1]:
from pathlib import Path

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

#### Set Paths

In [3]:
proj_path = Path().absolute().parents[0]
data_raw_path = Path(proj_path, 'data', 'raw')

#### Read in data

In [4]:
df = pd.read_csv(Path(data_raw_path, 'datasets_19_420_Iris.csv'))

#### Process data

In [5]:
# Split into feature and response
X = df.iloc[:, 1:5]
y = df.iloc[:, 5]

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#### Build model

In [7]:
rf = RandomForestClassifier(oob_score=True)
rf.fit(X_train, y_train)

RandomForestClassifier(oob_score=True)

In [8]:
pred = rf.predict(X_test)

In [9]:
pred_proba = rf.predict_proba(X_test)

#### Model performance (Test)

In [10]:
# Test error
print(
    classification_report(y_true=y_test.values, 
                      y_pred=pred)
)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        15
Iris-versicolor       1.00      1.00      1.00        11
 Iris-virginica       1.00      1.00      1.00        12

       accuracy                           1.00        38
      macro avg       1.00      1.00      1.00        38
   weighted avg       1.00      1.00      1.00        38



In [11]:
print(
    confusion_matrix(y_true=y_test, 
                      y_pred=pred)
)

[[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]


#### Model performance (Train)

In [12]:
# Training error
print(
    classification_report(y_true=y_train, 
                      y_pred=rf.predict(X_train))
)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        35
Iris-versicolor       1.00      1.00      1.00        39
 Iris-virginica       1.00      1.00      1.00        38

       accuracy                           1.00       112
      macro avg       1.00      1.00      1.00       112
   weighted avg       1.00      1.00      1.00       112



In [13]:
print(
    confusion_matrix(y_true=y_test, 
                      y_pred=pred)
)

[[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
