In [47]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.utils import shuffle
from tqdm import tqdm

In [50]:
# Fixed seed so the row shuffle is identical after Restart Kernel -> Run All.
SEED = 42
# Species name as stored in the CSV label column -> integer class id.
LABEL_CODES = {'setosa': 0, 'virginica': 1}


def load_plant_split(csv_path, seed=SEED):
    """Read one plant CSV, integer-encode its label column and shuffle the rows.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; the label is taken to be the LAST column.
    seed : int
        ``random_state`` forwarded to ``sklearn.utils.shuffle``.

    Returns
    -------
    pandas.DataFrame
        A new frame (nothing is modified in place) with shuffled rows.
    """
    frame = pd.read_csv(csv_path)
    label_col = frame.columns[-1]
    # Encode only the label column. Values missing from LABEL_CODES are passed
    # through unchanged, matching what the previous frame-wide replace() did.
    encoded = frame[label_col].map(lambda name: LABEL_CODES.get(name, name))
    return shuffle(frame.assign(**{label_col: encoded}), random_state=seed)


df_train = load_plant_split('plant-train.csv')
df_test = load_plant_split('plant-test.csv')
# Class names ordered by their integer code, as classification_report expects.
target_names = sorted(LABEL_CODES, key=LABEL_CODES.get)

print(df_train.head())
print(df_test.head())

# The data matrix X: every column except the trailing label column.
# NOTE: this used to be iloc[:, 1:-1], which also dropped the first column.
# The printed head shows that column is the feature 'sepal length in cm'
# (there is no id column in the frame), so it is now kept.
X_train = df_train.iloc[:, :-1]
# The labels, kept as a one-column DataFrame (later cells call .values on it).
y_train = df_train.iloc[:, -1:]

# Same split of features / labels for the test file.
X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1:]

# Both splits must expose the same number of features to the models.
assert X_train.shape[1] == X_test.shape[1], "train/test feature count mismatch"

    sepal length in cm  sepal width in cm  petal length in cm  \
64                 6.7                3.3                 5.7   
33                 4.4                3.0                 1.3   
65                 7.2                3.2                 6.0   
71                 7.9                3.8                 6.4   
31                 5.5                3.5                 1.3   

    petal width in cm  target/label  
64                2.1             1  
33                0.2             0  
65                1.8             1  
71                2.0             1  
31                0.2             0  
    sepal length in cm  sepal width in cm  petal length in cm  \
3                  4.8                3.0                 1.4   
12                 5.8                2.7                 5.1   
7                  4.6                3.2                 1.4   
2                  4.8                3.4                 1.6   
6                  5.1                3.8               

In [41]:
# Show estimators and pipelines as HTML diagrams when they are displayed in the
# notebook. `set_config` is imported with the other imports in the first cell.
set_config(display="diagram")


## GaussianNB

### Without normalisation (raw features)

In [51]:
# Gaussian Naive Bayes trained directly on the unscaled feature values.
# fit() returns the estimator itself, so construction and training are chained.
GNB = GaussianNB().fit(X_train.values, y_train.values.ravel())
GNB_preds = GNB.predict(X_test.values)

# Per-class precision / recall / F1 for the predictions on the test data.
print(
    classification_report(
        y_test.values,
        GNB_preds,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
   virginica       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



### With normalisation (StandardScaler)

In [52]:
# Gaussian Naive Bayes behind a StandardScaler. Because the scaler is a
# pipeline step, it is fitted by the same fit() call as the classifier.
GNB_pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', GaussianNB()),
    ]
).fit(X_train.values, y_train.values.ravel())

GNB_pipe_preds = GNB_pipe.predict(X_test.values)

# Per-class precision / recall / F1 for the predictions on the test data.
print(
    classification_report(
        y_test.values,
        GNB_pipe_preds,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
   virginica       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



## RandomForestClassifier

### Without normalisation (raw features)

In [53]:
# Shallow random forest (max_depth=2) on the unscaled feature values;
# random_state is pinned so the fitted forest is the same on every run.
RFC = RandomForestClassifier(max_depth=2, random_state=42).fit(
    X_train.values,
    y_train.values.ravel(),
)
RFC_preds = RFC.predict(X_test.values)

# Per-class precision / recall / F1 for the predictions on the test data.
print(
    classification_report(
        y_test.values,
        RFC_preds,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
   virginica       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



### With normalisation (StandardScaler)

In [46]:
# The same shallow random forest, this time fed standardised features.
# The scaler is a pipeline step, so one fit() call trains both stages.
RFC_pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(max_depth=2, random_state=42)),
    ]
).fit(X_train.values, y_train.values.ravel())

RFC_pipe_preds = RFC_pipe.predict(X_test.values)

# Per-class precision / recall / F1 for the predictions on the test data.
print(
    classification_report(
        y_test.values,
        RFC_pipe_preds,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
   virginica       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

