In [170]:
from os import cpu_count

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
set_config(display="diagram")

from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree, export_graphviz

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score, classification_report

In [33]:
df = sns.load_dataset("iris")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [34]:
species = df.species.value_counts().index.to_list()
species

['setosa', 'versicolor', 'virginica']

In [35]:
X = df.drop(columns=["species"], axis=1)
out = pd.DataFrame(OrdinalEncoder(categories=[species], dtype=int).fit_transform(df[['species']]))

In [40]:
xtrain,xtest,ytrain,ytest = train_test_split(X, out, test_size=0.2, random_state=29)

In [151]:
num_pipeline = Pipeline([
    ("num_impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

cat_pipeline = Pipeline([
    ("cat_impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(dtype=int))
])


preprocessing = ColumnTransformer([
    ("numerical", num_pipeline, make_column_selector(dtype_include=np.number)),
    ("category", cat_pipeline, make_column_selector(dtype_include="object"))
])

decision_tree = make_pipeline(
    preprocessing,
    DecisionTreeClassifier(random_state=29)
)

In [152]:
decision_tree.fit(xtrain, ytrain)

In [153]:
ypred = decision_tree.predict(xtest)
ypred

array([2, 2, 2, 2, 0, 1, 2, 1, 2, 1, 2, 0, 1, 2, 1, 2, 2, 2, 1, 0, 2, 2,
       0, 1, 1, 1, 1, 2, 1, 2])

In [154]:
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       0.82      0.90      0.86        10
           2       0.93      0.88      0.90        16

    accuracy                           0.90        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.90      0.90      0.90        30



In [155]:
decision_tree.predict(pd.DataFrame([xtest.iloc[0]]))

array([2])

In [156]:
ytest.iloc[0]

0    2
Name: 147, dtype: int32

In [157]:
species[ytest.iloc[0].values[0]]

'virginica'

In [158]:
random_forest = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestClassifier(random_state=29))
])

In [159]:
random_forest.fit(xtrain, ytrain.values[:,0])
forest_ypred = random_forest.predict(xtest)

In [160]:
print(classification_report(ytest, forest_ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       0.82      0.90      0.86        10
           2       0.93      0.88      0.90        16

    accuracy                           0.90        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.90      0.90      0.90        30



In [161]:
## Uncomment and run
# Define the parameter grid with the correct prefix
# params =  {
#     "random_forest__max_depth": [2,3,4,5]
# }

# grid_search = GridSearchCV(random_forest, param_grid=params, cv=5, scoring="f1")
# grid_search.fit(xtrain, ytrain.values.ravel())

For a multi-class classification problem, the scoring parameter for GridSearchCV needs to be set to a metric that supports multi-class classification, such as "f1_macro", "f1_micro", or "f1_weighted"

In [171]:

# Define the parameter grid with the correct prefix
params =  {
    "preprocessing__numerical__num_impute__strategy": ["mean", "median"],
    "random_forest__max_depth": [2,3,4,5],
    "random_forest__criterion": ["gini", "entropy"],
    "random_forest__n_estimators": [100,200,300]
}

grid_search = GridSearchCV(random_forest, param_grid=params, cv=5, scoring="f1_macro", n_jobs=cpu_count())
grid_search.fit(xtrain, ytrain.values.ravel())

In [172]:
grid_search.best_params_

{'preprocessing__numerical__num_impute__strategy': 'mean',
 'random_forest__criterion': 'entropy',
 'random_forest__max_depth': 4,
 'random_forest__n_estimators': 100}

In [173]:
est = grid_search.best_estimator_
est

In [174]:
rnf = est["random_forest"]
rnf.estimator_

In [178]:
pd.DataFrame({"importance": rnf.feature_importances_, "feature": grid_search.feature_names_in_}).sort_values(by="importance", ascending=False)

Unnamed: 0,importance,feature
2,0.453597,petal_length
3,0.413377,petal_width
0,0.114486,sepal_length
1,0.01854,sepal_width


In [179]:
ypred = grid_search.predict(xtest)

In [180]:
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       0.82      0.90      0.86        10
           2       0.93      0.88      0.90        16

    accuracy                           0.90        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.90      0.90      0.90        30

