In [3]:
import polars as pl

from sklearn.model_selection import train_test_split, GridSearchCV
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import (
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from metric_functions import calculate_basic_metrics

from setup.constants import PROJECT_DATA

In [4]:
simplefilter("ignore", category=ConvergenceWarning)

In [5]:
iris_df = pl.read_csv(PROJECT_DATA)

In [6]:
iris_df_combined = iris_df.with_columns(
    (pl.col("sepal_length") + pl.col("sepal_width")).alias("sepal_sum"),
    (pl.col("petal_length") + pl.col("petal_width")).alias("petal_sum"),
).with_columns((pl.col("sepal_sum") + pl.col("petal_sum")).alias("total_sum"))

In [7]:
X = iris_df_combined.drop("class")
print(X)

y = iris_df_combined.get_column("class").to_frame()

unique_y = y["class"].unique().to_list()
y_mapping = {species: i for i, species in enumerate(unique_y)}

y = (
    y.with_columns(pl.col("class").replace_strict(y_mapping).alias("encoded_class"))
    .drop("class")
    .to_series()
)
print(y)

shape: (150, 7)
┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┬───────────┬───────────┐
│ sepal_length ┆ sepal_width ┆ petal_length ┆ petal_width ┆ sepal_sum ┆ petal_sum ┆ total_sum │
│ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       ┆ ---       ┆ ---       │
│ f64          ┆ f64         ┆ f64          ┆ f64         ┆ f64       ┆ f64       ┆ f64       │
╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╪═══════════╪═══════════╡
│ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ 8.6       ┆ 1.6       ┆ 10.2      │
│ 4.9          ┆ 3.0         ┆ 1.4          ┆ 0.2         ┆ 7.9       ┆ 1.6       ┆ 9.5       │
│ 4.7          ┆ 3.2         ┆ 1.3          ┆ 0.2         ┆ 7.9       ┆ 1.5       ┆ 9.4       │
│ 4.6          ┆ 3.1         ┆ 1.5          ┆ 0.2         ┆ 7.7       ┆ 1.7       ┆ 9.4       │
│ 5.0          ┆ 3.6         ┆ 1.4          ┆ 0.2         ┆ 8.6       ┆ 1.6       ┆ 10.2      │
│ …            ┆ …      

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, stratify=y_test
)
print(len(X_train), len(y_train), len(X_val), len(y_val), len(X_test), len(y_test))
print(y_train.unique_counts(), y_val.unique_counts(), y_test.unique_counts())
results = {}

120 120 15 15 15 15
shape: (3,)
Series: 'encoded_class' [u32]
[
	40
	40
	40
] shape: (3,)
Series: 'encoded_class' [u32]
[
	5
	5
	5
] shape: (3,)
Series: 'encoded_class' [u32]
[
	5
	5
	5
]


In [9]:
lr = LogisticRegression()

lr_param_grid = {
    "penalty": ["l2"],
    "solver": ["lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"],
    "C": [0.0001, 0.001, 0.01, 0.1, 1],
    "tol": [0.00001, 0.0001, 0.001, 0.01],
    "max_iter": [100, 300, 500],
}

lr_grid_search = GridSearchCV(
    estimator=lr,
    param_grid=lr_param_grid,
    scoring="accuracy",
    refit=True,
)
lr_grid_search.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'C': [0.0001, 0.001, ...], 'max_iter': [100, 300, ...], 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg', ...], ...}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,1e-05
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'sag'
,max_iter,300


In [10]:
lr_best_params = lr_grid_search.best_params_
lr_best = LogisticRegression(
    C=lr_best_params["C"],
    max_iter=lr_best_params["max_iter"],
    penalty=lr_best_params["penalty"],
    solver=lr_best_params["solver"],
    tol=lr_best_params["tol"],
)
lr_best.fit(X_train, y_train)
results["LinearRegression"] = pl.from_dict(
    calculate_basic_metrics(y_val, lr_best.predict(X_val), "LinearRegression")
)
print(results["LinearRegression"])

shape: (1, 5)
┌──────────────────┬──────────┬────────────┬────────┬─────┐
│ model_name       ┆ accuracy ┆ precission ┆ recall ┆ f1  │
│ ---              ┆ ---      ┆ ---        ┆ ---    ┆ --- │
│ str              ┆ f64      ┆ f64        ┆ f64    ┆ f64 │
╞══════════════════╪══════════╪════════════╪════════╪═════╡
│ LinearRegression ┆ 1.0      ┆ 1.0        ┆ 1.0    ┆ 1.0 │
└──────────────────┴──────────┴────────────┴────────┴─────┘


In [11]:
svc = SVC()

svc_param_grid = {
    "C": [0.0001, 0.001, 0.01, 0.1, 1],
    "kernel": ["poly", "rbf", "sigmoid"],
    "degree": [1, 2, 3, 4],
    "gamma": ["scale", "auto"],
    "max_iter": [100, 300, 500],
}

svc_grid_search = GridSearchCV(
    estimator=svc, param_grid=svc_param_grid, scoring="accuracy", refit=True
)
svc_grid_search.fit(X_train, y_train)

0,1,2
,estimator,SVC()
,param_grid,"{'C': [0.0001, 0.001, ...], 'degree': [1, 2, ...], 'gamma': ['scale', 'auto'], 'kernel': ['poly', 'rbf', ...], ...}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,0.0001
,kernel,'poly'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [12]:
svc_best_params = svc_grid_search.best_params_
svc_best = SVC(
    C=svc_best_params["C"],
    degree=svc_best_params["degree"],
    gamma=svc_best_params["gamma"],
    kernel=svc_best_params["kernel"],
    max_iter=svc_best_params["max_iter"],
)
svc_best.fit(X_train, y_train)
results["SVC"] = pl.from_dict(
    calculate_basic_metrics(y_val, svc_best.predict(X_val), "SVC")
)
print(results["SVC"])

shape: (1, 5)
┌────────────┬──────────┬────────────┬────────┬─────┐
│ model_name ┆ accuracy ┆ precission ┆ recall ┆ f1  │
│ ---        ┆ ---      ┆ ---        ┆ ---    ┆ --- │
│ str        ┆ f64      ┆ f64        ┆ f64    ┆ f64 │
╞════════════╪══════════╪════════════╪════════╪═════╡
│ SVC        ┆ 1.0      ┆ 1.0        ┆ 1.0    ┆ 1.0 │
└────────────┴──────────┴────────────┴────────┴─────┘


In [13]:
lsvc = LinearSVC()
lsvc_param_grid = {
    "loss": ["hinge", "squared_hinge"],
    "C": [0.0001, 0.001, 0.01, 0.1, 1],
    "multi_class": ["ovr", "crammer_singer"],
    "max_iter": [100, 300, 500],
}

lsvc_grid_search = GridSearchCV(
    estimator=lsvc, param_grid=lsvc_param_grid, scoring="accuracy", refit=True
)
lsvc_grid_search.fit(X_train, y_train)

0,1,2
,estimator,LinearSVC()
,param_grid,"{'C': [0.0001, 0.001, ...], 'loss': ['hinge', 'squared_hinge'], 'max_iter': [100, 300, ...], 'multi_class': ['ovr', 'crammer_singer']}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,loss,'hinge'
,dual,'auto'
,tol,0.0001
,C,1
,multi_class,'crammer_singer'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [15]:
lsvc_best_params = lsvc_grid_search.best_params_
lsvc_best = LinearSVC(
    C=lsvc_best_params["C"],
    loss=lsvc_best_params["loss"],
    max_iter=lsvc_best_params["max_iter"],
    multi_class=lsvc_best_params["multi_class"],
)
lsvc_best.fit(X_train, y_train)
results["LinearSVC"] = pl.from_dict(
    calculate_basic_metrics(y_val, lsvc_best.predict(X_val), "LinearSVC")
)
print(results["LinearSVC"])

shape: (1, 5)
┌────────────┬──────────┬────────────┬──────────┬─────────┐
│ model_name ┆ accuracy ┆ precission ┆ recall   ┆ f1      │
│ ---        ┆ ---      ┆ ---        ┆ ---      ┆ ---     │
│ str        ┆ f64      ┆ f64        ┆ f64      ┆ f64     │
╞════════════╪══════════╪════════════╪══════════╪═════════╡
│ LinearSVC  ┆ 0.933333 ┆ 0.944444   ┆ 0.933333 ┆ 0.93266 │
└────────────┴──────────┴────────────┴──────────┴─────────┘


In [24]:
gbc = GradientBoostingClassifier()

gbc_param_grid = {
    "loss": ["log_loss"],
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1],
    "n_estimators": [1, 10, 100, 1000],
    "max_depth": [1, 3, 5],
}

gbc_grid_search = GridSearchCV(
    estimator=gbc, param_grid=gbc_param_grid, scoring="accuracy", refit=True
)
gbc_grid_search.fit(X_train, y_train)

0,1,2
,estimator,GradientBoostingClassifier()
,param_grid,"{'learning_rate': [0.0001, 0.001, ...], 'loss': ['log_loss'], 'max_depth': [1, 3, ...], 'n_estimators': [1, 10, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'log_loss'
,learning_rate,0.0001
,n_estimators,10
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [25]:
gbc_best_params = gbc_grid_search.best_params_
gbc_best = GradientBoostingClassifier(
    learning_rate=gbc_best_params["learning_rate"],
    loss=gbc_best_params["loss"],
    max_depth=gbc_best_params["max_depth"],
    n_estimators=gbc_best_params["n_estimators"],
)
gbc_best.fit(X_train, y_train)
results["GradientBoostingClassifier"] = pl.from_dict(
    calculate_basic_metrics(
        y_val, gbc_best.predict(X_val), "GradientBoostingClassifier"
    )
)
print(results["GradientBoostingClassifier"])

shape: (1, 5)
┌────────────────────────────┬──────────┬────────────┬──────────┬─────────┐
│ model_name                 ┆ accuracy ┆ precission ┆ recall   ┆ f1      │
│ ---                        ┆ ---      ┆ ---        ┆ ---      ┆ ---     │
│ str                        ┆ f64      ┆ f64        ┆ f64      ┆ f64     │
╞════════════════════════════╪══════════╪════════════╪══════════╪═════════╡
│ GradientBoostingClassifier ┆ 0.933333 ┆ 0.944444   ┆ 0.933333 ┆ 0.93266 │
└────────────────────────────┴──────────┴────────────┴──────────┴─────────┘


In [26]:
hgbc = HistGradientBoostingClassifier()
hgbc_param_grid = {
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1],
    "max_iter": [100, 300, 500],
    "max_depth": [1, 3, 5, 7],
}

hgbc_grid_search = GridSearchCV(
    estimator=hgbc, param_grid=hgbc_param_grid, scoring="accuracy", refit=True
)
hgbc_grid_search.fit(X_train, y_train)

0,1,2
,estimator,HistGradientB...ngClassifier()
,param_grid,"{'learning_rate': [0.0001, 0.001, ...], 'max_depth': [1, 3, ...], 'max_iter': [100, 300, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'log_loss'
,learning_rate,0.0001
,max_iter,100
,max_leaf_nodes,31
,max_depth,3
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [28]:
hgbc_best_params = hgbc_grid_search.best_params_
hgbc_best = HistGradientBoostingClassifier(
    learning_rate=hgbc_best_params["learning_rate"],
    max_iter=hgbc_best_params["max_iter"],
    max_depth=hgbc_best_params["max_depth"],
)

hgbc_best.fit(X_train, y_train)
results["HistGradientBoostingClassifier"] = pl.from_dict(
    calculate_basic_metrics(
        y_val, hgbc_best.predict(X_val), "HistGradientBoostingClassifier"
    )
)
print(results["HistGradientBoostingClassifier"])

shape: (1, 5)
┌────────────────────────────────┬──────────┬────────────┬──────────┬─────────┐
│ model_name                     ┆ accuracy ┆ precission ┆ recall   ┆ f1      │
│ ---                            ┆ ---      ┆ ---        ┆ ---      ┆ ---     │
│ str                            ┆ f64      ┆ f64        ┆ f64      ┆ f64     │
╞════════════════════════════════╪══════════╪════════════╪══════════╪═════════╡
│ HistGradientBoostingClassifier ┆ 0.933333 ┆ 0.944444   ┆ 0.933333 ┆ 0.93266 │
└────────────────────────────────┴──────────┴────────────┴──────────┴─────────┘


In [29]:
rf = RandomForestClassifier()

rf_param_grid = {
    "n_estimators": [1, 3, 5, 10, 100],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [3, 5, 7, 10, None],
    "max_features": ["sqrt", "log2", None],
}

rf_grid_search = GridSearchCV(
    estimator=rf, param_grid=rf_param_grid, scoring="accuracy", refit=True
)
rf_grid_search.fit(X_train, y_train)

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'criterion': ['gini', 'entropy', ...], 'max_depth': [3, 5, ...], 'max_features': ['sqrt', 'log2', ...], 'n_estimators': [1, 3, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,10
,criterion,'gini'
,max_depth,7
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
rf_best_params = rf_grid_search.best_params_
rf_best = RandomForestClassifier(
    criterion=rf_best_params["criterion"],
    max_depth=rf_best_params["max_depth"],
    max_features=rf_best_params["max_features"],
    n_estimators=rf_best_params["n_estimators"],
)
rf_best.fit(X_train, y_train)
results["RandomForest"] = pl.from_dict(
    calculate_basic_metrics(y_val, rf_best.predict(X_val), "RandomForest")
)
print(results["RandomForest"])

shape: (1, 5)
┌──────────────┬──────────┬────────────┬──────────┬─────────┐
│ model_name   ┆ accuracy ┆ precission ┆ recall   ┆ f1      │
│ ---          ┆ ---      ┆ ---        ┆ ---      ┆ ---     │
│ str          ┆ f64      ┆ f64        ┆ f64      ┆ f64     │
╞══════════════╪══════════╪════════════╪══════════╪═════════╡
│ RandomForest ┆ 0.933333 ┆ 0.944444   ┆ 0.933333 ┆ 0.93266 │
└──────────────┴──────────┴────────────┴──────────┴─────────┘


In [33]:
final_performance = pl.concat(
    [results[model] for model in results.keys()], how="vertical"
)
print(final_performance)

shape: (6, 5)
┌────────────────────────────────┬──────────┬────────────┬──────────┬─────────┐
│ model_name                     ┆ accuracy ┆ precission ┆ recall   ┆ f1      │
│ ---                            ┆ ---      ┆ ---        ┆ ---      ┆ ---     │
│ str                            ┆ f64      ┆ f64        ┆ f64      ┆ f64     │
╞════════════════════════════════╪══════════╪════════════╪══════════╪═════════╡
│ LinearRegression               ┆ 1.0      ┆ 1.0        ┆ 1.0      ┆ 1.0     │
│ SVC                            ┆ 1.0      ┆ 1.0        ┆ 1.0      ┆ 1.0     │
│ LinearSVC                      ┆ 0.933333 ┆ 0.944444   ┆ 0.933333 ┆ 0.93266 │
│ GradientBoostingClassifier     ┆ 0.933333 ┆ 0.944444   ┆ 0.933333 ┆ 0.93266 │
│ HistGradientBoostingClassifier ┆ 0.933333 ┆ 0.944444   ┆ 0.933333 ┆ 0.93266 │
│ RandomForest                   ┆ 0.933333 ┆ 0.944444   ┆ 0.933333 ┆ 0.93266 │
└────────────────────────────────┴──────────┴────────────┴──────────┴─────────┘
