In [86]:
import polars as pl
import sklearn as sk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from setup.constants import PROJECT_DATA
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [87]:
iris_df = pl.read_csv(PROJECT_DATA)

In [88]:
iris_df_combined = (
    iris_df.with_columns(
        (pl.col('sepal_length') + pl.col('sepal_width')).alias('sepal_sum'),
        (pl.col('petal_length') + pl.col('petal_width')).alias('petal_sum'),
    )
    .with_columns(
        (pl.col('sepal_sum') + pl.col('petal_sum')).alias('total_sum')
    )
)

In [89]:
X = iris_df_combined.drop('class')
print(X)

y = iris_df_combined.get_column('class').to_frame()

unique_y = y['class'].unique().to_list()
y_mapping = {species: i for i, species in enumerate(unique_y)}

y = y.with_columns(
    pl.col('class').replace_strict(y_mapping).alias('encoded_class')
).drop('class').to_series()
print(y)


shape: (150, 7)
┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┬───────────┬───────────┐
│ sepal_length ┆ sepal_width ┆ petal_length ┆ petal_width ┆ sepal_sum ┆ petal_sum ┆ total_sum │
│ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       ┆ ---       ┆ ---       │
│ f64          ┆ f64         ┆ f64          ┆ f64         ┆ f64       ┆ f64       ┆ f64       │
╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╪═══════════╪═══════════╡
│ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ 8.6       ┆ 1.6       ┆ 10.2      │
│ 4.9          ┆ 3.0         ┆ 1.4          ┆ 0.2         ┆ 7.9       ┆ 1.6       ┆ 9.5       │
│ 4.7          ┆ 3.2         ┆ 1.3          ┆ 0.2         ┆ 7.9       ┆ 1.5       ┆ 9.4       │
│ 4.6          ┆ 3.1         ┆ 1.5          ┆ 0.2         ┆ 7.7       ┆ 1.7       ┆ 9.4       │
│ 5.0          ┆ 3.6         ┆ 1.4          ┆ 0.2         ┆ 8.6       ┆ 1.6       ┆ 10.2      │
│ …            ┆ …      

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test)
print(len(X_train), len(y_train), len(X_val), len(y_val), len(X_test), len(y_test))
print(y_train.unique_counts(), y_val.unique_counts(), y_test.unique_counts())
results = {}

120 120 15 15 15 15
shape: (3,)
Series: 'encoded_class' [u32]
[
	40
	40
	40
] shape: (3,)
Series: 'encoded_class' [u32]
[
	5
	5
	5
] shape: (3,)
Series: 'encoded_class' [u32]
[
	5
	5
	5
]


In [91]:
lr = LogisticRegression()

lr_param_grid = {
    'penalty': ['l2'],
    'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'C': [0.001, 0.01, 0.1, 1],
    'tol': [0.00001, 0.0001, 0.001, 0.01],
    'max_iter': [100, 300, 500]
}

lr_grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=lr_param_grid,
    scoring='accuracy',
    refit=True
)
lr_grid_search.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'C': [0.001, 0.01, ...], 'max_iter': [100, 300, ...], 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg', ...], ...}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,1e-05
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'sag'
,max_iter,500


In [93]:
def calculate_basic_metrics(y_true, y_pred) -> dict:
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precission': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted')
    }

In [92]:
lr_best_params = lr_grid_search.best_params_
lr_best = LogisticRegression(
    C=lr_best_params['C'],
    max_iter=lr_best_params['max_iter'],
    penalty=lr_best_params['penalty'],
    solver=lr_best_params['solver'],
    tol=lr_best_params['tol']
)
lr_best.fit(X_train, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,1e-05
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'sag'
,max_iter,500


In [97]:
lr_y_pred = lr_best.predict(X_val)
results['LinearRegression'] = calculate_basic_metrics(y_test, lr_best.predict(X_test))

{'LinearRegression': {'accuracy': 1.0, 'precission': 1.0, 'recall': 1.0, 'f1': 1.0}}
