# Parameter tuning for classification models
## Manual

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.datasets import load_breast_cancer

### Setting up

In [None]:
# Fetch the breast cancer dataset and print its bundled description text
dataObj = load_breast_cancer()
print(dataObj["DESCR"])

In [None]:
# Put the feature matrix into a dataframe whose columns carry the feature
# names, then print a summary of it
df = pd.DataFrame(data=dataObj.data, columns=dataObj.feature_names)
df.info()

In [None]:
# Feature matrix and class labels
X, y = dataObj.data, dataObj.target

# Hold out 30% of the samples as the test split; stratify on y and fix the
# seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=1
)

# Standardization: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to the test split
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Classifier (only the random seed is set here)
lr = LogisticRegression(random_state=1)

### Cross-validation (Stratified K-fold)
- Note that it is possible to get test accuracy higher than validation accuracy.
- This might ring an alarm (i.e. too few test data as pointed out in https://stats.stackexchange.com/a/59632).
- However, keep in mind that when evaluating on the test data, the final model has been refit on the entire training set (as `gs.fit()` does automatically in a grid search), so the amount of data used to train the final model is larger than the amount used to train each model during cross-validation.


In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the classifier on the standardized training
# data, scoring every fold on both accuracy and F1 (all cores via n_jobs=-1)
scores = cross_validate(
    lr,
    X_train_std,
    y_train,
    scoring=["accuracy", "f1"],
    cv=5,
    n_jobs=-1,
)

In [None]:
# Tabulate the cross-validation results (this rebinds `df`)
df = pd.DataFrame(data=scores)
display(df)

In [None]:
# Mean and standard deviation of each test metric across the folds
df[["test_accuracy", "test_f1"]].agg(["mean", "std"]).T

### Parameter tuning (manual)


In [None]:
# Candidate values for C (inverse of the regularization strength)
Cs = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# NOTE: X_train_std was scaled with statistics from the whole training split,
# so each validation fold has already influenced the scaling. For a
# leakage-free estimate, put the scaler and classifier in a Pipeline and
# cross-validate that on the unscaled training data.

datas = []
for C in Cs:
    # Use the same max_iter as the refit step further down, so every candidate
    # is compared under the solver budget the final model will actually get
    # (the default budget may stop before convergence for weakly regularized
    # models).
    lr = LogisticRegression(random_state=1, C=C, max_iter=5000)
    scores = cross_validate(
        estimator=lr,
        X=X_train_std,
        y=y_train,
        cv=5,
        scoring=["accuracy", "f1"],
        n_jobs=-1,
    )

    # Summarize the five per-fold scores for this candidate
    fold_accuracy = scores["test_accuracy"]
    fold_f1 = scores["test_f1"]
    datas.append(
        {
            "C": C,
            "accuracy_mean": fold_accuracy.mean(),
            "accuracy_std": fold_accuracy.std(),
            "f1_mean": fold_f1.mean(),
            "f1_std": fold_f1.std(),
        }
    )

# One row per candidate C, best mean accuracy first
df = pd.DataFrame(datas).sort_values(by="accuracy_mean", ascending=False)
display(df)

### Refit

In [None]:
# The results table is sorted by mean accuracy (best first), so the C in its
# first row is the one to use
C_best = df["C"].iloc[0]
print(C_best)

In [None]:
# Refit on the whole standardized training split with the selected C.
# max_iter=5000 to avoid convergence warning
lr = LogisticRegression(C=C_best, max_iter=5000, random_state=1)
lr.fit(X_train_std, y_train)

### Actual test result

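Note that the test accuracy can be higher than the CV accuracy: the refit model is trained on the whole training set, whereas each CV model is trained on only four of the five folds, and the test set is a different, fairly small sample.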

In [None]:
# Score the refit model on the held-out (standardized) test split
y_pred = lr.predict(X_test_std)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {test_accuracy:.2f}")
print(f"F1: {test_f1:.2f}")