## Improving Efficacy in binary classification

### 1. Load the data

In [2]:
from holisticai.efficacy.metrics import classification_efficacy_metrics
import pandas as pd
import numpy as np

import warnings

# Silence every warning category for the remainder of the session.
# NOTE(review): a blanket "ignore" also hides solver/convergence and deprecation
# warnings raised by the libraries used in later cells.
warnings.simplefilter("ignore")

In [3]:
from holisticai.datasets import load_dataset


def _to_binary_label(income):
    """Return 1 when the raw income label is '>50K', otherwise 0."""
    return 1 if income == '>50K' else 0


# Request the 'adult' dataset without the library's preprocessing step.
loaded = load_dataset(dataset='adult', preprocessed=False, as_array=False)

# Build the feature frame, then append the binary target as the last column
# (later cells rely on 'class' being the final column).
df = pd.DataFrame(data=loaded.data, columns=loaded.feature_names)
df['class'] = loaded.target.apply(_to_binary_label)

df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,0
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,0
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,1
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,1
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,0
48838,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,1
48839,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,0
48840,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States,0


### 2. Train a model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Every column except the last is a feature; the last column ('class') is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Partition the columns by dtype: 'category' columns are one-hot encoded,
# everything else is standardised.
categorical_features = X.select_dtypes(include=['category']).columns
numeric_features = X.select_dtypes(exclude=['category']).columns

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Learn the preprocessing statistics on the training split only, then apply
# the fitted transformer to the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Baseline classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()
model.fit(X_train_transformed, y_train)

# Hard labels and per-class probabilities for the held-out rows.
y_pred = model.predict(X_test_transformed)
y_proba = model.predict_proba(X_test_transformed)

### 3. Measuring Efficacy

In [23]:
from holisticai.efficacy.metrics import classification_efficacy_metrics
# Baseline efficacy report: predictions of the untuned model against the held-out labels.
# NOTE(review): in the recorded output AUC is identical to Balanced Accuracy and
# Log Loss is ~5, which is what hard 0/1 labels (rather than probabilities) would
# produce. Confirm the argument order and the expected shape of y_proba against the
# installed holisticai signature.
classification_efficacy_metrics(y_pred, y_test, y_proba)

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Accuracy,0.857201,1
Balanced Accuracy,0.811777,1
Precision,0.606114,1
Recall,0.737905,1
F1-Score,0.665548,1
AUC,0.811777,1
Log Loss,5.146985,0


### 4. Improving Efficacy metrics

In [24]:
from Hyperparameters_grid import optimize_hyperparameters_grid
from sklearn.metrics import accuracy_score

# Tune the baseline model on the transformed training split, scoring with accuracy.
# NOTE(review): optimize_hyperparameters_grid is a project-local helper; presumably
# param_range_factor controls how far the grid extends around the current values and
# cv is the number of cross-validation folds. Verify against its definition.
new_model = optimize_hyperparameters_grid(
    model,
    X_train_transformed,
    y_train,
    accuracy_score,
    param_range_factor=0.2,
    cv=8,
)

In [29]:
# Re-score the held-out split with the tuned estimator.
# This rebinds y_pred / y_proba, replacing the baseline model's predictions.
y_pred, y_proba = (
    new_model.predict(X_test_transformed),
    new_model.predict_proba(X_test_transformed),
)

In [31]:
# Efficacy report after hyperparameter tuning, for comparison with the baseline table.
# NOTE(review): the recorded values match the baseline run in every metric, so the
# search either returned equivalent hyperparameters or this cell ran against stale
# predictions (execution counts jump 24 -> 29 -> 31). Confirm with Restart & Run All.
classification_efficacy_metrics(y_pred, y_test, y_proba)

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Accuracy,0.857201,1
Balanced Accuracy,0.811777,1
Precision,0.606114,1
Recall,0.737905,1
F1-Score,0.665548,1
AUC,0.811777,1
Log Loss,5.146985,0
