# Logistic Regression

This notebook uses:
0) A Pipeline that handles null values in both categorical and numeric columns, plus encoding and analysis. So, if you want to deploy it on Streamlit, you will not need to call the encoder separately; just calling the model is enough.

1) `class_weight="balanced"` and

2) `stratify=y` to handle imbalance in the target variable. Thus, it can be used on both balanced and imbalanced data.

3) The target variable is "Default", meaning that the customer will not repay the credit (the code below expects this column to be named "Target" in the Excel file). 0---> No Default Risk, 1---> Default

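4) We don't need to apply scaling in tree-based models. (Note: this notebook fits a Logistic Regression, which is not a tree-based model and generally benefits from feature scaling; the pipeline below does not apply any scaling.)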

5) We don't need to apply encoding to the target variable in tree-based models.

6) We should use ordinal encoding on the categorical variables among the features. However, this data set does not include any categorical features.

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import joblib

import warnings  # used to silence warning output for the rest of the notebook
# Install a global "ignore" filter with no category/module restriction, so
# every warning raised after this point is suppressed (including any emitted
# by the libraries used below).
warnings.filterwarnings("ignore")
# Sanity check: with the filter above in place this warning is not displayed.
warnings.warn("this will not show")

# ------------------------ #

# 1) Read the raw data from the Excel workbook.
#    The sheet must contain a column named "Target" that holds the label.
df = pd.read_excel('sample_data.xlsx')

# 2) Split the frame into the feature matrix (X) and the label vector (y)
X = df.drop("Target", axis=1)
y = df.loc[:, "Target"]

# Partition the feature columns by dtype so that each group can receive its
# own preprocessing: object columns are treated as categorical, 64-bit
# int/float columns as numeric.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# 3) Preprocessing: one small sub-pipeline per column type

# Numeric columns: replace missing values with the column mean
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: replace missing values with the most frequent value,
# then map each category to an integer code; categories that were not seen
# during fit are encoded as -1 instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each group of columns to its matching sub-pipeline
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 4) Full model: preprocessing followed by a class-weighted Logistic Regression
log_reg = LogisticRegression(
    class_weight="balanced",
    random_state=42,
    max_iter=1000,
)
base_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', log_reg),
])

# 5) Hold out 20% of the rows for testing; stratifying on y keeps the class
#    proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 6) Fit the base pipeline (preprocessing + classifier) on the training split
base_pipeline.fit(X_train, y_train)

# 7) Predict on both splits so train and test scores can be compared
print("EVAL METRICS FOR BASE MODEL")
y_train_pred, y_test_pred = (
    base_pipeline.predict(features) for features in (X_train, X_test)
)

def evaluate_model(y_true, y_pred, data_type="Train"):
    """Print a set of classification metrics for one data split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model for the same rows.
        data_type: Name of the split, used only in the printed header.

    Prints accuracy, F1 score, the classification report and the confusion
    matrix, followed by blank lines. Returns nothing.
    """
    print(f"Evaluation metrics for {data_type} data:")

    # Single-number scores are printed on one line each.
    scalar_metrics = (("Accuracy", accuracy_score), ("F1 Score", f1_score))
    for label, metric in scalar_metrics:
        print(f"{label}: {metric(y_true, y_pred)}")

    # Multi-line summaries get a heading followed by the rendered result.
    tabular_metrics = (
        ("Classification Report:", classification_report),
        ("Confusion Matrix:", confusion_matrix),
    )
    for heading, metric in tabular_metrics:
        print(heading)
        print(metric(y_true, y_pred))

    print("\n")

# Report the base model's scores on the train and test splits
for split_name, truth, predicted in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    evaluate_model(truth, predicted, split_name)

# 8) Hyperparameter tuning (for Logistic Regression)
# The "classifier__" prefix addresses the pipeline step named 'classifier'.
param_grid = dict(
    classifier__C=[0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization parameter
    classifier__solver=['liblinear', 'lbfgs'],    # Solvers for logistic regression
)

# 5-fold cross-validated search over the grid, using all available cores.
# NOTE(review): candidates are ranked by plain accuracy; since the notebook
# targets imbalanced labels, a metric such as 'f1' may be a better fit -
# confirm the intended selection criterion.
grid_search = GridSearchCV(
    estimator=base_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")

# 9) Build and evaluate the Best Model
print("")
print("----EVAL METRICS FOR BEST MODEL----")
print("")

# Pipeline refitted by GridSearchCV on the training split with the best
# hyperparameter combination found during the search.
best_pipeline = grid_search.best_estimator_

y_train_best_pred = best_pipeline.predict(X_train)
y_test_best_pred = best_pipeline.predict(X_test)

# evaluate_model takes (y_true, y_pred). Passing the predictions first would
# transpose the confusion matrix and swap precision/recall (and the support
# counts) in the classification report, so keep the ground truth first, as in
# the base-model evaluation.
evaluate_model(y_train, y_train_best_pred, "Train (Best Model)")
evaluate_model(y_test, y_test_best_pred, "Test (Best Model)")

# Compare metrics: base vs. tuned model, on the train and test splits
metrics_comparison = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score'],
    'BaseModelTrain': [accuracy_score(y_train, y_train_pred), f1_score(y_train, y_train_pred)],
    'BaseModelTest': [accuracy_score(y_test, y_test_pred), f1_score(y_test, y_test_pred)],
    'BestModelTrain': [accuracy_score(y_train, y_train_best_pred), f1_score(y_train, y_train_best_pred)],
    'BestModelTest': [accuracy_score(y_test, y_test_best_pred), f1_score(y_test, y_test_best_pred)]
})

print(metrics_comparison)

# 10) Refit the tuned pipeline on every available row (train + test combined)
best_pipeline.fit(X, y)
final_model = best_pipeline  # Pipeline.fit returns the pipeline itself

# 11) Persist the fitted pipeline (preprocessing + classifier) to disk
joblib.dump(final_model, 'final_logistic_regression_model.pkl')

print(" ")
print("Final model saved as 'final_logistic_regression_model.pkl'")

EVAL METRICS FOR BASE MODEL
Evaluation metrics for Train data:
Accuracy: 0.605
F1 Score: 0.24761904761904763
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.60      0.73       359
           1       0.15      0.63      0.25        41

    accuracy                           0.60       400
   macro avg       0.54      0.62      0.49       400
weighted avg       0.85      0.60      0.68       400

Confusion Matrix:
[[216 143]
 [ 15  26]]


Evaluation metrics for Test data:
Accuracy: 0.57
F1 Score: 0.21818181818181817
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.57      0.70        90
           1       0.13      0.60      0.22        10

    accuracy                           0.57       100
   macro avg       0.53      0.58      0.46       100
weighted avg       0.85      0.57      0.65       100

Confusion Matrix:
[[51 39]
 [ 4  6]]


Best Hyperparameters: {'classi