# Binary Logistic Regression

In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm

In [59]:
# Load the breast-cancer dataset bundled with scikit-learn.
# as_frame=True asks for pandas objects rather than plain NumPy arrays.
data = load_breast_cancer(as_frame=True)

In [60]:
# Show which fields the loaded dataset object exposes.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [61]:
# Class labels on the first line, feature column names after it.
print(data.target_names, data.feature_names, sep="\n")

['malignant' 'benign']
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [62]:
# Feature table and target column taken directly from the loaded dataset.
X = data.data
y = data.target

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

In [63]:
# Standardize the features to avoid numerical instability in the solver.
# The scaler is fitted on the training split only and then reused on the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression with L1 regularization (Lasso) using the liblinear solver.
# liblinear shuffles the data internally, so the seed is pinned to make the
# fitted coefficients reproducible across kernel restarts.
logit_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=69)
logit_model.fit(X_train_scaled, y_train)

# Predicted labels for the held-out split; the evaluation cells consume y_pred.
y_pred = logit_model.predict(X_test_scaled)


In [64]:
# Per-class metrics on the held-out split, treating each class in turn as "positive":
#   precision = TP / (TP + FP): share of predicted positives that are truly positive
#   recall    = TP / (TP + FN): share of actual positives that the model identified
#   f1-score  = 2 * (precision * recall) / (precision + recall): harmonic mean of the two
print("Classification Report:", end="\n\n")
print(classification_report(y_true=y_test, y_pred=y_pred))

Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.93      0.95        54
           1       0.94      0.98      0.96        60

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.96       114
weighted avg       0.96      0.96      0.96       114



In [65]:
# Build a labelled confusion matrix: rows are actual classes, columns are
# predicted classes, both named after the dataset's class names.
# The labels are passed explicitly so the matrix always has one row/column per
# class, in the same order as class_names, even if a class happens to be absent
# from y_test or y_pred. This assumes the targets are the integer codes
# 0..n-1 indexing into class_names, as the classification report's 0/1 labels suggest.
class_names = data.target_names
conf_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=list(range(len(class_names)))),
    index=[f"Actual {label}" for label in class_names],
    columns=[f"Predicted {label}" for label in class_names],
)
print("Confusion Matrix:")
print(conf_df)

Confusion Matrix:
                  Predicted malignant  Predicted benign
Actual malignant                   50                 4
Actual benign                       1                59
