In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Useful sklearn imports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.datasets import fetch_openml

# Logistic regression

## Data loading and exploration

In [None]:
# Data for logistic regression
def real_data_logistic():
    """Load the Titanic dataset from OpenML and split it into features and target.

    Returns
    -------
    X_titanic : pandas.DataFrame
        Every column of the Titanic frame except 'body' and 'survived'.
    y_titanic : pandas.Series
        The 'survived' column.

    Raises
    ------
    Exception
        Whatever error occurred while fetching or preparing the dataset; it is
        printed and then re-raised unchanged.
    """
    # We will load the Titanic dataset from openml
    try:
        titanic_data = fetch_openml(
            name="titanic",
            version=1,
            as_frame=True
        )
        df_titanic = titanic_data.frame.drop('body', axis=1)
    except Exception as e:
        print(f"Error loading Titanic dataset: {e}.")
        # Re-raise: without the frame there is nothing to return, and falling
        # through would only surface later as an unrelated unbound-name error
        # on df_titanic that hides the real cause.
        raise

    # Features
    X_titanic = df_titanic.drop('survived', axis=1)
    # Target variable
    y_titanic = df_titanic['survived']

    return X_titanic, y_titanic

### Get real data for logistic regression

In [None]:
# Load the Titanic features (X) and the 'survived' target (y)
X, y = real_data_logistic()

### Explore the data, visualizing it, etc.

In [None]:
# What type of data do we have?
# A single print with a newline separator: each label sits on its own line
# above its dtypes, with blank lines between the two sections.
print('X data types:', X.dtypes, '\n', 'y data types:', y.dtypes, sep='\n')

In [None]:
# Preview the first rows of the raw feature frame
X.head()

In [None]:
# Convert the categorical variables 'sex' and 'embarked' to dummy (indicator)
# columns; drop_first=True omits the first level of each variable.
# NOTE(review): this rebinds X, so the cell is presumably not re-runnable
# without reloading the data first -- 'sex' and 'embarked' should no longer
# exist as columns after one pass. TODO confirm.
X = pd.get_dummies(
    X,
    columns=['sex','embarked'],
    drop_first=True)


In [None]:
# Preview the features after dummy-encoding 'sex' and 'embarked'
X.head()

In [None]:
# Let's encode the categorical variables
def encode_categorical(X):
    """Label-encode every object-dtype column of a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose object-dtype columns should be replaced by label codes.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` in which each object-dtype column holds the codes
        produced by ``LabelEncoder``; all other columns are unchanged.
    """
    # Select the categorical columns
    cat_cols = X.select_dtypes(include=['object']).columns
    print(f'Categorical columns: {cat_cols}')

    # Work on a copy so the caller's frame is not altered as a side effect
    # (otherwise the input and the returned frame would be the same object).
    X_out = X.copy()

    # Apply a fresh label encoder to each column
    for col in cat_cols:
        X_out[col] = LabelEncoder().fit_transform(X_out[col])

    return X_out


# Label-encode the remaining object-dtype columns
X_encoded = encode_categorical(X)

In [None]:
# Preview the features after label-encoding the object-dtype columns
X_encoded.head()

In [None]:
# Keep only the rows with no missing feature values, then pick the matching
# target entries by index label so features and target stay aligned
X_features = X_encoded.loc[X_encoded.notna().all(axis=1)]
y_target = y.loc[X_features.index]

## Fit a logistic regression model

### Based on a train-test split

In [None]:
# Split the data into training and test sets, using sklearn
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# reproducible, and stratifying on the target keeps the class proportions
# similar in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.2,
    random_state=42,
    stratify=y_target,
)

In [None]:
# Logistic regression classifier with a fixed seed and an iteration cap of 1000
model_logistic = LogisticRegression(max_iter=1000, random_state=42)

# Fit on the training partition; kept as the cell's last expression so the
# notebook displays the fitted estimator
model_logistic.fit(X_train, y_train)

In [None]:
# Summarise the fitted model: each label is printed on its own line,
# immediately followed by the corresponding value
print('Model coefficients:', model_logistic.coef_, sep='\n')
print('Model intercept:', model_logistic.intercept_, sep='\n')
print('Model classes:', model_logistic.classes_, sep='\n')
print('Model number of iterations:', model_logistic.n_iter_, sep='\n')
# NOTE(review): the score below uses X_features / y_target, i.e. every cleaned
# row rather than only a held-out partition -- confirm this is intended.
print('Model score:', model_logistic.score(X_features, y_target), sep='\n')

In [None]:
# Class predictions for the training and the test partition, in that order
y_pred_train, y_pred_test = (
    model_logistic.predict(features) for features in (X_train, X_test)
)


def _print_evaluation(header, y_true, y_hat):
    """Print a header line, then the accuracy and confusion matrix of y_hat vs. y_true."""
    print(header)
    print("Accuracy:", accuracy_score(y_true, y_hat))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_hat))


# Evaluate the model on both partitions
_print_evaluation("--- Logistic Regression (Train Set) ---", y_train, y_pred_train)
_print_evaluation("\n--- Logistic Regression (Test Set) ---", y_test, y_pred_test)

### Beyond accuracy: ROC curve

In [None]:
# ROC and AUC, using sklearn
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import RocCurveDisplay

# predict_proba returns one column per class, ordered as in
# model_logistic.classes_; column 1 is the probability of classes_[1],
# which is treated as the positive class.
positive_class = model_logistic.classes_[1]
y_prob = model_logistic.predict_proba(X_test)[:, 1]

# Calculate the ROC curve. pos_label comes from the fitted model instead of a
# hard-coded '1', so it always matches the probability column above and the
# actual label dtype (the labels may be strings or integers depending on how
# the dataset was parsed).
fpr, tpr, thresholds = roc_curve(
    y_test,
    y_prob,
    pos_label=positive_class
)
# Calculate the AUC
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)
# Plot the ROC curve, with the diagonal of a random classifier for reference
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, c='blue', label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Show the raw arrays returned by roc_curve for the test set: false positive
# rates, true positive rates, and the corresponding thresholds
fpr, tpr, thresholds

In [None]:
# Plot the test-set ROC curve using sklearn's RocCurveDisplay, which takes the
# fitted estimator and the data directly instead of precomputed fpr/tpr arrays
RocCurveDisplay.from_estimator(
    model_logistic,
    X_test,
    y_test,
    name='Logistic Regression',
    alpha=0.8
)
plt.show()

In [None]:
### How does the model perform on the training set?
# Column 1 of predict_proba is the probability of model_logistic.classes_[1],
# which is treated as the positive class.
positive_class = model_logistic.classes_[1]
y_prob_train = model_logistic.predict_proba(X_train)[:, 1]

# Calculate the ROC curve for the training set. pos_label comes from the
# fitted model instead of a hard-coded '1', so it matches the label dtype
# (string or integer) and the probability column used above.
fpr_train, tpr_train, thresholds_train = roc_curve(
    y_train,
    y_prob_train,
    pos_label=positive_class
)
# Calculate the AUC for the training set
roc_auc_train = roc_auc_score(y_train, y_prob_train)
print("ROC AUC (Train Set):", roc_auc_train)
# Plot the ROC curve using sklearn's RocCurveDisplay for the training set
RocCurveDisplay.from_estimator(
    model_logistic,
    X_train,
    y_train,
    name='Logistic Regression (Train Set)',
    alpha=0.8
)
plt.show()

In [None]:
# Overlay the test (blue) and training (orange) ROC curves on one set of axes,
# with the diagonal of a random classifier as a dashed reference line
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, c='blue', label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot(
    fpr_train,
    tpr_train,
    c='orange',
    label='ROC curve (Train Set area = {:.2f})'.format(roc_auc_train),
)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Precision-Recall curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import PrecisionRecallDisplay

# Column 1 of predict_proba is the probability of model_logistic.classes_[1],
# which is treated as the positive class. Taking pos_label from the fitted
# model instead of a hard-coded '1' keeps it consistent with that column and
# with the actual label dtype (string or integer); a pos_label that is absent
# from the labels makes average_precision_score raise a ValueError.
positive_class = model_logistic.classes_[1]

# Predict probabilities for the training set
y_prob_train = model_logistic.predict_proba(X_train)[:, 1]
# Calculate precision and recall for the training set
precision_train, recall_train, thresholds_pr_train = precision_recall_curve(
    y_train,
    y_prob_train,
    pos_label=positive_class
)
# Calculate the average precision score for the training set
avg_precision_train = average_precision_score(
    y_train, y_prob_train, pos_label=positive_class
)
print("Average Precision (Train Set):", avg_precision_train)

# Predict probabilities for the test set
y_prob = model_logistic.predict_proba(X_test)[:, 1]
# Calculate precision and recall for the test set
precision, recall, thresholds_pr = precision_recall_curve(
    y_test,
    y_prob,
    pos_label=positive_class
)
# Calculate the average precision score for the test set
avg_precision = average_precision_score(y_test, y_prob, pos_label=positive_class)
print("Average Precision (Test Set):", avg_precision)

# Plot the precision-recall curve, for both training and test sets
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, c='blue', label='Precision-Recall curve (area = {:.2f})'.format(avg_precision))
plt.plot(recall_train, precision_train, c='orange', label='Precision-Recall curve (Train Set area = {:.2f})'.format(avg_precision_train))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()


In [None]:
# Plot the test-set precision-recall curve using sklearn's
# PrecisionRecallDisplay, which takes the fitted estimator and the data
# directly instead of precomputed precision/recall arrays
PrecisionRecallDisplay.from_estimator(
    model_logistic,
    X_test,
    y_test,
    name='Logistic Regression',
    alpha=0.8
)
plt.show()

### How does all of the above change with the following?

- Number of features?

- Number of samples?

- Different train-test splits?