In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Useful sklearn imports
from sklearn.datasets import fetch_openml, load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
)
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Logistic regression

## Data loading and exploration

In [None]:
# Data for logistic regression
def real_data_logistic():
    """Load the Titanic dataset from OpenML and split it into features and target.

    Returns
    -------
    X_titanic : pandas.DataFrame
        Every column of the OpenML frame except 'body' and 'survived'.
    y_titanic : pandas.Series
        The 'survived' column.

    Raises
    ------
    RuntimeError
        If the dataset cannot be fetched. The original exception is chained
        as the cause.
    """
    # We will load the Titanic dataset from openml
    try:
        titanic_data = fetch_openml(
            name="titanic",
            version=1,
            as_frame=True
        )
    except Exception as e:
        # Stop here with the real cause. Merely printing and continuing would
        # fail a few lines later on an undefined `df_titanic`, hiding the
        # actual problem (e.g. no network access).
        raise RuntimeError(f"Error loading Titanic dataset: {e}.") from e

    df_titanic = titanic_data.frame.drop('body', axis=1)

    # Features
    X_titanic = df_titanic.drop('survived', axis=1)
    # Target variable
    y_titanic = df_titanic['survived']

    return X_titanic, y_titanic

### Get real data for logistic regression

In [None]:
# Load the Titanic features (X) and the 'survived' target (y)
X, y = real_data_logistic()

### Explore the data (inspect it, visualize it, etc.)

In [None]:
# What type of data do we have?
# Each header and its dtypes go through one print call; sep='\n' keeps them
# on separate lines, so the text written is the same as with separate prints.
print('X data types:', X.dtypes, sep='\n')
print('\n')
print('y data types:', y.dtypes, sep='\n')

In [None]:
# Preview the first rows of the feature frame (shown as the cell's output)
X.head()

In [None]:
# Convert categorical variables to dummies
# 'sex' and 'embarked' are expanded into indicator columns; drop_first=True
# omits the first level of each so the indicators are not redundant.
# NOTE: X is overwritten here, so this cell is not idempotent. After one run
# the original 'sex'/'embarked' columns are gone; re-run the loading cell
# before running this cell again.
X = pd.get_dummies(
    X,
    columns=['sex','embarked'],
    drop_first=True)


In [None]:
# Preview X again, now with the dummy columns for 'sex' and 'embarked'
X.head()

In [None]:
# Let's encode the categorical variables
def encode_categorical(X):
    """Label-encode every object-dtype column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose object-dtype columns should be replaced by label codes.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` in which each object-dtype column holds the output of
        ``LabelEncoder.fit_transform`` for that column. ``X`` itself is left
        unchanged.
    """
    # Select the categorical columns
    cat_cols = X.select_dtypes(include=['object']).columns
    print(f'Categorical columns: {cat_cols}')

    # Work on a copy so the caller's frame is not modified as a side effect
    X_out = X.copy()

    # Apply a label encoder to each column. A fresh encoder is used per column
    # because every column has its own set of classes.
    for col in cat_cols:
        X_out[col] = LabelEncoder().fit_transform(X_out[col])

    return X_out


# The rest, simply encode: label-encode the remaining object-dtype columns
# (the ones that were not turned into dummies)
X_encoded = encode_categorical(X)

In [None]:
# Preview the encoded feature frame
X_encoded.head()

In [None]:
# Drop NaN values
# Keep only the rows without any missing feature value ...
X_features = X_encoded.dropna(axis=0, how='any')
# ... and pick the target entries for exactly those rows (matched by index label)
y_target = y.loc[X_features.index]

## Fit a logistic regression model

In [None]:
# Define a Logistic Regression model
# max_iter is raised above scikit-learn's default because the features are
# not scaled and the solver may otherwise stop before it has converged.
model_logistic = LogisticRegression(max_iter=1000)

# MLE fitting
# Note: LogisticRegression applies an L2 penalty by default, so this is a
# regularized maximum-likelihood fit.
model_logistic.fit(X_features, y_target)

In [None]:
# Look at the coefficients
# Each label and its value go through one print call; sep='\n' keeps them on
# separate lines.
print('Model coefficients:', model_logistic.coef_, sep='\n')
print('Model intercept:', model_logistic.intercept_, sep='\n')
print('Model classes:', model_logistic.classes_, sep='\n')
print('Model number of iterations:', model_logistic.n_iter_, sep='\n')
print('Model score:', model_logistic.score(X_features, y_target), sep='\n')

In [None]:
# Evaluate the model
# NOTE(review): predictions are made on X_features itself; if the model was
# fit on the same rows, the numbers below are in-sample metrics.
y_pred_logistic = model_logistic.predict(X_features)
# Second column of predict_proba; kept for a later ROC AUC computation
# (not used in the cells shown here).
y_pred_proba_logistic = model_logistic.predict_proba(X_features)[:, 1]  # Probabilities for ROC AUC

print("--- Logistic Regression (Titanic Dataset) ---")
print("Accuracy:", accuracy_score(y_target, y_pred_logistic))
print("Confusion Matrix:\n", confusion_matrix(y_target, y_pred_logistic))

# Multinomial logistic regression

## Data loading and exploration

In [None]:
def real_data_multinomial():
    """Load the Iris dataset and standardize its features.

    Returns
    -------
    X_iris_scaled
        Output of ``StandardScaler.fit_transform`` on the Iris feature columns.
    y_iris : pandas.Series
        Class labels (0, 1, or 2), named 'target'.
    feature_names
        ``feature_names`` as provided by ``load_iris``.
    target_names
        ``target_names`` as provided by ``load_iris``.
    """
    # Load the Iris dataset
    iris_data = load_iris()

    # Features
    X_iris = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
    # No need for significant preprocessing here, but good practice to scale
    X_iris_scaled = StandardScaler().fit_transform(X_iris)

    # Target variable: 0, 1, or 2
    y_iris = pd.Series(iris_data.target, name='target')

    return X_iris_scaled, y_iris, iris_data.feature_names, iris_data.target_names

### Get real data for multinomial regression

In [None]:
# Load the Iris data. X and y are rebound here and no longer refer to the
# Titanic data from the logistic regression section; X is now the output of
# StandardScaler.fit_transform.
X, y, feature_names, target_name = real_data_multinomial()

### Feel free to explore the data (inspect it, visualize it, etc.)

In [None]:
# What type of data do we have?
# Header with the feature names, then the scaled feature values on the next line
print('X with features={}'.format(feature_names), X, sep='\n')

In [None]:
# Header with the class names, then the target values on the next line
print('y with target={}'.format(target_name), y, sep='\n')

## Fit a multinomial regression model

In [None]:
# Define a multinomial logistic regression
# With more than two classes, the default lbfgs solver fits a multinomial
# (softmax) model, so no extra argument is needed to select it.
model_multinomial = LogisticRegression(
    max_iter=1000,
)

# Fit the model
model_multinomial.fit(
    X,
    y,
)

In [None]:
# Evaluate the model
# NOTE(review): predictions are made on X itself; if the model was fit on the
# same rows, the numbers below are in-sample metrics.
y_pred_multinomial = model_multinomial.predict(X)
print("\n--- Multinomial Logistic Regression (Iris Dataset) ---")
print("Accuracy:", accuracy_score(y, y_pred_multinomial))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_multinomial))