In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Useful sklearn imports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.datasets import fetch_openml

# Logistic regression

## Data loading and exploration

In [2]:
# Data for logistic regression
def real_data_logistic():
    # We will load the Titanic dataset form openml
    try:
        titanic_data = fetch_openml(
            name="titanic",
            version=1,
            as_frame=True
        )
        df_titanic = titanic_data.frame.drop('body', axis=1)
    except Exception as e:
        print(f"Error loading Titanic dataset: {e}.")

    # Features
    X_titanic = df_titanic.drop('survived', axis=1)
    # Target variable
    y_titanic = df_titanic['survived']

    return X_titanic, y_titanic

### Get real data for logistic regression

In [3]:
X, y = real_data_logistic()

### Explore the data, visualizing it, etc.

In [4]:
# What type of data do we have?
print('X data types:')
print(X.dtypes)
print('\n')
print('y data types:')
print(y.dtypes)

X data types:
pclass          int64
name           object
sex          category
age           float64
sibsp           int64
parch           int64
ticket         object
fare          float64
cabin          object
embarked     category
boat           object
home.dest      object
dtype: object


y data types:
category


In [5]:
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Convert categorical variables to dummies
X = pd.get_dummies(
    X,
    columns=['sex','embarked'],
    drop_first=True)


In [7]:
X.head()

Unnamed: 0,pclass,name,age,sibsp,parch,ticket,fare,cabin,boat,home.dest,sex_male,embarked_Q,embarked_S
0,1,"Allen, Miss. Elisabeth Walton",29.0,0,0,24160,211.3375,B5,2.0,"St Louis, MO",False,False,True
1,1,"Allison, Master. Hudson Trevor",0.9167,1,2,113781,151.55,C22 C26,11.0,"Montreal, PQ / Chesterville, ON",True,False,True
2,1,"Allison, Miss. Helen Loraine",2.0,1,2,113781,151.55,C22 C26,,"Montreal, PQ / Chesterville, ON",False,False,True
3,1,"Allison, Mr. Hudson Joshua Creighton",30.0,1,2,113781,151.55,C22 C26,,"Montreal, PQ / Chesterville, ON",True,False,True
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,1,2,113781,151.55,C22 C26,,"Montreal, PQ / Chesterville, ON",False,False,True


In [8]:
# Let's encode the categorical variables
def encode_categorical(X):
    # Select the categorical columns
    cat_cols = X.select_dtypes(include=['object']).columns
    print(f'Categorical columns: {cat_cols}')
    
    # Create a label encoder object
    le = LabelEncoder()

    # Apply the label encoder to each column
    for col in cat_cols:
        X[col] = le.fit_transform(X[col])

    return X


# The rest, simply encode
X_encoded = encode_categorical(X)

Categorical columns: Index(['name', 'ticket', 'cabin', 'boat', 'home.dest'], dtype='object')


In [9]:
X_encoded.head()

Unnamed: 0,pclass,name,age,sibsp,parch,ticket,fare,cabin,boat,home.dest,sex_male,embarked_Q,embarked_S
0,1,21,29.0,0,0,187,211.3375,43,11,308,False,False,True
1,1,23,0.9167,1,2,49,151.55,79,2,230,True,False,True
2,1,24,2.0,1,2,49,151.55,79,27,230,False,False,True
3,1,25,30.0,1,2,49,151.55,79,27,230,True,False,True
4,1,26,25.0,1,2,49,151.55,79,27,230,False,False,True


In [10]:
# Drop NaN values
X_features = X_encoded.dropna()
y_target = y.loc[X_features.index]

## Fit a logistic regression model

In [11]:
# Define a Logistic Regression model
model_logistic = LogisticRegression(
    random_state=42,
    max_iter=1000
  )

# MLE fitting
model_logistic.fit(
    X_features,
    y_target
)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Look at the coefficients
print('Model coefficients:')
print(model_logistic.coef_)
print('Model intercept:')
print(model_logistic.intercept_)
print('Model classes:')
print(model_logistic.classes_)
print('Model number of iterations:')
print(model_logistic.n_iter_)
print('Model score:')
print(model_logistic.score(X_features, y_target))

Model coefficients:
[[ 9.06430852e-02  1.05799938e-04 -2.07071708e-02 -1.13347297e-01
  -1.95116291e-01 -9.79387959e-04 -5.57286576e-04 -6.58450548e-03
  -9.70339151e-01 -2.97942858e-03 -2.59614568e+00 -2.48367951e+00
  -5.35545298e-01]]
Model intercept:
[28.31151998]
Model classes:
['0' '1']
Model number of iterations:
[1000]
Model score:
0.9655502392344497


In [13]:
# Evaluate the model
y_pred_logistic = model_logistic.predict(X_features)
y_pred_proba_logistic = model_logistic.predict_proba(X_features)[:, 1]  # Probabilities for ROC AUC

print("--- Logistic Regression (Titanic Dataset) ---")
print("Accuracy:", accuracy_score(y_target, y_pred_logistic))
print("Confusion Matrix:\n", confusion_matrix(y_target, y_pred_logistic))

--- Logistic Regression (Titanic Dataset) ---
Accuracy: 0.9655502392344497
Confusion Matrix:
 [[614   4]
 [ 32 395]]


# Multinomial logistic regression

## Data loading and exploration

In [14]:
def real_data_multinomial():
    # Load the Iris dataset
    from sklearn.datasets import load_iris
    iris_data = load_iris()
    df_iris = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
    df_iris['target'] = iris_data.target  # 0, 1, or 2

    # Features
    X_iris = df_iris.drop('target', axis=1)
    # No need for significant preprocessing here, but good practice to scale
    scaler = StandardScaler()
    X_iris_scaled = scaler.fit_transform(X_iris) #scale the data
    # Target variable
    y_iris = df_iris['target']

    return X_iris_scaled, y_iris, iris_data.feature_names, iris_data.target_names

### Get real data for multinomial regression

In [15]:
X, y, feature_names, target_name = real_data_multinomial()

### Feel free to explore the data, visualizing it, etc.

In [16]:
# What type of data do we have?
print('X with features={}'.format(feature_names))
print(X)

X with features=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[[-9.00681170e-01  1.01900435e+00 -1.34022653e+00 -1.31544430e+00]
 [-1.14301691e+00 -1.31979479e-01 -1.34022653e+00 -1.31544430e+00]
 [-1.38535265e+00  3.28414053e-01 -1.39706395e+00 -1.31544430e+00]
 [-1.50652052e+00  9.82172869e-02 -1.28338910e+00 -1.31544430e+00]
 [-1.02184904e+00  1.24920112e+00 -1.34022653e+00 -1.31544430e+00]
 [-5.37177559e-01  1.93979142e+00 -1.16971425e+00 -1.05217993e+00]
 [-1.50652052e+00  7.88807586e-01 -1.34022653e+00 -1.18381211e+00]
 [-1.02184904e+00  7.88807586e-01 -1.28338910e+00 -1.31544430e+00]
 [-1.74885626e+00 -3.62176246e-01 -1.34022653e+00 -1.31544430e+00]
 [-1.14301691e+00  9.82172869e-02 -1.28338910e+00 -1.44707648e+00]
 [-5.37177559e-01  1.47939788e+00 -1.28338910e+00 -1.31544430e+00]
 [-1.26418478e+00  7.88807586e-01 -1.22655167e+00 -1.31544430e+00]
 [-1.26418478e+00 -1.31979479e-01 -1.34022653e+00 -1.44707648e+00]
 [-1.87002413e+00 -1.31979479e

In [17]:
print('y with target={}'.format(target_name))
print(y)

y with target=['setosa' 'versicolor' 'virginica']
0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int64


## Fit a multinomial regression model

In [18]:
# Define a multinomial logistic regression
model_multinomial = LogisticRegression(
    random_state=42,
    multi_class='multinomial',
    solver='lbfgs', #use solver='lbfgs'
    max_iter=1000
)

# Fit the model
model_multinomial.fit(X, y)



In [19]:
# Evaluate the model
y_pred_multinomial = model_multinomial.predict(X)
print("\n--- Multinomial Logistic Regression (Iris Dataset) ---")
print("Accuracy:", accuracy_score(y, y_pred_multinomial))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_multinomial))


--- Multinomial Logistic Regression (Iris Dataset) ---
Accuracy: 0.9733333333333334
Confusion Matrix:
 [[50  0  0]
 [ 0 47  3]
 [ 0  1 49]]
