# Logistic regression for a 2D classification problem

Import the required libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

## Load and visualize data

In [None]:
# Read the samples (columns used below: x1, x2, y) and preview the first ten rows
data = pd.read_csv(filepath_or_buffer="2d-classification-data.csv")
data.head(n=10)

In [None]:
# Raw samples in the (x1, x2) plane, colored by the label column y
data.plot(kind="scatter", x="x1", y="x2", c="y");

## Define and fit the model

In [None]:
# Select model input variables (copied, so X is independent of `data`)
feature_columns = ["x1", "x2"]
X = data.loc[:, feature_columns].copy()
X

In [None]:
# Define a model and fit it in one step (fit() returns the estimator itself).
# Note: sklearn.linear_model.LogisticRegression adds an intercept automatically.
# penalty=None disables regularization, i.e. a plain maximum-likelihood fit.
model = LogisticRegression(penalty=None).fit(X=X, y=data["y"])
model

In [None]:
# Parameters: intercept beta_0 and one weight per input column (x1, x2)
beta_0 = model.intercept_.item()
beta_1, beta_2 = model.coef_[0, :]
print(beta_0, beta_1, beta_2)

In [None]:
# Alternative: statsmodels
# The constant column is added explicitly here via sm.add_constant, so the
# fitted parameters are ordered (const, x1, x2).
design_matrix = sm.add_constant(X)
model_sm = sm.Logit(endog=data["y"], exog=design_matrix)
results = model_sm.fit()
print(results.summary())

# Optional: use the statsmodels estimates instead of the scikit-learn ones
# beta_0, beta_1, beta_2 = results.params
# print(beta_0, beta_1, beta_2)

## Use the model for prediction

In [None]:
# Prediction of training samples: hard labels and class probabilities
class_probabilities = model.predict_proba(X)
data["prediction"] = model.predict(X)
# Column 1 holds the probability of the second class (presumably y = 1)
data["probability"] = class_probabilities[:, 1]

In [None]:
# Alternative: statsmodels
# Evaluate the fitted model once on its training data and derive both columns
# from that array; labels are obtained by thresholding at 0.5.
probability_sm = model_sm.predict(results.params)
data["prediction_sm"] = (probability_sm > 0.5) * 1.0
data["probability_sm"] = probability_sm

## Visualization

In [None]:
# Predictions with decision boundary
# The boundary is the line beta_0 + beta_1 * x1 + beta_2 * x2 = 0, solved for x2.
data.plot(kind="scatter", x="x1", y="x2", c="prediction");
x1 = np.array([-3, 3])
x2 = -(beta_0 + beta_1 * x1) / beta_2
plt.plot(x1, x2);

In [None]:
# Decision boundary vs. ground truth
# Same boundary line as for the predictions, drawn over the true labels y.
data.plot(kind="scatter", x="x1", y="x2", c="y");
x1 = np.array([-3, 3])
x2 = -(beta_0 + beta_1 * x1) / beta_2
plt.plot(x1, x2);

In [None]:
# Probabilities
# Samples colored by the predicted probability, with the decision boundary
# (beta_0 + beta_1 * x1 + beta_2 * x2 = 0) overlaid.
data.plot(kind="scatter", x="x1", y="x2", c="probability");
x1 = np.array([-3, 3])
x2 = -(beta_0 + beta_1 * x1) / beta_2
plt.plot(x1, x2);

In [None]:
# data.to_csv("results.csv", index=False)

In [None]:
# Bonus: Visualize learned decision probability (here: with statsmodels)
from matplotlib import cm

# Evaluate the model on a regular grid over the (x1, x2) plane.
# The grid arrays get their own names so that the feature matrix `X` used by
# the earlier cells is not overwritten; otherwise re-running those cells after
# this one would silently operate on the grid instead of the data.
n_grid = 100
grid_x1, grid_x2 = np.meshgrid(np.linspace(-3, 3, n_grid), np.linspace(-4, 4, n_grid))

# One row per grid point: (x1, x2)
points = np.column_stack([grid_x1.ravel(), grid_x2.ravel()])
# Prepend the constant column so the layout matches the fitted design matrix
probs = model_sm.predict(results.params, sm.add_constant(points))

# 2D view: grid points colored by predicted probability
plt.scatter(grid_x1, grid_x2, c=probs);
plt.colorbar();
plt.xlabel("$x_1$");
plt.ylabel("$x_2$");
plt.title("Probability $p(y = 1 | x)$");

# 3D view: probability as a surface over the (x1, x2) plane
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
surf = ax.plot_surface(grid_x1,
                       grid_x2,
                       probs.reshape(grid_x1.shape),
                       cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)
ax.set_xlabel("$x_1$");
ax.set_ylabel("$x_2$");
ax.set_zlabel("$p(y = 1 | x)$");
ax.view_init(30, 360-155);
ax.zaxis.labelpad=-2.5;
fig.tight_layout();