In [1]:
#5.2.1 Generating example classification data

import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
%matplotlib notebook

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [9]:
h = 1
sd = 1
n = 50


def gen_data(n, h, sd1, sd2):
    x1 = ss.norm.rvs(-h, sd, n)
    y1 = ss.norm.rvs(0, sd, n)

    x2 = ss.norm.rvs(h, sd, n)
    y2 = ss.norm.rvs(0, sd, n)
    return (x1, y1, x2, y2)

In [10]:
(x1, y1, x2, y2)= gen_data(50, 1, 1, 1.5)

In [11]:
(x1, y1, x2, y2)= gen_data(1000, 1.5, 1, 1.5)

In [13]:
def plot_data(x1,y1,x2,y2):
    plt.figure()
    plt.plot(x1, y1, "o", ms=2)
    plt.plot(x2, y2, "o", ms=2)
    plt.xlabel("$X_1$")
    plt.ylabel("$X_2$")

In [15]:
plot_data(x1,y1,x2,y2)

<IPython.core.display.Javascript object>

In [31]:
#internal_exercise
(x1,y1,x2,y2)=gen_data(1000, 10, 100, 100)
plot_data(x1,y1,x2,y2)
(x1,y1,x2,y2)=gen_data(1000, 20, .5, .5)
plot_data(x1,y1,x2,y2)
(x1,y1,x2,y2)=gen_data(1000, 1, 2, 2.5)
plot_data(x1,y1,x2,y2)
(x1,y1,x2,y2)=gen_data(1000, 0, 1, 1)
plot_data(x1,y1,x2,y2)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
#5.2.2 Logistic Regression 

In [33]:
def prob_to_odds(p):
    if p <= 0 or p >= 1:
        print("Probabilities must be between 0 and 1.")
    return p / (1-p)

In [37]:
prob_to_odds(0.2)

0.25

In [124]:
#5.2.3 Logistic Regression in Code
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(penalty='l2',dual=False,tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    max_iter=100,
    verbose=0,
    warm_start=False,
    n_jobs=1,solver='liblinear',multi_class='ovr')
X=np.vstack((np.vstack((x1, y1)).T, np.vstack((x2, y2)).T))

In [125]:
X.shape

(2000, 2)

In [126]:
n=1000
y=np.hstack((np.repeat(1, n), np.repeat(2, n)))

In [127]:
y.shape

(2000,)

In [134]:
X_train, X_test, y_train, y_test= train_test_split(X, y, train_size= 0.5,random_state=1)

In [135]:
y_train.shape

(1000,)

In [136]:
clf.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', n_jobs=1, solver='liblinear')

In [137]:
clf.score(X_test, y_test)

0.488

In [138]:
clf.predict_proba(np.array([-2, 0]).reshape(1, -1))

array([[0.52228076, 0.47771924]])

In [139]:
clf.predict(np.array([-2, 0]).reshape(1, -1))

array([1])

In [140]:
#5.2.4 Computing Predictive Probabilities Across the Grid

def plot_probs(ax, clf, class_no):
    xx1, xx2 = np.meshgrid(np.arange(-5, 5, 0.1), np.arange(-5, 5, 0.1))
    probs = clf.predict_proba(np.stack((xx1.ravel(), xx2.ravel()), axis=1))
    Z = probs[:,class_no]
    Z = Z.reshape(xx1.shape)
    CS = ax.contourf(xx1, xx2, Z)
    cbar = plt.colorbar(CS)
    plt.xlabel("$X_1$")
    plt.ylabel("$X_2$")

In [141]:
plt.figure(figsize=(5,8))
ax = plt.subplot(211)
plot_probs(ax, clf, 0)
plt.title("Pred. prob for class 1")
ax = plt.subplot(212)
plot_probs(ax, clf, 1)
plt.title("Pred. prob for class 2");

<IPython.core.display.Javascript object>