In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def plot_customers(X, y, xlabel='Inches (in)', ylabel='Pounds (lb)'):
    """Scatter the rows of X, coloured and labelled by their class in y.

    Rows with y == 0 are drawn in green as 'Not large'; rows with y == 1 are
    drawn in yellow as 'Large'. The first column of X is plotted on the x-axis
    and the second on the y-axis. The legend is left for the caller to draw.
    """
    class_styles = (('g', 'Not large'), ('y', 'Large'))
    for class_id, (color, label) in enumerate(class_styles):
        in_class = y == class_id
        plt.scatter(X[:, 0][in_class], X[:, 1][in_class], color=color, label=label)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

In [None]:
# Seed NumPy's global RNG so the noise drawn below is reproducible.
np.random.seed(1)
# Heights on the half-open range [60, 78) in steps of 0.1.
inches = np.arange(60, 78, .1)
# One normally distributed noise term (scale 10) per height value.
random_fluctuations = np.random.normal(scale=10, size=inches.size)

In [None]:
# Weight is a linear function of height plus the noise drawn above.
pounds = 4 * inches - 130 + random_fluctuations

In [None]:
# Feature matrix: one row per sample, columns are (inches, pounds).
X = np.array([inches, pounds]).T

In [None]:
# Label 1 when a row exceeds both thresholds (first column > 72 and second
# column > 160), otherwise 0.
y = ((X[:,0] > 72) & (X[:,1] > 160)).astype(int)

In [None]:
# Scatter the samples coloured by label, then draw the legend.
plot_customers(X, y)
plt.legend()

In [None]:
def boundary(inches):
    """Return the hard-coded boundary line's value at the given height(s)."""
    slope, intercept = -3.5, 415
    return slope * inches + intercept

In [None]:
# Draw the hard-coded boundary line over the labelled scatter plot.
plt.plot(X[:, 0], boundary(X[:,0]), color='k', label='Boundary')
plot_customers(X, y)
plt.legend()

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Predict class 1 for every sample whose weight lies above the boundary line at
# its height. The comprehension keeps its loop variables local, so the
# module-level `inches` array is no longer overwritten by a scalar (which would
# break a re-run of the cell that computes `pounds`).
y_pred = [int(weight > boundary(height)) for height, weight in X]
# scikit-learn's signature is f1_score(y_true, y_pred).
f_measure = f1_score(y, y_pred)
f"The f-measure is {f_measure:.2f}"

In [None]:
# The same boundary written as weights for the vector [inches, lbs, 1]:
# 3.5 * inches + lbs - 415 > 0 is algebraically lbs > -3.5 * inches + 415.
weights = np.array([3.5, 1, -415])
# Classify each row by the sign of its dot product with the weights.
predictions = [int((weights @ [inches, lbs, 1]) > 0) for inches, lbs in X] 

In [None]:
predictions

In [None]:
# The weight-vector form must reproduce the earlier boundary-based predictions.
assert y_pred == predictions

In [None]:
# Append a column of ones to X so the bias weight is applied by the matrix product.
M = np.column_stack((X, np.ones(X.shape[0])))

In [None]:
M

In [None]:
# Vectorised classification: a single matrix-vector product covers every row.
predictions = (M @ weights > 0).astype(int)

In [None]:
predictions

In [None]:
# predictions is an array here, so convert it to a list before comparing with y_pred.
assert predictions.tolist() == y_pred

In [None]:
def linear_classifier(X, weights):
    """Classify each row of X as 1 or 0 using a weight vector whose last entry is the bias.

    A column of ones is appended to X so that the final weight acts as the
    bias; a row is assigned 1 when its weighted sum is strictly positive.
    """
    ones_column = np.ones(X.shape[0])
    augmented = np.column_stack([X, ones_column])
    scores = augmented @ weights
    return (scores > 0).astype(int)

In [None]:
# Confirm the helper reproduces the earlier predictions.
predictions = linear_classifier(X, weights)

In [None]:
assert predictions.tolist() == y_pred

In [None]:
# Replace the hand-set weights with three random draws (seeded for reproducibility).
np.random.seed(0)
weights = np.random.normal(size=3)

In [None]:
# Note: y_pred is rebound here from a list to the array returned by the classifier.
y_pred = linear_classifier(X, weights)

In [None]:
f_measure = f1_score(y_pred, y)

In [None]:
f_measure

In [None]:
weights

In [None]:
def get_bias_shift(predicted, actual):
    """Return 0 for a correct prediction, 1 if it is too high, -1 otherwise."""
    return 0 if predicted == actual else (1 if predicted > actual else -1)

In [None]:
# For every 0/1 combination the branching helper agrees with the plain
# difference predicted - actual.
for predicted, actual in [(0, 0), (1, 0), (0, 1), (1, 1)]:
    bias_shift = get_bias_shift(predicted, actual)
    assert bias_shift == predicted - actual

In [None]:
def get_bias_shift(predicted, actual, learning_rate=0.1):
    """Return the prediction error (predicted - actual) scaled by learning_rate."""
    error = predicted - actual
    return learning_rate * error

In [None]:
def predict(v, weights):
    """Return 1 if the dot product of v and weights is strictly positive, else 0."""
    score = v @ weights
    return int(score > 0)

In [None]:
# Remember the bias (the last weight) before adjusting it.
starting_bias = weights[-1]

In [None]:
# One pass over the samples, nudging only the bias after each prediction.
# M is the feature matrix with the appended ones column built earlier.
for i, actual in enumerate(y):
    predicted = predict(M[i], weights)
    bias_shift = get_bias_shift(predicted, actual)
    # In-place update: re-running this cell shifts the bias again.
    weights[-1] -= bias_shift

In [None]:
new_bias = weights[-1]

In [None]:
print(f"Our starting bias equaled {starting_bias:.2f}.")
print(f"The adjusted bias equals {new_bias:.2f}.")

In [None]:
# Re-score the classifier with the adjusted bias.
y_pred = linear_classifier(X, weights)

In [None]:
f_measure = f1_score(y_pred, y)

In [None]:
f"The f-measure is {f_measure:.2f}"

In [None]:
# Snapshot of the weights taken before the in-place update below.
old_weights = weights.copy()

In [None]:
# One pass over the samples, now shifting every weight in proportion to the
# matching entry of the sample's row in M (the ones column drives the bias).
for i, actual in enumerate(y):
    predicted = predict(M[i], weights)
    bias_shift = get_bias_shift(predicted, actual)
    weights -= bias_shift * M[i]


In [None]:
y_pred = linear_classifier(X, weights)

In [None]:
f_measure = f1_score(y_pred, y)

In [None]:
f_measure

In [None]:
# Start again from the same seeded random weights as before.
np.random.seed(0)
weights = np.random.normal(size=3)

In [None]:
# Collects one f-measure per pass of the loop below.
f_measures = []

In [None]:
# 1000 passes over the data. The f-measure is recorded before each pass, so
# f_measures[-1] reflects the weights as they were before the final pass.
for _ in range(1000):
    y_pred = linear_classifier(X, weights)
    f_measures.append(f1_score(y_pred, y))
    for i, actual in enumerate(y):
        predicted = predict(M[i], weights)
        bias_shift = get_bias_shift(predicted, actual)
        weights -= bias_shift * M[i]

In [None]:
# Report the last recorded f-measure and plot the whole history.
print(f'The f-measure after 1000 iterations is {f_measures[-1]:.2f}')
plt.plot(range(len(f_measures)), f_measures)
plt.xlabel('Iteration')
plt.ylabel('F-measure')

In [None]:
def train(X, y, predict=predict):
    """Fit linear-classifier weights with a learning rate that decays as 1/k.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : sequence of labels, one per row of X.
    predict : callable(row, weights) giving the prediction for a single
        bias-augmented row; defaults to the module-level `predict`.

    Returns
    -------
    (weights, f_measures) : the trained weight vector (bias last) and the list
        of f-measures recorded at the start of each pass.

    The initial weights are drawn from NumPy's global RNG, so seed it before
    calling for reproducible results.
    """
    # Bias-augmented feature matrix: built once and reused both for scoring
    # each pass and for the per-sample updates.
    M = np.column_stack([X, np.ones(X.shape[0])])
    weights = np.random.normal(size=X.shape[1]+1)
    f_measures = []
    # k runs from 1 to 999, i.e. 999 passes; every update is divided by k.
    for k in range(1, 1000):
        # Same thresholding as linear_classifier(X, weights), but without
        # rebuilding the augmented matrix on every pass.
        y_pred = (M @ weights > 0).astype(int)
        # scikit-learn's signature is f1_score(y_true, y_pred).
        f_measures.append(f1_score(y, y_pred))
        for i, actual in enumerate(y):
            predicted = predict(M[i], weights)
            bias_shift = get_bias_shift(predicted, actual)
            weights -= bias_shift * M[i] / k
    return weights, f_measures

In [None]:
# Train with the helper; its initial weights come from NumPy's global RNG state.
weights, f_measures = train(X, y)

In [None]:
# NOTE(review): train() loops over range(1, 1000), i.e. 999 passes, so the
# "1000 iterations" wording below is approximate.
print(f'The f-measure after 1000 iterations is {f_measures[-1]:.2f}')
plt.plot(range(len(f_measures)), f_measures)
plt.xlabel('Iteration')
plt.ylabel('F-measure')

In [None]:
# Unpack the trained weights. The decision boundary is where
# inches_coef * inches + lbs_coef * lbs + bias == 0, solved below for lbs.
inches_coef, lbs_coef, bias = weights
def new_boundary(inches):
    """Return the trained boundary's value at the given inches, using the unpacked module-level weights."""
    return -(inches_coef * inches + bias) / lbs_coef
# Dashed line: trained boundary; solid line: the original hard-coded boundary.
plt.plot(X[:,0], new_boundary(X[:,0]), color='k', linestyle='--', label='Trained Boundary', linewidth=2)
plt.plot(X[:,0], boundary(X[:,0]), color='k', label='Initial Boundary')
plot_customers(X, y)
plt.legend()

In [None]:
# Per-column mean of the feature matrix.
means = X.mean(axis=0)

In [None]:
# Per-column standard deviation of the feature matrix.
stds = X.std(axis=0)

In [None]:
means

In [None]:
stds

In [None]:
def standardize(X):
    """Centre X on the module-level `means` and scale it by the module-level `stds`."""
    centered = X - means
    return centered / stds

In [None]:
# Standardized copy of the features.
X_s = standardize(X)

In [None]:
X_s

In [None]:
# After standardization each column has mean 0 ...
assert np.allclose(X_s.mean(axis=0), 0)

In [None]:
# ... and standard deviation 1.
assert np.allclose(X_s.std(axis=0), 1.)

In [None]:
# Seed the global RNG so train() draws reproducible initial weights.
np.random.seed(0)

In [None]:
weights, f_measures = train(X_s, y)

In [None]:
print(f'After standardization, the f-measure is {f_measures[-1]:.2f}')

In [None]:
def plot_boundary(weights):
    """Plot the line a*x + b*y + c = 0 for weights (a, b, c) over the standardized data.

    Reads the module-level X_s and y, draws the dashed boundary, scatters the
    samples via plot_customers and adds the legend.
    """
    x_coef, y_coef, intercept = weights
    xs = X_s[:, 0]
    boundary_ys = -(x_coef * xs + intercept) / y_coef
    plt.plot(xs, boundary_ys, color='k', linestyle='--', label='Trained Boundary', linewidth=2)
    plot_customers(
        X_s,
        y,
        xlabel='Standardized Inches',
        ylabel='Standardized Pounds',
    )
    plt.legend()

In [None]:
plot_boundary(weights)

In [None]:
# Two new samples, one per row after the reshape.
new_data = np.array([63, 110, 76, 199]).reshape(2, 2)

In [None]:
new_data

In [None]:
# Standardize with the stored means/stds before classifying with the trained weights.
predictions = linear_classifier(standardize(new_data), weights)

In [None]:
predictions

In [None]:
# Train five times (each call draws fresh initial weights from the seeded RNG)
# and count the runs whose last recorded f-measure is below 0.97.
np.random.seed(0)
poor_train_count = sum([train(X_s, y)[1][-1] < 0.97 for _ in range(5)])

In [None]:
print("The f-measure fell below our baseline of 0.97 in "
f"{poor_train_count} out of 5 training instances")

In [None]:
from scipy import stats

In [None]:
# Grid of values on [-10, 10) at which the curves below are evaluated.
z = np.arange(-10, 10, .1)
# Sanity check: the normal CDF equals one half at zero.
assert stats.norm.cdf(0.0) == .5

In [None]:
plt.plot(z, stats.norm.cdf(z))
plt.xlabel('Directed distance')
plt.ylabel('Confidence in Class 1')

In [None]:
from math import e
# Overlay the logistic curve 1 / (1 + e^-z) on the normal CDF for comparison.
plt.plot(z, stats.norm.cdf(z), label='CDF')
plt.plot(z, 1 / (1 + e ** -z), label='Logistic Curve', linestyle='--')
plt.xlabel('Directed Distance')
plt.ylabel('Confidence in Class 1')
plt.legend()

In [None]:
# Rebuild M from the standardized features; this rebinds the earlier M.
M = np.column_stack([X_s, np.ones(X_s.shape[0])])
# Weighted sum of each augmented row under the current weights.
distances = M @ weights

In [None]:
distances

In [None]:
# Pass every weighted sum through the logistic function.
likelihoods = 1 / (1 + e ** -distances)

In [None]:
# Compare the smooth logistic output with the hard threshold distances > 0.
plt.scatter(distances, likelihoods, label='Class 1 Likelihood')
plt.scatter(distances, distances > 0,
label='Perceptron Prediction', marker='x')
plt.xlabel('Directed Distance')
plt.legend()

In [None]:
def logistic_predict(v, weights):
    """Return the logistic function applied to the dot product of weights and v."""
    distance = weights @ v
    return 1 / (1 + e ** -distance)

In [None]:
def train_logistic(X, y):
    """Run train() with logistic_predict as the per-sample predictor."""
    return train(X, y, predict=logistic_predict)

In [None]:
# Seeded logistic training; keep only the weights (first element of the returned pair).
np.random.seed(0)
logistic_weights = train_logistic(X_s, y)[0]

In [None]:
plot_boundary(logistic_weights)

In [None]:
# Repeat logistic training five times from a fixed seed and count the runs whose
# last recorded f-measure falls below the 0.97 baseline.
np.random.seed(0)
poor_train_count = sum([train_logistic(X_s, y)[1][-1] < 0.97 for _ in range(5)])
# The f-prefix is required so {poor_train_count} is interpolated rather than
# printed literally.
print(f"The f-measure fell below our baseline of 0.97 in {poor_train_count} out of 5 training instances")

In [None]:
np.random.seed(0)
# Small noise (scale 0.1), one value per row of X; rebinds random_fluctuations.
random_fluctuations = np.random.normal(size=X.shape[0],scale=.1)

In [None]:
# Waist is the first column of X scaled by 0.45, plus the small noise above.
waist = .45 * X[:, 0] + random_fluctuations

In [None]:
# Append the standardized waist as an extra column next to the standardized features.
X_w_waist = np.column_stack([X_s, (waist - waist.mean())/ waist.std()])

In [None]:
# train() sizes the weight vector as number of columns + 1 (the bias).
weights, f_measures = train_logistic(X_w_waist, y)

In [None]:
print("Our trained model has the following weights:")
print(np.round(weights, 2))
print(f'\nThe f-measure is {f_measures[-1]:.2f}')

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# scikit-learn logistic regression constructed with its default arguments.
clf = LogisticRegression()

In [None]:
# Fit on the standardized features.
clf.fit(X_s, y)

In [None]:
X_s.shape

In [None]:
clf.coef_

In [None]:
clf.intercept_

In [None]:
coefficients = clf.coef_
bias = clf.intercept_
print(f"The coefficients equal {np.round(coefficients, 2)}")
print(f"The bias equals {np.round(bias, 2)}")
# plot_boundary unpacks a flat three-element vector, so join the first
# coefficient row with the intercept.
plot_boundary(np.hstack([clf.coef_[0], clf.intercept_]))

In [None]:
# Two new samples, standardized with the stored means/stds before prediction.
new_data = np.array([[63, 110], [76, 199]])
predictions = clf.predict(standardize(new_data))
print(predictions)

In [None]:
# Class probabilities for the same two samples.
probabilities = clf.predict_proba(standardize(new_data))
print(probabilities)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
standard_scaler = StandardScaler()

In [None]:
# Fit the scaler on X and transform X in one step.
X_transformed = standard_scaler.fit_transform(X)

In [None]:
assert np.allclose(X_transformed.mean(axis=0), 0)

In [None]:
assert np.allclose(X_transformed.std(axis=0), 1)

In [None]:
# The scaler's output matches the manual standardization.
assert np.allclose(X_transformed, X_s)

In [None]:
# Predictions on scaler-transformed new_data equal those computed via standardize().
data_transformed = standard_scaler.transform(new_data)
assert np.array_equal(clf.predict(data_transformed), predictions)

In [None]:
from sklearn.datasets import load_wine

In [None]:
data = load_wine()

In [None]:
# NOTE: despite its name, this holds the class names themselves, not a count.
num_classes = data.target_names

In [None]:
len(num_classes)

In [None]:
# NOTE: likewise, this holds the feature names, not a count.
num_features = data.feature_names

In [None]:
len(num_features)

In [None]:
# num_classes / num_features hold the collections of names, so take len() to
# report the counts in the sentences below.
print(f"The wine dataset contains {len(num_classes)} classes of wine:")
print(data.target_names)
print(f"\nIt contains the {len(num_features)} features:")
print(data.feature_names)

In [None]:
# Rebinds X and y to the wine features and labels.
X, y = load_wine(return_X_y=True)

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Refit the scaler on the wine features and the classifier on the scaled result.
clf.fit(standard_scaler.fit_transform(X), y)

In [None]:
biases = clf.intercept_

In [None]:
biases

In [None]:
# Pair each intercept with the class name at the same index.
for i, bias in enumerate(biases):
    label = data.target_names[i]
    print(f"The {label} decision boundary has a bias of {bias:0.2f}")

In [None]:
# The figure is created in the same cell as the heatmap: with inline plotting a
# figure opened in an earlier cell is rendered and closed when that cell ends,
# so the 20x10 size would never reach the heatmap.
plt.figure(figsize=(20, 10))
coefficients = clf.coef_
# One heatmap column per row of coef_ (one decision boundary each) and one
# heatmap row per feature; the column labels follow coef_ instead of a fixed 3.
sns.heatmap(coefficients.T, cmap='YlGnBu', annot=True, yticklabels=data.feature_names, xticklabels=[f"Class {i} Boundary" for i in range(coefficients.shape[0])])
plt.yticks(rotation=0)
# Set after drawing, so the larger font scale applies to subsequent figures.
sns.set(font_scale=2)

In [None]:
def rank_features(class_label):
    """Print each feature name with its coefficient for one class, largest magnitude first.

    Reads the module-level fitted `clf` and the `data` bunch for feature names.
    """
    class_coefs = clf.coef_[class_label]
    # argsort is ascending, so reverse it to start from the largest magnitude.
    descending_order = np.argsort(np.abs(class_coefs))[::-1]
    for feature_index in descending_order:
        name = data.feature_names[feature_index]
        coef = class_coefs[feature_index]
        print(f"{name}: {coef:.2f}")
rank_features(0)

In [None]:
# Column position of the 'proline' feature.
index = data.feature_names.index('proline')

In [None]:
# Histogram of that column for the rows labelled 0 and the rows labelled 1.
plt.hist(X[y==0][:, index], label='Class 0')
plt.hist(X[y==1][:, index], label='Class 1', color='y')
plt.xlabel('Proline concentration')
plt.legend()

In [None]:
np.random.seed(0)
# 200 points, each with two independent normally distributed coordinates;
# rebinds X and y once more.
X = np.array([[np.random.normal(), np.random.normal()] for _ in range(200)])
# Label 1 for points whose distance from the origin is below 2, so the true
# class boundary is a circle rather than a line.
y = (np.linalg.norm(X, axis=1) < 2).astype(int)

In [None]:
clf = LogisticRegression()
clf.fit(X, y)

In [None]:
# Flat (a, b, c) vector: the first coefficient row followed by the intercept.
weights = np.hstack([clf.coef_[0], clf.intercept_])

In [None]:
# NOTE: this unpacking is repeated at the top of the next cell.
a,b,c = weights

In [None]:
a, b, c = weights
# NOTE: rebinds the name `boundary`, replacing the function defined earlier in the notebook.
boundary = lambda x: -(a * x + c) / b
# Draw the fitted linear boundary across x = -4..4, then the points by label.
plt.plot(range(-4, 5), boundary(range(-4, 5)), color='k', linestyle='--', linewidth=2, label='Decision Boundary')
for i in [0, 1]:
    plt.scatter(X[y == i][:, 0], X[y == i][:, 1], label= ['Suburban', 'Urban'][i], color=['b', 'y'][i])
plt.legend()