# Linear models for classification

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Intro to classification

In [None]:
# Toy problem 1: one feature; the three smallest inputs are labelled 0, the rest 1.
x_train = np.arange(6, dtype=float)
y_train = np.repeat([0, 1], 3)

# Toy problem 2: six points with two features each, again three per class.
X_train2 = np.array([
    [0.5, 1.5],
    [1.0, 1.0],
    [1.5, 0.5],
    [3.0, 0.5],
    [2.0, 2.0],
    [1.0, 2.5],
])
y_train2 = np.repeat([0, 1], 3)

In [None]:
from helpers.lab_utils_common_lc import dlc, plot_data
from helpers.plt_one_addpt_onclick import plt_one_addpt_onclick

plt.style.use('helpers/deeplearning.mplstyle')

# Boolean masks picking out the y=1 and y=0 examples of the one-feature data.
pos = y_train == 1
neg = y_train == 0

fig, ax = plt.subplots(1, 2, figsize=(8, 3))
single_var_ax, two_var_ax = ax

# Left panel: the single feature on the x-axis, the class label on the y-axis.
single_var_ax.scatter(x_train[pos], y_train[pos], marker='x', s=80, c='red', label="y=1")
single_var_ax.scatter(
    x_train[neg], y_train[neg],
    marker='o', s=100, label="y=0",
    facecolors='none', edgecolors=dlc["dlblue"], lw=3,
)
single_var_ax.set_title('one variable plot')
single_var_ax.set_xlabel('x', fontsize=12)
single_var_ax.set_ylabel('y', fontsize=12)
single_var_ax.set_ylim(-0.08, 1.1)
single_var_ax.legend()

# Right panel: the two-feature data, drawn by the project helper.
plot_data(X_train2, y_train2, two_var_ax)
two_var_ax.axis([0, 4, 0, 4])
two_var_ax.set_title('two variable plot')
two_var_ax.set_xlabel('$x_0$', fontsize=12)
two_var_ax.set_ylabel('$x_1$', fontsize=12)
two_var_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
%matplotlib widget
w_in = np.zeros((1))
b_in = 0
plt.close('all') 
addpt = plt_one_addpt_onclick( x_train,y_train, w_in, b_in, logistic=False)

## Logistic Regression

In [None]:
# Close the figure from the previous demo, then rebuild it with logistic=True
# using the same data and the same initial parameters.
plt.close("all")
addpt = plt_one_addpt_onclick(x_train, y_train, w_in, b_in, logistic=True)

In [None]:
from helpers.datasets import make_forge
from helpers.plot_2d_separator import plot_2d_separator
from helpers.plot_helpers import discrete_scatter

from sklearn.linear_model import LogisticRegression
%matplotlib inline

X, y = make_forge()
model = LogisticRegression()
clf = model.fit(X, y)

fig, axes = plt.subplots(1, 2, figsize=(10, 3))

for model, ax in zip([LogisticRegression(), LogisticRegression(penalty="none")], axes):
    clf = model.fit(X, y)
    plot_2d_separator(clf, X, fill=False, eps=0.5,ax=ax, alpha=.7)
    discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title(f"{clf.__class__.__name__}_{clf.penalty}")
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
    
axes[0].legend()
plt.show()

**Types of Logistic Regression**:
- `Binary Logistic Regression`: The target variable has only two possible outcomes such as Spam or Not Spam, Cancer or No Cancer.
- `Multinomial Logistic Regression`: The target variable has three or more nominal categories such as predicting the type of Wine.
- `Ordinal Logistic Regression`: The target variable has three or more ordinal categories such as restaurant or product rating from 1 to 5.

**Advantages**:
- Because of its efficient and straightforward nature, it doesn't require high computation power, is easy to implement, easily interpretable, and used widely by data analysts and scientists.
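- Also, it doesn't strictly require feature scaling, although scaling speeds up solver convergence and matters when regularization is used (which is the scikit-learn default).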
- Logistic regression provides a probability score for observations.

**Disadvantages**:
- Logistic regression is **not able to handle a large number of categorical** features/variables.
- It is **vulnerable to overfitting**.
- Also, logistic regression **can't solve non-linear problems** on its own, which is why it requires a transformation of the features (for example, polynomial features).
- Logistic regression will **not perform well with independent variables that are not correlated with the target variable**, or that are very similar to (highly correlated with) each other.


## Regularization for Logistic Regression

In [None]:
from sklearn.datasets import load_breast_cancer
# train_test_split is imported here because this is the first cell that calls it;
# without this import a fresh kernel (Restart & Run All) fails with a NameError.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

cancer = load_breast_cancer()

# Stratified split with a fixed seed so the class balance and the split are repeatable.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline model with the default regularisation strength (C=1).
logreg = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)

print(f"Training set score: {logreg.score(X_train_scaled, y_train):.3f}")
print(f"Test set score: {logreg.score(X_test_scaled, y_test):.3f}")

In [None]:
# Same model with C=100 (C is the inverse of the regularisation strength).
logreg100 = LogisticRegression(C=100, max_iter=10000)
logreg100.fit(X_train_scaled, y_train)

train_score_c100 = logreg100.score(X_train_scaled, y_train)
test_score_c100 = logreg100.score(X_test_scaled, y_test)
print(f"Training set score: {train_score_c100:.3f}")
print(f"Test set score: {test_score_c100:.3f}")

In [None]:
# Same model with C=0.01 (C is the inverse of the regularisation strength).
logreg001 = LogisticRegression(C=0.01, max_iter=10000)
logreg001.fit(X_train_scaled, y_train)

train_score_c001 = logreg001.score(X_train_scaled, y_train)
test_score_c001 = logreg001.score(X_test_scaled, y_test)
print(f"Training set score: {train_score_c001:.3f}")
print(f"Test set score: {test_score_c001:.3f}")

In [None]:
# Sweep the inverse regularisation strength C and record the accuracy on both splits.
c_values = [0.01, 0.1, 0.5, 1, 10, 100, 500, 1000, 5000]

training_scores = {}
testing_scores = {}

for c in c_values:
    logreg_diff = LogisticRegression(C=c, max_iter=100000).fit(X_train_scaled, y_train)
    training_scores[c] = logreg_diff.score(X_train_scaled, y_train)
    testing_scores[c] = logreg_diff.score(X_test_scaled, y_test)

# Pass plain lists rather than dict views so plotting does not rely on implicit conversion.
plt.plot(list(training_scores), list(training_scores.values()), c="blue", label="training")
plt.plot(list(testing_scores), list(testing_scores.values()), c="red", label="testing")
plt.xscale('log')  # the C values span several orders of magnitude
plt.xlabel("C (log scale)")
plt.ylabel("Accuracy")
plt.title("Logistic regression accuracy vs. C")
plt.legend()
plt.show()

In [None]:
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# Fit on the unscaled features with three regularisation strengths.
logreg = LogisticRegression(max_iter=10000).fit(X_train, y_train)
logreg100 = LogisticRegression(max_iter=10000, C=100).fit(X_train, y_train)
logreg001 = LogisticRegression(max_iter=10000, C=0.01).fit(X_train, y_train)

# One marker per coefficient; each label must match the C the model was fitted with.
plt.plot(logreg.coef_.T, 'o', label="C=1")
plt.plot(logreg100.coef_.T, '^', label="C=100")
plt.plot(logreg001.coef_.T, 'v', label="C=0.01")
plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
plt.hlines(0, 0, cancer.data.shape[1])  # zero reference line
plt.ylim(-5, 5)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.legend()
plt.show()

In [None]:
# L1-penalised models at three values of C, each drawn with its own marker.
l1_settings = [(0.001, 'o'), (1, '^'), (100, 'v')]

for C, marker in l1_settings:
    lr_l1 = LogisticRegression(C=C, penalty="l1", solver="liblinear")
    lr_l1.fit(X_train_scaled, y_train)

    train_accuracy = lr_l1.score(X_train_scaled, y_train)
    print(f"Training accuracy of l1 logreg with C={C:.3f}: {train_accuracy:.2f}")
    test_accuracy = lr_l1.score(X_test_scaled, y_test)
    print(f"Test accuracy of l1 logreg with C={C:.3f}: {test_accuracy:.2f}")

    plt.plot(lr_l1.coef_.T, marker, label=f"C={C:.3f}")

n_features = cancer.data.shape[1]
plt.xticks(range(n_features), cancer.feature_names, rotation=90)
plt.hlines(0, 0, n_features)  # zero reference line
plt.ylim(-5, 5)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.legend(loc=3)
plt.show()

## Uncertainty Estimates from Classifiers

In [None]:
from sklearn.linear_model import LogisticRegression
from helpers.plot_helpers import discrete_scatter

# Rebuild the forge data with the project helper; X and y were reused for other
# data by earlier cells, so they are rebound here for this section.
X, y = make_forge()

In [None]:
# Scatter the two features against each other, grouped by the label y.
discrete_scatter(X[:, 0], X[:, 1], y, markers='o')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# we rename the classes "blue" and "red" for illustration purposes
# (label 0 becomes "blue", label 1 becomes "red")
y_named = np.array(["blue", "red"])[y]

# we can call train_test_split with arbitrarily many arrays;
# all will be split in a consistent manner
X_train, X_test, y_train_named, y_test_named, y_train, y_test = train_test_split(X, y_named, y, random_state=0)

# build the logistic regression model and fit it on the string-labelled targets
lc = LogisticRegression(random_state=0)
lc.fit(X_train, y_train_named)

### The Decision Function

In [None]:
# Compare the shape of the test features with the shape of the decision function output.
print(f"X_test.shape: {X_test.shape}")
print(f"Decision function shape: {lc.decision_function(X_test).shape}")

In [None]:
# show the decision_function value of every test sample (no slicing is applied)
lc.decision_function(X_test)

In [None]:
# Boolean mask: True where the decision function is positive.
lc.decision_function(X_test) > 0

In [None]:
# Predicted class names for the test samples, for comparison with the sign mask above.
lc.predict(X_test)

In [None]:
# Class labels the fitted classifier knows about, in the order used for indexing below.
lc.classes_

In [None]:
# make the boolean True/False into 0 and 1
greater_zero = (lc.decision_function(X_test) > 0).astype(int)

# use 0 and 1 as indices into classes_
pred = lc.classes_[greater_zero]

# pred is the same as the output of lc.predict
print(f"pred is equal to predictions: {np.all(pred == lc.predict(X_test))}")

In [None]:
# Report the range of decision-function values observed on the test set.
decision_function = lc.decision_function(X_test)
lowest = np.min(decision_function)
highest = np.max(decision_function)

print(f"Decision function minimum: {lowest:.2f} maximum: {highest:.2f}")

In [None]:
from helpers import tools
from helpers import plot_helpers

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
boundary_ax, scores_ax = axes

# Left: the separator produced by the helper; right: the scores image from the helper's default function.
tools.plot_2d_separator(lc, X, ax=boundary_ax, alpha=.4, fill=True, cm=plot_helpers.cm2)
scores_image = tools.plot_2d_scores(lc, X, ax=scores_ax, alpha=.4, cm=plot_helpers.ReBl)

# Overlay the data on both panels. Test points go first so the legend labels
# below line up with the order in which the artists were added.
for ax in axes:
    discrete_scatter(X_test[:, 0], X_test[:, 1], y_test, markers='^', ax=ax)
    discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, markers='o', ax=ax)
    ax.set(xlabel="Feature 0", ylabel="Feature 1")

plt.rcParams['axes.grid'] = False
cbar = plt.colorbar(scores_image, ax=axes.tolist())

legend_labels = ["Test class 0", "Test class 1", "Train class 0", "Train class 1"]
boundary_ax.legend(legend_labels, ncol=4, loc=(.1, 1.1))
plt.show()

### Predicting Probabilities

In [None]:
# Shape of the predicted-probability array for the test samples.
lc.predict_proba(X_test).shape

In [None]:
# Predicted probabilities for every test sample (one row per sample).
lc.predict_proba(X_test)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
left_ax, right_ax = axes

# Left: the separator; right: the scores image computed with function='predict_proba'.
tools.plot_2d_separator(lc, X, ax=left_ax, alpha=.4, fill=True, cm=plot_helpers.cm2)
scores_image = tools.plot_2d_scores(
    lc, X, ax=right_ax, alpha=.5, cm=plot_helpers.ReBl, function='predict_proba'
)

# Test points are drawn before training points on each panel; the legend
# labels below depend on that order.
for ax in axes:
    discrete_scatter(X_test[:, 0], X_test[:, 1], y_test, markers='^', ax=ax)
    discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, markers='o', ax=ax)
    ax.set(xlabel="Feature 0", ylabel="Feature 1")

plt.rcParams['axes.grid'] = False
cbar = plt.colorbar(scores_image, ax=axes.tolist())
left_ax.legend(
    ["Test class 0", "Test class 1", "Train class 0", "Train class 1"],
    ncol=4,
    loc=(.1, 1.1),
)
plt.show()

## Binary Logistic Regression in Scikit-learn

In [None]:
# Column names applied to the CSV below; 'label' is the column later used as the target.
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# header=None tells pandas that the file has no header row, so every line is read as data
# and col_names supplies the column names.
# NOTE(review): if data/diabetes.csv does contain a header row, it will end up as the
# first data row and the columns will not be numeric — confirm against the file.
pima = pd.read_csv("data/diabetes.csv", header=None, names=col_names)

In [None]:
# Preview the first rows to check that the columns were read as expected.
pima.head()

In [None]:
#split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols] # Features
y = pima["label"] # Target variable

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)

In [None]:
# Estimator and scaler classes used in this cell.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Default hyper-parameters apart from the fixed random_state.
logreg = LogisticRegression(random_state=16)
logreg.fit(X_train_scaled, y_train)

# The cell's output is the score on the held-out split.
logreg.score(X_test_scaled, y_test)

## Linear models for multiclass classification

In [None]:
from sklearn.datasets import make_blobs
# NOTE: this rebinds discrete_scatter, which earlier cells imported from helpers.plot_helpers.
from helpers.tools import discrete_scatter

# Synthetic blob data with a fixed seed; X and y are rebound for the multiclass section.
X, y = make_blobs(random_state=42)

discrete_scatter(X[:, 0], X[:, 1], y)

plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
# Legend labels are given explicitly, one per class drawn by discrete_scatter.
plt.legend(["Class 0", "Class 1", "Class 2"])
plt.show()

In [None]:
# Fit a default LogisticRegression on the blob data and inspect the shapes of
# the learned parameters (the later cells iterate over coef_ and intercept_ in step).
linear_lr = LogisticRegression().fit(X, y)

print("Coefficient shape: ", linear_lr.coef_.shape)
print("Intercept shape: ", linear_lr.intercept_.shape)

In [None]:
discrete_scatter(X[:, 0], X[:, 1], y)

# Feature-0 values over which each class's line is evaluated.
line = np.linspace(-15, 15)

# For a row (w0, w1) of coef_ with intercept b, the line w0*x0 + w1*x1 + b = 0
# is drawn as x1 = -(w0*x0 + b) / w1.
class_params = zip(linear_lr.coef_, linear_lr.intercept_, ['b', 'r', 'g'])
for coef, intercept, color in class_params:
    feature1_on_line = -(line * coef[0] + intercept) / coef[1]
    plt.plot(line, feature1_on_line, c=color)

plt.xlim(-10, 8)
plt.ylim(-10, 15)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1','Line class 2'], loc=(1.01, 0.3))
plt.show()

In [None]:
from helpers.plot_2d_separator import plot_2d_classification

# Classification plot from the helper first, then the data points on top.
plot_2d_classification(linear_lr, X, fill=True, alpha=.7)
discrete_scatter(X[:, 0], X[:, 1], y)

# Overlay one line per class, where w0*x0 + w1*x1 + b = 0.
line = np.linspace(-15, 15)
line_colors = ['b', 'r', 'g']
for coef, intercept, color in zip(linear_lr.coef_, linear_lr.intercept_, line_colors):
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)

plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1', 'Line class 2'], loc=(1.01, 0.3))
plt.show()

> All classifiers in scikit-learn do multiclass classification out-of-the-box. You don’t need to use the sklearn.multiclass module unless you want to experiment with different multiclass strategies.

## Uncertainty in Multiclass Classification

In [None]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42)

# Fit the min-max scaler on the training split only and reuse it for the test split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Multiclass logistic regression on the scaled features; the fitted estimator is the cell output.
lr = LogisticRegression(random_state=0)
lr.fit(X_train_scaled, y_train)

In [None]:
# Shape of the decision-function output on the scaled test set.
lr.decision_function(X_test_scaled).shape

In [None]:
# Decision-function values for the first six test samples, all columns.
lr.decision_function(X_test_scaled)[:6, :]

In [None]:
# Column index of the largest decision-function value for each test sample.
np.argmax(lr.decision_function(X_test_scaled), axis=1)

In [None]:
# Predicted labels, shown for comparison with the argmax of the decision function above.
lr.predict(X_test_scaled)

In [None]:
# Probabilities for the first six test samples, computed once and reused below.
proba_head = lr.predict_proba(X_test_scaled)[:6]
print(f"Predicted probabilities:\n{proba_head}")

# Row sums of those six probability rows.
print(f"Sums: {proba_head.sum(axis=1)}")

In [None]:
# Column index of the highest predicted probability for each test sample.
np.argmax(lr.predict_proba(X_test_scaled), axis=1)

In [None]:
# Predicted labels, shown for comparison with the argmax of predict_proba above.
lr.predict(X_test_scaled)

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE: this rebinds logreg, which an earlier cell used for a different model.
logreg = LogisticRegression(max_iter=1000)

# represent each target by its class name in the iris dataset
named_target = iris.target_names[y_train]

logreg.fit(X_train_scaled, named_target)

print(f"unique classes in training data: {logreg.classes_}")
print(f"predictions: {logreg.predict(X_test_scaled)[:10]}")

# argmax returns integer column indices, not class names; indexing classes_ with
# those indices gives values in the same form as the predictions printed above.
argmax_dec_func = np.argmax(logreg.decision_function(X_test_scaled), axis=1)
print(f"argmax of decision function: {argmax_dec_func[:10]}")
print(f"argmax combined with classes_: {logreg.classes_[argmax_dec_func][:10]}")

## Multinomial Logistic Regression in Scikit-learn

- [OpenML](https://www.openml.org/): A worldwide machine learning lab
- [Kaggle](https://www.kaggle.com/)

In [None]:
from sklearn.datasets import load_digits
# Load scikit-learn's bundled digits dataset; later cells read digits.data and digits.target.
digits = load_digits()

In [None]:
# Print to show there are 1797 images (8 by 8 images for a dimensionality of 64);
# each row of digits.data is one flattened image
print("Image Data Shape" , digits.data.shape)

# Print to show there are 1797 labels (integers from 0-9), one per image
print("Label Data Shape", digits.target.shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Show the first five digits side by side, each titled with its label.
plt.figure(figsize=(20, 4))
for index in range(5):
    image = digits.data[index]
    label = digits.target[index]
    plt.subplot(1, 5, index + 1)
    # Each sample is a flat vector; reshape it to 8x8 for display.
    plt.imshow(image.reshape(8, 8), cmap=plt.cm.gray)
    plt.title('Training: %i\n' % label, fontsize=20)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the digits for testing, with a fixed seed.
# NOTE: this rebinds x_train and y_train, which earlier cells used for other data.
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=0)

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised well above the default to give the solver room to converge.
logisticRegr = LogisticRegression(max_iter=10000)
# Fit on the training split; the fitted estimator is the cell's displayed output.
logisticRegr.fit(x_train, y_train)

In [None]:
# Use score method to get accuracy of model on the held-out test split
# (for scikit-learn classifiers, score() returns the mean accuracy)
score = logisticRegr.score(x_test, y_test)
print(score)