# DS-SF-38 | 12 | Logistic Regression | Codealong | Starter Code

In [None]:
import os

import numpy as np
import pandas as pd
# Keep rendered frames compact: at most 10 rows and 10 columns, shown with the HTML repr.
pd.options.display.max_rows = 10
pd.options.display.notebook_repr_html = True
pd.options.display.max_columns = 10

from sklearn import linear_model, metrics

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Part A | Logistic Regression

In [None]:
# Load the iris data; the path is relative and goes up one directory into `datasets`.
df = pd.read_csv(
    filepath_or_buffer = os.path.join('..', 'datasets', 'dataset-12-iris.csv')
)

In [None]:
# Show the loaded frame; the display options set at the top cap how many rows/columns render.
df

### Feature matrix and response vector

In [None]:
# Feature matrix: the four sepal/petal measurement columns, in this order.
X = df.loc[:, ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]

In [None]:
# Inspect the feature matrix.
X

In [None]:
# Response vector: the `Species` column of the data.
c = df['Species']

In [None]:
# Inspect the response vector.
c

## Activity | Create one-hot encoded (binary) variables for the outcome variable

In [None]:
# TODO: build `cs`, a DataFrame with one binary indicator column per species in `c`
# (e.g. via `pd.get_dummies(c)`). The next cell reads `cs.Setosa`, `cs.Versicolor`
# and `cs.Virginica`, so those column names must exist.

In [None]:
# One response vector per species, pulled from `cs`
# (`cs` is expected to be created in the activity cell above).
c_Setosa = cs['Setosa']
c_Versicolor = cs['Versicolor']
c_Virginica = cs['Virginica']

## Activity | Run logistic regression to learn whether or not an iris plant is a `Setosa`

- [`sklearn.linear_model.LogisticRegression` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# TODO: fit a `linear_model.LogisticRegression` on `X` and `c_Setosa` and keep it as
# `model_Setosa`; a later cell calls `model_Setosa.predict_proba(X)`.

> ## How good is the model?

In [None]:
# TODO: assess `model_Setosa` (e.g. its `.score(...)` and a crosstab of predicted vs.
# true class) and store its predictions on `X` as `c_hat_Setosa`; later cells read
# `c_hat_Setosa` when combining the three models.

## Versicolor

In [None]:
# Logistic regression (default settings) of the Versicolor indicator on the features.
# `fit` returns the estimator itself, so the name ends up bound to the fitted model.
model_Versicolor = linear_model.LogisticRegression()
model_Versicolor = model_Versicolor.fit(X, c_Versicolor)

In [None]:
# Accuracy of the Versicolor model on the same rows it was fitted on.
model_Versicolor.score(X = X, y = c_Versicolor)

In [None]:
# Hard class predictions for every row of X.
c_hat_Versicolor = model_Versicolor.predict(X)

# Predicted class down the rows, true class across the columns.
pd.crosstab(index = c_hat_Versicolor,
    columns = c_Versicolor,
    rownames = ['Hypothesized Class'],
    colnames = ['True Class'])

## Virginica

In [None]:
# Logistic regression (default settings) of the Virginica indicator on the features.
# `fit` returns the estimator itself, so the name ends up bound to the fitted model.
model_Virginica = linear_model.LogisticRegression()
model_Virginica = model_Virginica.fit(X, c_Virginica)

In [None]:
# Accuracy of the Virginica model on the same rows it was fitted on.
model_Virginica.score(X = X, y = c_Virginica)

In [None]:
# Hard class predictions for every row of X.
c_hat_Virginica = model_Virginica.predict(X)

# Predicted class down the rows, true class across the columns.
pd.crosstab(index = c_hat_Virginica,
    columns = c_Virginica,
    rownames = ['Hypothesized Class'],
    colnames = ['True Class'])

## Activity | Let's combine the models together!

In [None]:
# Side-by-side hard predictions of the three per-species models, one column each.
hats = pd.DataFrame(dict(
    Setosa = c_hat_Setosa,
    Versicolor = c_hat_Versicolor,
    Virginica = c_hat_Virginica))

In [None]:
# Column totals: with 0/1 (or boolean) predictions, the number of rows each model claims.
hats.sum(axis = 0)

In [None]:
# Grand total of the per-model column totals.
hats.sum(axis = 0).sum()

In [None]:
# Per-row total across the three models, then how often each total occurs.
hats.sum(axis = 'columns').value_counts()

In [None]:
# Joint table of the three models' predictions: Setosa on the rows,
# (Versicolor, Virginica) combinations on the columns.
pd.crosstab(index = hats['Setosa'],
    columns = [hats['Versicolor'], hats['Virginica']])

In [None]:
# Second column of `predict_proba` for each per-species model, as a Series.
p_hat_Setosa = pd.Series(model_Setosa.predict_proba(X)[:, 1])
p_hat_Versicolor = pd.Series(model_Versicolor.predict_proba(X)[:, 1])
p_hat_Virginica = pd.Series(model_Virginica.predict_proba(X)[:, 1])

# NOTE: this rebinds `hats`, replacing the frame of hard predictions built earlier.
hats = pd.DataFrame(dict(
    p_Setosa = p_hat_Setosa,
    p_Versicolor = p_hat_Versicolor,
    p_Virginica = p_hat_Virginica))

# Largest of the three probabilities in each row.
hats['p'] = hats.max(axis = 1)

# Label each row with the species whose probability equals the row maximum.
# Assignment order matters: on an exact tie the later assignment wins.
hats['c'] = 'Setosa'
hats.loc[hats['p_Versicolor'].eq(hats['p']), 'c'] = 'Versicolor'
hats.loc[hats['p_Virginica'].eq(hats['p']), 'c'] = 'Virginica'

In [None]:
# Inspect the Setosa model's predicted probabilities.
p_hat_Setosa

In [None]:
# Inspect the combined frame: per-species probabilities, row maximum `p` and chosen label `c`.
hats

In [None]:
# Share of rows where the combined label matches the true species.
(hats['c'] == c).mean()

In [None]:
# Number of rows where the combined label matches the true species.
(hats['c'] == c).sum()

In [None]:
# Number of rows where the combined label differs from the true species.
(hats['c'] != c).sum()

In [None]:
# Combined label down the rows, true species across the columns.
pd.crosstab(
    index = hats['c'],
    columns = c,
    rownames = ['Hypothesized Class'],
    colnames = ['True Class'])

## Logistic Regression with `statsmodels`

- [`statsmodels` `Logit.from_formula` documentation](https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Logit.from_formula.html)
- [`statsmodels` `Logit` documentation](https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Logit.html)

## Multiclass Logistic Regression with `sklearn`

In [None]:
# Rebuild the inputs so this section does not rely on the cells above:
# the same four measurement columns and the raw (multi-valued) species label.
X = df.loc[:, ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]
c = df['Species']

# A single logistic regression fitted directly on the species label.
# `fit` returns the estimator itself, so `model` ends up bound to the fitted model.
model = linear_model.LogisticRegression()
model = model.fit(X, c)

In [None]:
# Accuracy of the multiclass model on the same rows it was fitted on.
model.score(X = X, y = c)

In [None]:
# Predicted species for every row of X.
c_hat = model.predict(X)

# Predicted species down the rows, true species across the columns.
pd.crosstab(index = c_hat,
    columns = c,
    rownames = ['Hypothesized Class'],
    colnames = ['True Class'])

## Part B | ROC/AUC

### Setosa

In [None]:
# ROC curve for the Setosa model, computed from its predicted probabilities.
fpr, tpr, thresholds = metrics.roc_curve(c_Setosa, p_hat_Setosa)

# Draw on an explicit figure/axes pair rather than pyplot's implicit "current axes".
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label = 'ROC curve (area = %0.2f)' % metrics.auc(fpr, tpr))
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
# Upper limit slightly above 1 so points at exactly 1.0 are not drawn on the frame.
ax.set_xlim([0, 1.01])
ax.set_ylim([0, 1.01])
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title("Setosa's ROC")
# The trailing ';' keeps the Legend object's repr out of the cell output.
ax.legend(loc = 'lower right');

In [None]:
# Score with the predicted probabilities (the same input the ROC curve uses), not the
# thresholded class predictions: hard labels collapse the curve to a single operating
# point, so the reported AUC would not match the area shown in the plot.
metrics.roc_auc_score(c_Setosa, p_hat_Setosa)

### Versicolor

In [None]:
# ROC curve for the Versicolor model, computed from its predicted probabilities.
fpr, tpr, thresholds = metrics.roc_curve(c_Versicolor, p_hat_Versicolor)

# Draw on an explicit figure/axes pair rather than pyplot's implicit "current axes".
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label = 'ROC curve (area = %0.2f)' % metrics.auc(fpr, tpr))
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
# Upper limit slightly above 1 so points at exactly 1.0 are not drawn on the frame.
ax.set_xlim([0, 1.01])
ax.set_ylim([0, 1.01])
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title("Versicolor's ROC")
# The trailing ';' keeps the Legend object's repr out of the cell output.
ax.legend(loc = 'lower right');

In [None]:
# Score with the predicted probabilities (the same input the ROC curve uses), not the
# thresholded class predictions: hard labels collapse the curve to a single operating
# point, so the reported AUC would not match the area shown in the plot.
metrics.roc_auc_score(c_Versicolor, p_hat_Versicolor)

### Virginica

In [None]:
# ROC curve for the Virginica model, computed from its predicted probabilities.
fpr, tpr, thresholds = metrics.roc_curve(c_Virginica, p_hat_Virginica)

# Draw on an explicit figure/axes pair rather than pyplot's implicit "current axes".
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label = 'ROC curve (area = %0.2f)' % metrics.auc(fpr, tpr))
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
# Upper limit slightly above 1 so points at exactly 1.0 are not drawn on the frame.
ax.set_xlim([0, 1.01])
ax.set_ylim([0, 1.01])
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title("Virginica's ROC")
# The trailing ';' keeps the Legend object's repr out of the cell output.
ax.legend(loc = 'lower right');

In [None]:
# Score with the predicted probabilities (the same input the ROC curve uses), not the
# thresholded class predictions: hard labels collapse the curve to a single operating
# point, so the reported AUC would not match the area shown in the plot.
metrics.roc_auc_score(c_Virginica, p_hat_Virginica)