# Logistic Regression

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from penguin_dataset import preprocess_penguins
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

### 1. Prepare the data

In [None]:
# Load the preprocessed penguin data. The helper returns six objects, unpacked
# here as (features, labels) pairs -- presumably for the training, validation
# and test splits, going by the names.
(Xtrain, ytrain,
 Xval, yval,
 Xtest, ytest) = preprocess_penguins()

In [None]:
# Reduce the task to binary classification by dropping every row whose
# label is 'Chinstrap' from the training and validation splits.
# NOTE(review): Xtest / ytest are not filtered in this cell.
train_index = ytrain != 'Chinstrap'   # boolean mask over the training labels
val_index = yval != 'Chinstrap'       # boolean mask over the validation labels

# Apply each mask to the features and the labels of its split so they stay aligned.
Xtrain, ytrain = Xtrain[train_index], ytrain[train_index]
Xval, yval = Xval[val_index], yval[val_index]

In [None]:
# Scatter the training rows: feature column 1 on the x-axis, feature column 4
# on the y-axis, coloured by the species label.
sns.scatterplot(
    x=Xtrain[:, 1],
    y=Xtrain[:, 4],
    hue=ytrain,
)

### 2. Build a Logistic Regression model

* classify a penguin as 'Adelie' or 'Gentoo' 
* use feature columns 1 and 4 (Body Mass and Culmen Length)
* we would like to find the coefficients (a1, a2) and the intercept b of the logistic function $p = \frac{1}{1 + e^{-(a_1 x_1 + a_2 x_2 + b)}}$ that result in the most accurate model (smallest possible log loss)

In [None]:
# Restrict both splits to the two feature columns used by the model
# (columns 1 and 4 -- described in the notebook text as Body Mass and Culmen Length).
feature_cols = [1, 4]
Xtrain2 = Xtrain[:, feature_cols]
Xval2 = Xval[:, feature_cols]

In [None]:
# Bring both features onto a common scale, because body mass values are much
# larger than the other feature. The scaler is fitted on the training rows
# only; the validation rows are transformed with those same fitted parameters.
scaler = MinMaxScaler()
Xscaled = scaler.fit_transform(Xtrain2)   # fit on train, then scale train
Xscaled_val = scaler.transform(Xval2)     # reuse the training fit for validation

In [None]:
# Create a logistic regression model with default settings and train it on the
# scaled training features; fitting finds the coefficients and the intercept.
m = LogisticRegression().fit(Xscaled, ytrain)

# Show the fitted coefficients and intercept as the cell output.
m.coef_, m.intercept_

### 3. Evaluate the Model

In [None]:
# Accuracy of the fitted model on the (scaled) training data.
m.score(X=Xscaled, y=ytrain)

In [None]:
# Accuracy of the fitted model on the (scaled) validation data.
m.score(X=Xscaled_val, y=yval)

#### Exercise: Add more features and see if the accuracy improves

### 4. Probabilities

In [None]:
# Logistic regression yields a class probability for every data point:
# one row per validation sample, one column per class
# (the columns are Adelie and Gentoo).
m.predict_proba(X=Xscaled_val)