<img src="img/titanic2.jpg">

# Welcome back to the Titanic.
## WHO LIVES, WHO DIES
## Can we predict the odds of surival using metadata about the passengers?


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression

### First, read in the data

In [7]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,Died,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S
1,2,Lived,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,Lived,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S
3,4,Lived,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,Died,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S


## What will our target variable be for this Regression?
### Is this a continuous or discrete variable? Is this Linear or classification?

In [8]:
df['Survived'].unique()

array(['Died', 'Lived'], dtype=object)

In [9]:
df['Survived'] = df['Survived'].map({'Died': 0, 'Lived': 1})

In [10]:
df['Survived'].unique()

array([0, 1])

## What features do we currently have that are usable for a regression model?
* What do machine learning models need as inputs?

In [11]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [12]:
lr = LogisticRegression()

In [21]:
X = df[['Fare', 'SibSp','Parch', 'Age']]
y = df['Survived']

### Train/Test Split!
#### Gives you the ability to evaluate fit

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 737)

### Fit it & Score It

In [23]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### What metric does `.score()` output for Logistic Regression?

In [24]:
#training score
lr.score(X_train, y_train)

0.6910112359550562

In [25]:
#test score
lr.score(X_test, y_test)

0.6966292134831461

In [26]:
#cross val score
cross_val_score(lr, X_train, y_train, cv = 5).mean()

0.6909539763710104

## What processing will we need to do to create features for our model?

* What columns contain information that might be easily encoded from strings/objects to numerical data?

In [27]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Categorical Encoding
##### _OneHotEncoding / 'Dummy'_
Using [sklearn's OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) we will process our Categorical columns
* Which columns represent potential categorical variables?

In [28]:
onehot = OneHotEncoder(drop='first', sparse=False)

In [29]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### First fit the encoder to your data.

In [30]:
onehot.fit(df[['Pclass']])

OneHotEncoder(categories='auto', drop='first', dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)

### Then transform your data, examine what is is!

In [31]:
pclass_hot = onehot.transform(df[['Pclass']])
pclass_hot

array([[0., 1.],
       [0., 0.],
       [0., 1.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 1.]])

### OneHotEncoder has a `.categories_` attribute
* How can we access just the category we want, given that we set `drop_first = True`
* Assign your categories to a variable

In [36]:
onehot.categories_

[array(['First', 'Second', 'Third'], dtype=object)]

In [38]:
pclass_cats = onehot.categories_[0][1:]

### Using PANDAS we can create a DataFrame and add it back to our original DataFrame

In [39]:
pclass_df = pd.DataFrame(pclass_hot, columns = pclass_cats)
pclass_df.head()

Unnamed: 0,Second,Third
0,0.0,1.0
1,0.0,0.0
2,0.0,1.0
3,0.0,0.0
4,0.0,1.0


In [40]:
pd.concat([df,pclass_df], axis = 1).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Second,Third
0,1,0,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,0.0,1.0
1,2,1,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0,0.0
2,3,1,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,0.0,1.0
3,4,1,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0,0.0
4,5,0,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,0.0,1.0


### _Many_ sklearn encoders can fit & transform in one method.
#### Here is one cell with the whole workflow

In [43]:
onehot = OneHotEncoder(drop='first', sparse=False)
pclass_hot = onehot.fit_transform(df[['Pclass']])
pclass_cats = onehot.categories_[0][1:]
pclass_df = pd.DataFrame(pclass_hot, columns = pclass_cats)
df = pd.concat([df,pclass_df], axis = 1)

In [44]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Second,Third
0,1,0,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,0.0,1.0
1,2,1,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0,0.0
2,3,1,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,0.0,1.0
3,4,1,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0,0.0
4,5,0,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,0.0,1.0


### Can you repeat with the other columns?
A common acronymn in programming is **DRY**:

**D**on't **R**epeat **Y**ourself 

#### Can you repeat this for all of those potential features using a loop or a function?

In [45]:
def get_hot(data, feature):
    onehot = OneHotEncoder(drop='first', sparse=False)
    hot = onehot.fit_transform(data[[feature]])
    cats = onehot.categories_[0][1:]
    hot_df = pd.DataFrame(hot, columns = cats)
    data = pd.concat([data,hot_df], axis = 1)
    return(data)

In [46]:
hot_features = ['Pclass',
               'Sex',
               'Embarked']

In [47]:
for feature in hot_features:
    df = get_hot(data = df, feature = feature)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Second,Third,Second.1,Third.1,male,Q,S
0,1,0,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,0.0,1.0,0.0,1.0,1.0,0.0,1.0
1,2,1,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,4,1,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,0,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,0.0,1.0,0.0,1.0,1.0,0.0,1.0


## Make Your Model
### Using LinearRegression

In [48]:
lr = LogisticRegression(max_iter = 10_000)

In [49]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Second         float64
Third          float64
Second         float64
Third          float64
male           float64
Q              float64
S              float64
dtype: object

In [52]:
X = df[['Age', 'SibSp','Parch','Second','Third','male','Q','S','Fare']]
y = df['Survived']

### Train/Test Split!

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 737)

### Fit it & Score It

In [55]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
#training score
lr.score(X_train, y_train)

0.7902621722846442

In [57]:
#test score
lr.score(X_test, y_test)

0.7865168539325843

In [58]:
#cross val score
cross_val_score(lr, X_train, y_train, cv = 5).mean()

0.7883971080938107

### Stretch goals:
* Write a data science problem for this.
* Try to improve your metrics.
* Look for other encoders in [scikit-learn's documentation](https://scikit-learn.org/stable/modules/preprocessing.html)

### Ordinal Encoding
Using [sklearn's OrdinalEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) we will process our Ordinal Columns
Ordinal Columns can be "ranked".
* Which columns might represent a ranking?

In [19]:
ord_enc = OrdinalEncoder()