<img src="../img/titanic2.jpg">

# Welcome back to the Titanic.
## We are going to attempt to predict the Fare of each ticket
## Can we predict the price of each ticket given the other features in this dataset?


In [209]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression

### First, read in the data

In [210]:
# Load the Titanic training set from the repo-relative data folder and preview the first rows.
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,Died,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S
1,2,Lived,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,Lived,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S
3,4,Lived,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,Died,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S


## What will our target variable be for this Regression?

## What features do we currently have that are usable for a regression model?
* What do machine learning models need as inputs?

In [211]:
# Baseline estimator: a linear regression with scikit-learn's default settings.
lr = LinearRegression()

In [212]:
# List each column's dtype to see which columns are already numeric.
df.dtypes

PassengerId      int64
Survived        object
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [213]:
# Baseline design matrix: only the columns that are already numeric.
baseline_cols = ['Age', 'SibSp', 'Parch']
X = df[baseline_cols]

# Regression target: the ticket price.
y = df['Fare']

In [214]:
# Fit the baseline model on the numeric-only features.
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [215]:
# R^2 of the fitted model, evaluated on the same X, y it was trained on (no held-out split).
lr.score(X, y)

0.07158001329859276

## What processing will we need to do to create features for our model?

* What columns contain information that might be easily encoded from strings/objects to numerical data?

### Categorical Encoding
##### _OneHotEncoding / 'Dummy'_
Using [sklearn's OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) we will process our Categorical columns
* Which columns represent potential categorical variables?

In [164]:
# drop='first' omits the first category of each feature; sparse=False asks for a dense array back.
onehot = OneHotEncoder(drop='first', sparse=False)

In [165]:
# Re-check the dtypes: the object columns are the candidates for categorical encoding.
df.dtypes

PassengerId      int64
Survived        object
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# Preview the encoded values without writing them into `df`: the cells below
# build the encoded column step by step, and an extra 'survived_hot' column
# would not match the frames displayed later in the notebook.
onehot.fit_transform(df[['Survived']])[:5]

### First fit the encoder to your data.

In [175]:
# fit() learns the categories present in the column; no data is transformed yet.
onehot.fit(df[['Survived']])

OneHotEncoder(categories='auto', drop='first', dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)

### Then transform your data, examine what it is!

In [1]:
# Encode the column with the already-fitted encoder, then look at the first few rows.
survived_hot = onehot.transform(df[['Survived']])
survived_hot[:5]

NameError: name 'onehot' is not defined

### OneHotEncoder has a `.categories_` attribute
* How can we access just the category we want, given that we set `drop='first'`?
* Assign your categories to a variable

In [None]:
# categories_ holds one array per encoded column. Because drop='first' removed the
# first category, only the remaining entries label the columns of the encoded output.
survived_cats = onehot.categories_[0][1:]
survived_cats

### Using PANDAS we can create a DataFrame and add it back to our original DataFrame

In [180]:
# Wrap the encoded array in a DataFrame, using the kept category names as column labels.
survived_df = pd.DataFrame(survived_hot, columns = survived_cats)
survived_df.head()

Unnamed: 0,Lived
0,0.0
1,1.0
2,1.0
3,1.0
4,0.0


In [181]:
# Column-wise concat (axis=1) pairs rows by index label. The result is only displayed here;
# it is not assigned, so df itself is unchanged.
pd.concat([df,survived_df], axis = 1).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lived
0,1,Died,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,0.0
1,2,Lived,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0
2,3,Lived,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,1.0
3,4,Lived,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1.0
4,5,Died,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,0.0


### _Many_ sklearn encoders can fit & transform in one method.
#### Here is one cell with the whole workflow

In [182]:
# Whole workflow in one cell: build the encoder, fit + transform, label, and attach.
onehot = OneHotEncoder(drop='first', sparse=False)
survived_hot = onehot.fit_transform(df[['Survived']])
# drop='first' removed the first category, so the remaining names label the output columns.
survived_cats = onehot.categories_[0][1:]
# Reuse df's index so the column-wise concat lines the rows up by label.
survived_df = pd.DataFrame(survived_hot, columns = survived_cats, index = df.index)
# Drop any copy of these columns left by an earlier run so that re-running this
# cell replaces them instead of appending duplicate column names.
df = pd.concat([df.drop(columns = list(survived_cats), errors = 'ignore'), survived_df], axis = 1)

In [183]:
# Confirm that the encoded 'Lived' column is now part of df.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lived
0,1,Died,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,0.0
1,2,Lived,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0
2,3,Lived,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,1.0
3,4,Lived,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1.0
4,5,Died,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,0.0


### Can you repeat with the other columns?
A common acronym in programming is **DRY**:

**D**on't **R**epeat **Y**ourself 

#### Can you repeat this for all of those potential features using a loop or a function?

In [195]:
def get_hot(data, feature):
    """One-hot encode a single categorical column and append the dummy columns.

    The first category of ``feature`` is dropped (``drop='first'``), so a column
    with k distinct values yields k-1 new columns, each named after the category
    it flags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``. It is not modified in place.
    feature : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``data`` followed by the dummy columns.
        Dummy columns that already exist in ``data`` (for example from an earlier
        call) are replaced rather than duplicated, so the call is safe to repeat.
    """
    # NOTE: `sparse=False` matches the scikit-learn version this notebook was run
    # with; scikit-learn >= 1.2 names this argument `sparse_output`.
    onehot = OneHotEncoder(drop='first', sparse=False)
    hot = onehot.fit_transform(data[[feature]])
    cats = onehot.categories_[0][1:]

    # Reuse the caller's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever `data` has been filtered or re-indexed.
    hot_df = pd.DataFrame(hot, columns=cats, index=data.index)

    # Remove stale copies of the dummy columns so repeated calls do not create
    # duplicate column names. The source column itself is never dropped.
    stale = [col for col in cats if col in data.columns and col != feature]
    if stale:
        data = data.drop(columns=stale)

    return pd.concat([data, hot_df], axis=1)

In [196]:
# Categorical columns to one-hot encode, in the order they are processed.
hot_features = ['Survived', 'Pclass', 'Sex', 'Embarked']

In [197]:
# Encode each categorical column in turn, rebinding df to the widened frame on every pass.
for feature in hot_features:
    df = get_hot(data = df, feature = feature)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lived,Second,Third,male,Q,S
0,1,Died,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,0.0,0.0,1.0,1.0,0.0,1.0
1,2,Lived,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0,0.0,0.0,0.0,0.0,0.0
2,3,Lived,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,1.0,0.0,1.0,0.0,0.0,1.0
3,4,Lived,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1.0,0.0,0.0,0.0,0.0,1.0
4,5,Died,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,0.0,0.0,1.0,1.0,0.0,1.0


## Make Your Model
### Using LinearRegression

In [198]:
# Fresh estimator, so the new fit does not reuse the baseline model object.
lr = LinearRegression()

In [200]:
# Check the dtypes again: the encoded columns should show up as float64.
df.dtypes

PassengerId      int64
Survived        object
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Lived          float64
Second         float64
Third          float64
male           float64
Q              float64
S              float64
dtype: object

In [201]:
# Numeric columns from the baseline plus the one-hot columns created above.
numeric_cols = ['Age', 'SibSp', 'Parch']
encoded_cols = ['Lived', 'Second', 'Third', 'male', 'Q', 'S']
X = df[numeric_cols + encoded_cols]

# Regression target: the ticket price.
y = df['Fare']

In [202]:
# Fit on the expanded feature set (numeric + one-hot columns).
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [207]:
# R^2 on the same X, y used for fitting, so this is an in-sample score, not a held-out estimate.
lr.score(X, y)

0.4335303815890381

### Stretch goals:
* Write a data science problem for this.
* Try to improve your metrics.
* Look for other encoders in [scikit-learn's documentation](https://scikit-learn.org/stable/modules/preprocessing.html)

### Ordinal Encoding
Using [sklearn's OrdinalEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) we will process our Ordinal Columns
Ordinal Columns can be "ranked".
* Which columns might represent a ranking?

In [19]:
# Encoder that maps each category to an integer code; created with defaults and not yet fitted.
ord_enc = OrdinalEncoder()