# make_pipelines (Titanic Dataset)

In [1]:
import numpy as np # for linear algebra
import pandas as pd # for data manipulation
# Show floating-point values with two decimal places in displayed pandas output.
pd.set_option("display.precision", 2)
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

## Acquire Data

In [2]:
# Read both CSV splits from the shared data directory (paths are relative to the notebook).
train_path = '../data/titanic_train.csv'
test_path = '../data/titanic_test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [3]:
# Preview the first three rows of the training split.
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S


In [4]:
# Preview the first three rows of the test split.
test_df.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q


## Model, predict and solve (only first-level predictions)

In [5]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = train_df["Survived"]
X = train_df.drop(columns="Survived")

# Hold out 15% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Work on a copy so the loaded test frame itself is left untouched.
X_test = test_df.copy()

# Shapes of every split, shown as a quick sanity check.
tuple(split.shape for split in (X_train, y_train, X_val, y_val, X_test))

((757, 11), (757,), (134, 11), (134,), (418, 11))

First, we want to **split numeric and categorical features** to make two different pipelines.

In [6]:
# Columns routed through the numeric pipeline (imputation + scaling).
numeric_features = [
    'Pclass',
    'Age',
    'Fare',
    'SibSp',
    'Parch',
]

# Columns routed through the categorical pipeline (imputation + one-hot encoding).
categorical_features = [
    'Sex',
    'Embarked',
]

- **Standardization**:

    *3 main scalers*
    - MinMaxScaler: rescales each feature to a given range (default [0, 1]). Sensitive to outliers.
    - StandardScaler: standardize features by removing the mean and scaling to unit variance. Sensitive to outliers.
    - RobustScaler: removes the median and scales the data according to the quantile range. Robust to outliers.
    
  *5 other scalers*
    - minmax_scale
    - MaxAbsScaler
    - Normalizer
    - QuantileTransformer
    - PowerTransformer
    
```python3
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler # We're only going to use StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
```

- **SimpleImputer strategies**

If **“mean”**, then replace missing values using the mean along each column. Can only be used with numeric data.

If **“median”**, then replace missing values using the median along each column. Can only be used with numeric data.

If **“most_frequent”**, then replace missing values using the most frequent value along each column. Can be used with strings or numeric data.

If **“constant”**, then replace missing values with fill_value. Can be used with strings or numeric data.

- **Encoder for categorical features**:


1. if the categorical features are ordinal, use an ordinal encoder (`OrdinalEncoder`; `LabelEncoder` is intended for target labels);
1. if there is no ordinal relation, use a one-hot encoder (`OneHotEncoder`).

In [7]:
# Shared building blocks for both preprocessing pipelines.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
# Numeric columns: fill missing values, then standardize to zero mean / unit variance.
# NOTE(review): 'most_frequent' imputes with the column mode; for continuous columns
# such as Age and Fare, 'median' may be a better fit. Confirm before changing,
# because it would alter every score computed from these pipelines.
numeric_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), StandardScaler())

from sklearn.preprocessing import OneHotEncoder
# Categorical columns: fill missing values with the most common category, then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising at transform time (the default is 'error'), so predicting on
# validation/test rows cannot fail because of an unseen level.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [8]:
# ColumnTransformer applies a different transformer to each subset of columns and
# concatenates the results. Here it joins the numeric and categorical pipelines
# into one preprocessing step. Columns that are not listed in either group are
# dropped (ColumnTransformer's default remainder='drop').
from sklearn.compose import ColumnTransformer

column_pipelines = [
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

#### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

# Preprocessing followed by a logistic-regression classifier, fitted on the training split.
logreg = make_pipeline(preprocessor, LogisticRegression()).fit(X_train, y_train)

# Validation accuracy expressed as a percentage with two decimals.
acc_log = round(100 * logreg.score(X_val, y_val), 2)
acc_log

81.34

#### Support Vector Machines

In [10]:
from sklearn.svm import SVC

# Preprocessing followed by a support-vector classifier with default settings,
# fitted on the training split.
svc = make_pipeline(preprocessor, SVC()).fit(X_train, y_train)

# Validation accuracy expressed as a percentage with two decimals.
acc_svc = round(100 * svc.score(X_val, y_val), 2)
acc_svc

82.84

#### KNeighborsClassifier

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Preprocessing followed by a 3-nearest-neighbours classifier, fitted on the training split.
knn = make_pipeline(preprocessor, KNeighborsClassifier(n_neighbors=3)).fit(X_train, y_train)

# Validation accuracy expressed as a percentage with two decimals.
acc_knn = round(100 * knn.score(X_val, y_val), 2)
acc_knn

79.85

#### Gaussian Naive Bayes

In [12]:
from sklearn.naive_bayes import GaussianNB

# Preprocessing followed by a Gaussian naive Bayes classifier, fitted on the training split.
gaussian = make_pipeline(preprocessor, GaussianNB()).fit(X_train, y_train)

# Validation accuracy expressed as a percentage with two decimals.
acc_gaussian = round(100 * gaussian.score(X_val, y_val), 2)
acc_gaussian

76.87

#### Perceptron

In [13]:
from sklearn.linear_model import Perceptron

# Preprocessing followed by a perceptron with default settings, fitted on the training split.
perceptron = make_pipeline(preprocessor, Perceptron()).fit(X_train, y_train)

# Validation accuracy expressed as a percentage with two decimals.
acc_perceptron = round(100 * perceptron.score(X_val, y_val), 2)
acc_perceptron

73.13

#### Linear SVC

In [14]:
from sklearn.svm import LinearSVC

# Preprocessing followed by a linear support-vector classifier.
# random_state seeds the data shuffling used by the dual coordinate-descent solver
# (it has no effect when the primal problem is solved), so the fitted model and its
# score are repeatable. 42 matches the seed used for the train/validation split.
linear_svc = make_pipeline(preprocessor, LinearSVC(random_state=42))
linear_svc.fit(X_train, y_train)
# Validation accuracy expressed as a percentage with two decimals.
acc_linear_svc = round(linear_svc.score(X_val, y_val) * 100, 2)
acc_linear_svc



79.1

#### Stochastic Gradient Descent

In [15]:
from sklearn.linear_model import SGDClassifier

# Preprocessing followed by a linear classifier trained with stochastic gradient descent.
# SGD shuffles the training data between epochs, so an unseeded fit can give a
# different model (and validation score) on every run. Seed it with the same
# value used for the train/validation split and the grid-search SGDClassifier.
sgd = make_pipeline(preprocessor, SGDClassifier(random_state=42))
sgd.fit(X_train, y_train)
# Validation accuracy expressed as a percentage with two decimals.
acc_sgd = round(sgd.score(X_val, y_val) * 100, 2)
acc_sgd

76.12

#### Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Preprocessing followed by a decision tree.
# The tree builder randomly permutes the features at each split, so equally good
# splits can be chosen differently between runs; fixing random_state makes the
# fitted tree and its score repeatable. 42 matches the seed used for the split.
decision_tree = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=42))
decision_tree.fit(X_train, y_train)
# Validation accuracy expressed as a percentage with two decimals.
acc_decision_tree = round(decision_tree.score(X_val, y_val) * 100, 2)
acc_decision_tree

76.87

#### Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Preprocessing followed by a 100-tree random forest.
# The forest draws bootstrap samples and random feature subsets, so without a seed
# every run trains a different ensemble; fixing random_state makes the model, its
# score and its test predictions repeatable. 42 matches the seed used for the split.
random_forest = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, random_state=42),
)
random_forest.fit(X_train, y_train)
# Validation accuracy expressed as a percentage with two decimals.
acc_random_forest = round(random_forest.score(X_val, y_val) * 100, 2)
acc_random_forest

81.34

### Grid Search CV

Exhaustive search over specified parameter values for an estimator.

In [18]:
from sklearn.preprocessing import PolynomialFeatures

# Pipeline to tune: preprocessing -> polynomial feature expansion -> seeded SGD classifier.
# make_pipeline names each step after its lowercased class name; the grid-search
# parameter keys ('polynomialfeatures__...', 'sgdclassifier__...') rely on those names.
model = make_pipeline(
    preprocessor,
    PolynomialFeatures(),
    SGDClassifier(random_state=42),
)

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate values, addressed as '<step name>__<parameter>' within the pipeline.
params = {
    'polynomialfeatures__degree' : [2, 3, 4],   # degree of the polynomial expansion
    'sgdclassifier__penalty' : ['l1', 'l2'],    # regularization type of the classifier
}

# Try every combination (3 x 2 = 6) with 4-fold cross-validation, using all CPU cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=4,
    n_jobs=-1,
)

In [20]:
# Run the cross-validated search on the training split; with the default refit=True
# the best parameter combination is then refit on all of X_train.
grid.fit(X_train, y_train)

# Score of the refit best pipeline on the held-out validation split.
# (The selected parameters are displayed in the next cell, so they are not
# repeated here: a bare expression in the middle of a cell shows nothing.)
# NOTE(review): in the recorded run this is about 0.60, well below the validation
# accuracy recorded for the untuned SGD pipeline; worth investigating.
grid.score(X_val, y_val)

0.5970149253731343

In [21]:
# Parameter combination selected by the cross-validated grid search.
grid.best_params_

{'polynomialfeatures__degree': 2, 'sgdclassifier__penalty': 'l2'}

### Model evaluation

In [22]:
# Validation accuracy (in %) of each fitted pipeline, keyed by its display name.
validation_scores = {
    'Support Vector Machines': acc_svc,
    'KNN': acc_knn,
    'Logistic Regression': acc_log,
    'Random Forest': acc_random_forest,
    'Naive Bayes': acc_gaussian,
    'Perceptron': acc_perceptron,
    'Stochastic Gradient Decent': acc_sgd,
    'Linear SVC': acc_linear_svc,
    'Decision Tree': acc_decision_tree,
}
models = pd.DataFrame({
    'Model': list(validation_scores.keys()),
    'Score': list(validation_scores.values()),
})
# Best-scoring models first.
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Support Vector Machines,82.84
2,Logistic Regression,81.34
3,Random Forest,81.34
1,KNN,79.85
7,Linear SVC,79.1
4,Naive Bayes,76.87
8,Decision Tree,76.87
6,Stochastic Gradient Decent,76.12
5,Perceptron,73.13


### Predictions

In [23]:
# Predict the test split with every fitted pipeline; the order below fixes which
# pipeline feeds y_pred_1 ... y_pred_9.
fitted_pipelines = (
    logreg,
    svc,
    knn,
    gaussian,
    perceptron,
    linear_svc,
    sgd,
    decision_tree,
    random_forest,
)
(
    y_pred_1, y_pred_2, y_pred_3,
    y_pred_4, y_pred_5, y_pred_6,
    y_pred_7, y_pred_8, y_pred_9,
) = (pipeline.predict(X_test) for pipeline in fitted_pipelines)

In [24]:
# Submission table: one row per test passenger, pairing its id with the predicted label.
submission_columns = {
    "PassengerId": test_df["PassengerId"],
    "Survived": y_pred_1,  # predictions of the logistic-regression pipeline
}
submission = pd.DataFrame(submission_columns)

# Preview the first three rows.
submission.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0


In [25]:
# Uncomment the next line to write the submission file to disk.
# submission.to_csv('submission_1.csv', index=False)