## Imports

In [1]:
# pip install --upgrade scikit-learn

In [15]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24

Pandas   1.2.1
Sklearn  0.24.1


## Get the dataset
- **CLOUD = True**: Download dataset from Kaggle. Necesary for cloud enviroments like COLAB. **Specify your [kaggle credentials](https://www.kaggle.com/docs/api)**.
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [3]:
CLOUD = False

if CLOUD:
    import os
    os.environ['KAGGLE_USERNAME'] = "your_kaggle_username"
    os.environ['KAGGLE_KEY']      = "your_kaggle_api_key"  # See https://www.kaggle.com/docs/api
    !pip install --upgrade kaggle
    !kaggle competitions download -c titanic
    DATA_PATH = "./"

else:
    DATA_PATH = "../../datasets/titanic/"

## Load data

In [4]:
df      = pd.read_csv(DATA_PATH + "train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


## Check missings

In [5]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [6]:
df_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

# Exercise 1:
Extract the title (Mr, Mrs, ... ) from the "Name" column.

Tips:
- split(',')[1] to get the 2nd part, and remove the surnamename
- split('.')[0] to get the 1str part, and remove the name

In [7]:
df.head()

In [8]:
# CODE HERE get_Title_from_Name funtion
def get_Title_from_Name(name):
    s = name.split(',')[1].split('.')[0].strip()   
    return s 

df["Title"]      = df['Name'].map(get_Title_from_Name)
df_test["Title"] = df_test['Name'].map(get_Title_from_Name)
df_test.Title.value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dona        1
Ms          1
Dr          1
Name: Title, dtype: int64

# Exercise 2:
Apply the title_dictionary to get a better information about the title. You have to overwrite the Title variable.

In [9]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [10]:
# CODE HERE

df["Title"] = [title_dictionary[n] for n in df['Title']]
df_test["Title"] = [title_dictionary[n] for n in df_test['Title']]

# Exercise BONUS:
Try to extract some information from the feature **Ticket**

# Exercise BONUS:
Try to extract some information from the feature **Cabin**

# Preprocessing
For X data:
- We drop Survived because is the target variable
- We drop Name because we have extracted the Title: Mr, Mrs, ...
- We drop Ticket because it has no information -> see df.Ticket.nunique()
- We drop Cabin because it has a lot of missings (77% are missings)

Then, we identify **numerical** variables and **categorical** variables,

In [11]:
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes

x_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin']) # # X_TEST DATA (NEW DATA)

In [12]:
cat_vars  = ['Sex', 'Embarked', 'Title']         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title']


# Exercise 3:
Create a **ColumnTransformer for Tree Models**. Remember:
- Categorical: Some SimpleImputer -> Some Encoder
- Numerical: Some SimpleImputer -> NO Encoder

In [13]:
num_preprocessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
  ('imputer', impute.SimpleImputer(strategy='mean',add_indicator=False))
])

cat_preporcessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
  ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
  # Some Encoder here. Remember to handle_unknown
  ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_preprocessing, num_vars),
    ('cat', cat_preporcessing, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro

# Exercise 4
1. Complete the diccionary with some Tree Models.
2. Then we put each model in a Pipeline where:
   - first is the prepocessing with the column Transformer
   - Then is the Tree model
3. Display the fullpipeline of the LGBMClassifier

In [16]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

In [17]:
tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees": ExtraTreesClassifier(),
  "Random Forest": RandomForestClassifier(),
  "AdaBoost": AdaBoostClassifier(),
  "Skl GBM": GradientBoostingClassifier(),
  "Skl HistGBM": HistGradientBoostingClassifier(),
  "XGBoost": XGBClassifier(),
  "LightGBM": LGBMClassifier(),
  "CatBoost": CatBoostClassifier(verbose=200)
}

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

# Exercise 5:
Define a simple split validation strategy with:
- 80% for train
- 20% for validation
- With stratification
- random_state=0

And train all the models in a for loop

In [18]:
x_train, x_val, y_train, y_val = model_selection.train_test_split(x,y, test_size=0.2, stratify=y)

for model_name, model in tree_classifiers.items():
    # CODE HERE
    # TRAIN PIPELINE (PREPRO + MODEL) WITH TRAIN DATA
    m = tree_classifiers[model_name].fit(x_train, y_train)
    pred = m.predict(x_val) 
    # EVAL PIPELINE WITH VAL DATA (SEE ACCURACY AND BALANCED_ACCURACY)
    print(model_name, " has accuracy of ", metrics.accuracy_score(pred,y_val), " and balnced accuracy ", metrics.balanced_accuracy_score(pred,y_val))

Decision Tree  has accuracy of  0.7486033519553073  and balnced accuracy  0.7358429858429858
Extra Trees  has accuracy of  0.8156424581005587  and balnced accuracy  0.8067176186645213
Random Forest  has accuracy of  0.7877094972067039  and balnced accuracy  0.7763859275053304
AdaBoost  has accuracy of  0.8212290502793296  and balnced accuracy  0.8113306982872199
Skl GBM  has accuracy of  0.8268156424581006  and balnced accuracy  0.8242997198879551
Skl HistGBM  has accuracy of  0.8156424581005587  and balnced accuracy  0.8096912048524951
XGBoost  has accuracy of  0.8100558659217877  and balnced accuracy  0.8012820512820513
LightGBM  has accuracy of  0.8212290502793296  and balnced accuracy  0.8168935815504307
Learning rate set to 0.008911
0:	learn: 0.6873504	total: 48.1ms	remaining: 48s
200:	learn: 0.3787116	total: 377ms	remaining: 1.5s
400:	learn: 0.3411506	total: 753ms	remaining: 1.13s
600:	learn: 0.3166280	total: 1.09s	remaining: 722ms
800:	learn: 0.2955592	total: 1.46s	remaining: 36

# Exercise 6:
Define a 10 Fold cross validation strategy with:
- With stratification
- shuffle=True
- random_state=0

And train all the models in a for loop.

Tip you can use **[cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)** for both training and predict with 

In [27]:
skf = model_selection.StratifiedKFold(n_splits=10)

for model_name, model in tree_classifiers.items():
    # CODE HERE
    # TRAIN PIPELINE (PREPRO + MODEL) WITH TRAIN DATA
    pred = model_selection.cross_val_predict(tree_classifiers[model_name], x_train, y_train, cv=skf)
    # EVAL PIPELINE WITH VAL DATA (SEE ACCURACY AND BALANCED_ACCURACY)
    print(model_name, " has accuracy of ",model.score(x_val,y_val))
    

Decision Tree  has accuracy of  0.7486033519553073
Extra Trees  has accuracy of  0.8156424581005587
Random Forest  has accuracy of  0.7877094972067039
AdaBoost  has accuracy of  0.8212290502793296
Skl GBM  has accuracy of  0.8268156424581006
Skl HistGBM  has accuracy of  0.8156424581005587
XGBoost  has accuracy of  0.8100558659217877
LightGBM  has accuracy of  0.8212290502793296
Learning rate set to 0.008515
0:	learn: 0.6873069	total: 1.02ms	remaining: 1.02s
200:	learn: 0.3752561	total: 369ms	remaining: 1.47s
400:	learn: 0.3357281	total: 688ms	remaining: 1.03s
600:	learn: 0.3109789	total: 1.05s	remaining: 695ms
800:	learn: 0.2906578	total: 1.36s	remaining: 339ms
999:	learn: 0.2711134	total: 1.73s	remaining: 0us
Learning rate set to 0.008515
0:	learn: 0.6875893	total: 1.03ms	remaining: 1.03s
200:	learn: 0.3775901	total: 286ms	remaining: 1.14s
400:	learn: 0.3366131	total: 657ms	remaining: 981ms
600:	learn: 0.3118529	total: 982ms	remaining: 652ms
800:	learn: 0.2917080	total: 1.35s	remaini

# Exercise 7
Train **with all data** the best model

In [21]:
best_model = 'CatBoost' #CatBoostClassifier(verbose=200)# Select your best model
# Train with all data your best model
x_p = tree_classifiers[best_model].fit(x,y)


Learning rate set to 0.009807
0:	learn: 0.6861863	total: 1.19ms	remaining: 1.19s
200:	learn: 0.3779043	total: 342ms	remaining: 1.36s
400:	learn: 0.3454862	total: 678ms	remaining: 1.01s
600:	learn: 0.3227697	total: 1.07s	remaining: 708ms
800:	learn: 0.3009774	total: 1.45s	remaining: 360ms
999:	learn: 0.2789732	total: 1.84s	remaining: 0us


# Exercise 8
With your best model, generate the predicitions for test data (x_test)

In [22]:

test_pred = x_p.predict(x_test)# Get the predictions for x_test

# Exercise 9

Submit to kaggle using the kaggle API. And send us your score. You can try to improve it.

In [23]:
sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [24]:
sub.to_csv("sub.csv")

In [None]:
#!kaggle competitions submit -c titanic -f sub.csv -m "My submission message"

# Exercise BONUS

Knowing how to export your models is very important for putting models in production. Try to
- Export and Load the ColumTransformer in pickle
- Export and Load the ColumTransformer in joblib
- Export and load the Pipeline