In [1]:
import pandas as pd
from sklearn.preprocessing import Imputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from pylab import *

pd.options.display.max_columns = None
%matplotlib inline
# set default palette to Set2
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Set2.colors)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# when I want to specify the result of my print() function
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [2]:
# Set PassengerId as an index to use them as a key value
train = pd.read_csv('train.csv', index_col = 'PassengerId')
test = pd.read_csv('test.csv', index_col = 'PassengerId')

In [3]:
target = train['Survived']
train = train.drop(['Survived'], axis=1)

In [4]:
## concat the two data sets, so it's easy to do feature engineering in one go
titanic = pd.concat([train, test], keys=["train", "test"])

In [5]:
## save it for later submission
passengerid = test.index

In [6]:
titanic.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,PassengerId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
train,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
train,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
train,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
train,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
train,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Defining the class 'engineering'

In [7]:
import re as re


class engineering():
    
    dock = titanic['Embarked'].value_counts().index.tolist()[0]
    
    def __init__(self, data):
        self.data = data
    
    def col_title(self):
        def get_title(name):
            title_search = re.search(' ([A-Za-z]+)\.', name)
            # If the title exists, extract and return it.
            if title_search:
                return title_search.group(1)
            return ""
        self['Title'] = self['Name'].apply(get_title)
        self['Title'] = self['Title'].replace('Mlle', 'Miss')
        self['Title'] = self['Title'].replace('Sir', 'Mr')
        self['Title'] = self['Title'].replace('Ms', 'Miss')
        self['Title'] = self['Title'].replace('Mme', 'Mrs')
        self['Title'] = self['Title'].replace(['Dr', 'Rev', 'Major', 'Col', 'Lady', 'Capt', 'Countess','Don', 
                                                     'Jonkheer', 'Dona'], 'Other')
        return self
    
    def imp_age(self):
        mid = []
        # calculate the median, impute them
        for title in titles:
            a = self.Age[self['Title'] == title].median()
            self.loc[:, 'Age'][(self['Title'] == title) & (self['Age'].isna())] = a
        return self
    
    def imp_embark(self):
        #dock = self.Embarked.value_counts().max()
        self['Embarked'][self['Embarked'].isna()] = engineering.dock
        return self
    
    def col_group(self):
        self['Group_num'] = self["SibSp"] + self["Parch"]
        return self

#### making a 'Title' column

Since there are various titles in the 'Name' column, it should be cleaned.

In [8]:
titanic = engineering.col_title(titanic)

#### Imputation

In [9]:
# check for missing values
print(color.BOLD + "NANs in data" + color.END, "\n", "\n", titanic.isna().sum(), '\n')

[1mNANs in data[0m 
 
 Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
Title          0
dtype: int64 



The 'Survived' column is from the test dataset. The test and train datasets will be detatched after feature engineering.

#### Imputation - Age

Impute the missing ages with the medians.
<br>
The medians are calculated with respect to title. 
<br>
Mr, Miss, Master, etc will have different medians.

In [10]:
# I have to set the title list, to make the function work.
titles = ['Mr', "Mrs", 'Miss', 'Master', 'Other']

In [11]:
titanic = engineering.imp_age(titanic)

#### Imputation - Embarked

In [12]:
# the missing values 
print(color.BOLD + 'Embark Count' + color.END, '\n', titanic.Embarked.value_counts())
# Will use the most common factor to impute

[1mEmbark Count[0m 
 S    914
C    270
Q    123
Name: Embarked, dtype: int64


In [13]:
titanic = engineering.imp_embark(titanic)

In [14]:
# You can change the variable that you want to impute on column 'Embarked' by using the following code
# As an example, you can do this
# engineering.dock = titanic['Embarked'].value_counts().index.tolist()[1]

# Confirm if it changed
# engineering.__dict__

#### Make a 'group_num' column

In [15]:
titanic = engineering.col_group(titanic)

#### Drop unused columns

In [16]:
unused_col = ['Name', 'Cabin', 'Ticket']
titanic = titanic.drop(unused_col, axis = 1)

# End of feature engineering. Start scaling and encoding

In [17]:
# split the data sets into it's regular form
train = titanic.ix['train']
test  = titanic.ix['test']

### Definging the class for scaling and encoding

In [18]:
from sklearn import preprocessing

class scale_encode():
    
    def __init__(self, data):
        self.data = data
        
    def scaling(self):
        # select numeric features
        self = self.select_dtypes(exclude = [object])
        # make it into an array, to feed it to the scaler
        self = np.array(self)
        scaler = preprocessing.StandardScaler().fit(self)
        scaled = scaler.transform(self)

        return scaled
    
    def encoding(self):
        # select object features
        self = self.select_dtypes(include= [object])
        # factorizing the features
        label_encode = preprocessing.LabelEncoder()
        factor_encoded = self.apply(label_encode.fit_transform)
        # define encoder, fit, and transform
        encoder = preprocessing.OneHotEncoder()
        encoder.fit(factor_encoded)
        one_hot = encoder.transform(factor_encoded).toarray()
        
        return one_hot

[Inspired by][1]

[1]:http://www.ritchieng.com/machinelearning-one-hot-encoding/

#### Standardizing the numerical values

In [19]:
# There was one missing value in the test set. The column is 'Fare'. Impute median.
test.loc[:, 'Fare'][test['Fare'].isna()] = median(test.loc[:, 'Fare'][test['Fare'].isna() == False])

In [20]:
train_num_scaled = scale_encode.scaling(train)
test_num_scaled = scale_encode.scaling(test)

In [21]:
print(color.BOLD + "How it looks like" + color.END, '\n', 
      train_num_scaled.shape, '\n', train_num_scaled)

[1mHow it looks like[0m 
 (891, 6) 
 [[ 0.82737724 -0.55181621  0.43279337 -0.47367361 -0.50244517  0.05915988]
 [-1.56610693  0.65766815  0.43279337 -0.47367361  0.78684529  0.05915988]
 [ 0.82737724 -0.24944512 -0.4745452  -0.47367361 -0.48885426 -0.56097483]
 ...
 [ 0.82737724 -0.55181621  0.43279337  2.00893337 -0.17626324  1.29942929]
 [-1.56610693 -0.24944512 -0.4745452  -0.47367361 -0.04438104 -0.56097483]
 [ 0.82737724  0.20411152 -0.4745452  -0.47367361 -0.49237783 -0.56097483]]


#### one-hot-encoding categorical values

In [22]:
train_factor_1hot = scale_encode.encoding(train)
test_factor_1hot = scale_encode.encoding(test)

Concat the numerical and categorical features

In [23]:
# Concat the two arrays to make the final data set
train_final = np.concatenate((train_num_scaled, train_factor_1hot), axis = 1)
test_final = np.concatenate((test_num_scaled, test_factor_1hot), axis = 1)

In [24]:
print(print(color.BOLD + "The final look of our data" + color.END, '\n', 
      train_final.shape, '\n', train_final))

[1mThe final look of our data[0m 
 (891, 16) 
 [[ 0.82737724 -0.55181621  0.43279337 ...  1.          0.
   0.        ]
 [-1.56610693  0.65766815  0.43279337 ...  0.          1.
   0.        ]
 [ 0.82737724 -0.24944512 -0.4745452  ...  0.          0.
   0.        ]
 ...
 [ 0.82737724 -0.55181621  0.43279337 ...  0.          0.
   0.        ]
 [-1.56610693 -0.24944512 -0.4745452  ...  1.          0.
   0.        ]
 [ 0.82737724  0.20411152 -0.4745452  ...  1.          0.
   0.        ]]
None


# Start  ensemble

In [25]:
target = np.array(target)

In [26]:
target.shape

(891,)

In [27]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import svm

### 1. SVM

#### Grid search - SVM

In [28]:
from sklearn.model_selection import GridSearchCV

# the gridsearch will search for 9 * 18 combinations of estimators
params = [
  {'C': [1, 5, 100], 'gamma': [1, 0.1, 0.001], 'kernel': ['rbf']},
  {'C': [1, 5, 100], 'kernel': ['poly'], 'degree': [3, 4, 5], 'coef0': [1,2] }
 ]

svc = svm.SVC(random_state = 10)

In [29]:
# do cross validation. Cross validation 5 times
grid_search = GridSearchCV(svc, params, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_final, target)
print("Done Searching")

Done Searching


In [33]:
# The best parameters with out overfitting problems.
a = grid_search.best_params_

#### Apply grid - SVM

In [215]:
# higher c means a stricter classifier.
# rbf kernel adds similarity features
# a small gamma value makes the bell-shaped curve wider, 
# so instances have a larger range of influence, and the decision boundary ends up smoother
# So γ acts like a regularization hyperparameter: if your model is overfitting, you should reduce it

model_svm = svm.SVC(kernel="rbf", gamma=0.1, C=1, random_state = 10)
model_svm.fit(train_final, target)
model_svm.score(train_final, target)

0.8383838383838383

### 2. Bagging decision trees

In [29]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# 300 models, trained on 100 random samples from the training data set.
# bootstrap = True means the used obervations will be replaced
# n_jobs=-1. Use all available CPU cores
# oob_score. estimate out of bag score

In [214]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=300,
    max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(train_final, target)
bag_clf.score(train_final, target)

0.8641975308641975

In [216]:
# out of bag. It is the data observations that are not used when sampling, for each random sampling.
# after the model is trained, it uses out of bag observations to predict.
# since the oob observations are not used at training, they could give a glimpse of how the model will work on the real test data

bag_clf.oob_score_

0.8361391694725028

 ### 3. Randomforest

#### Grid search - Randomforest

In [73]:
from sklearn.ensemble import RandomForestClassifier

rand_f = RandomForestClassifier()
params = [
    {"n_estimators" : [10, 100, 200, 300],
    "max_leaf_nodes" : [5, 10, 15],
    "min_samples_leaf" : [1, 2, 4]}
]

In [74]:
grid_search = GridSearchCV(rand_f, params, cv=5)
grid_search.fit(train_final, target)
print("Done Searching")

Done Searching


In [75]:
grid_search.best_params_

{'max_leaf_nodes': 10, 'min_samples_leaf': 1, 'n_estimators': 300}

#### Apply grid - Randomforest

In [76]:
model_rf = RandomForestClassifier(max_leaf_nodes= 10, min_samples_leaf= 1, n_estimators= 300)
model_rf.fit(train_final, target)
model_rf.score(train_final, target)

0.835016835016835

### 4. ExtraTreesClassifier

In [31]:
from sklearn.ensemble import ExtraTreesClassifier

extree = ExtraTreesClassifier()
params = [
    {"n_estimators" : [10, 100, 200, 300],
    "max_leaf_nodes" : [10, 15, 20],
    "min_samples_leaf" : [1, 2, 4]}
]

In [None]:
grid_search = GridSearchCV(extree, params, cv=5)
grid_search.fit(train_final, target)
print('Done searching')

In [None]:
grid_search.best_params_

#### apply grid

In [199]:
model_et = ExtraTreesClassifier(max_leaf_nodes= 15, min_samples_leaf= 2, n_estimators= 10)
model_et.fit(train_final, target)
model_et.score(train_final, target)

0.8305274971941639

### 5. Gradient Boosting

In [79]:
from sklearn.ensemble import GradientBoostingClassifier

params = [
    {"learning_rate": [0.05, 0.1, 0.5],
     "n_estimators": [100, 200, 300],
     "max_depth": [1, 3, 5]
    }
]

In [None]:
gb = GradientBoostingClassifier(random_state = 10)
grid_search = GridSearchCV(gb, params, cv=5, scoring='roc_auc', verbose = 3)
grid_search.fit(train_final, target)
print("Done searching")

In [None]:
grid_search.best_params_

#### apply grid

In [80]:
model_gb = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200, random_state = 10)
model_gb.fit(train_final, target)
model_gb.score(train_final, target)

0.9315375982042648

### 6. KNN

In [33]:
from sklearn.neighbors import KNeighborsClassifier

params = [
    {"n_neighbors": [5, 7, 10, 15]}
]

In [221]:
kn = KNeighborsClassifier(n_jobs = -1)
grid_search = GridSearchCV(kn, params, cv=5)
grid_search.fit(train_final, target)
print("Done Searching")

Done Searching


In [223]:
grid_search.best_params_

{'n_neighbors': 10}

In [224]:
model_kn = KNeighborsClassifier(n_neighbors = 10)
model_kn.fit(train_final, target)
model_kn.score(train_final, target)

0.8417508417508418

### 7. Naive Bayes

In [34]:
from sklearn.naive_bayes import GaussianNB

In [53]:
model_nb = GaussianNB()
model_nb.fit(train_final, target)
model_nb.score(train_final, target)

0.8092031425364759

## Start stacking

See the link to the stacking package "vecstack"
<br>
https://github.com/vecxoz/vecstack

#### First layer

In [68]:
clf = XGBClassifier()

params = {
        'n_estimators': [100, 250, 500],
        'eta': [0.05, 0.1, 0.3],
        'max_depth': [6, 9, 12],
        'subsample': [0.9, 1.0],
        'colsample_bytree': [0.9, 1.0],
    }


In [69]:
grid_search = GridSearchCV(xg, params, cv=5)
grid_search.fit(train_final, target)
print("Done Searching")

ValueError: Invalid parameter colsample_bytree for estimator GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=4, num_class=2, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=1, subsample=1, val_metric='auc'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'num_boost_round': [100, 250, 500], 'eta': [0.05, 0.1, 0.3], 'max_depth': [6, 9, 12], 'subsample': [0.9, 1.0], 'colsample_bytree': [0.9, 1.0], 'n_estimators': [100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0). Check the list of available parameters with `estimator.get_params().keys()`.

In [82]:
from vecstack import StackingTransformer
from sklearn.metrics import mean_absolute_error

# Set the parameters according to the ones found above
# Without gradientboost, the target gets better.

estimators_L1 = [
    ('et', ExtraTreesClassifier(max_leaf_nodes= 15, min_samples_leaf= 2, n_estimators= 10)),
        
    ('rf', RandomForestClassifier(max_leaf_nodes= 10, min_samples_leaf= 1, n_estimators= 100)),
        
    ('gb', GradientBoostingClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200, random_state = 10)),
    
    ('sv', svm.SVC(kernel="rbf", gamma=0.1, C=1, random_state = 10)),
    
    ('bc', BaggingClassifier(DecisionTreeClassifier(), n_estimators=300, max_samples=100, bootstrap=True, n_jobs=-1)),
    
    ('nb', GaussianNB()),
    
    ('kn', KNeighborsClassifier(n_neighbors = 10))
]

In [83]:
## start stacking

stack = StackingTransformer(estimators=estimators_L1,   # base estimators
                            regression=False,           # regression task (if you need classification - set to False)
                            variant='A',                # oof for train set, predict test set in each fold and find mean
                            metric=mean_absolute_error, # metric: callable
                            n_folds=4,                  # number of folds
                            shuffle=True,               # shuffle the data
                            random_state=10,            # ensure reproducibility
                            verbose=2)                  # print all info

In [84]:
# fit the stacking transformer
stack = stack.fit(train_final, target)

task:         [classification]
n_classes:    [2]
metric:       [mean_absolute_error]
variant:      [A]
n_estimators: [4]

estimator  0: [et: ExtraTreesClassifier]
    fold  0:  [0.14798206]
    fold  1:  [0.17488789]
    fold  2:  [0.22869955]
    fold  3:  [0.13963964]
    ----
    MEAN:     [0.17280229] + [0.03480179]

estimator  1: [rf: RandomForestClassifier]
    fold  0:  [0.14798206]
    fold  1:  [0.16143498]
    fold  2:  [0.22869955]
    fold  3:  [0.13063063]
    ----
    MEAN:     [0.16718681] + [0.03715534]

estimator  2: [sv: SVC]
    fold  0:  [0.15695067]
    fold  1:  [0.15695067]
    fold  2:  [0.22869955]
    fold  3:  [0.14414414]
    ----
    MEAN:     [0.17168626] + [0.03332926]

estimator  3: [bc: BaggingClassifier]
    fold  0:  [0.14349776]
    fold  1:  [0.17488789]
    fold  2:  [0.21973094]
    fold  3:  [0.13063063]
    ----
    MEAN:     [0.16718681] + [0.03434282]



In [85]:
# transform them
stack1_train = stack.transform(train_final)
stack1_test = stack.transform(test_final)

Train set was detected.
Transforming...

estimator  0: [et: ExtraTreesClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [rf: RandomForestClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  2: [sv: SVC]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  3: [bc: BaggingClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

Transforming...

estimator  0: [et: ExtraTreesClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [rf: RandomForestClassifier]
    model from fold  0: done
    mode

Done with the first layer.
<br>
We now have our train set and test set finalized. They are made based on the votes of our seven models.

#### See the correlations between models

Stacking is like democracy. Models raise their hands if they think the person survived or not, and the result with the most votes will be finalized.
<br>
So, it is best to choose discrete models, to lower the correlation. With low correlation, there will be more improvements in the final output.

In [47]:
classifiers = ["ExtraTree", "RandomForest", "GradientBoost", "SVM", "Bagging", "NaiveBayes", "KNN"]
df = pd.DataFrame(stack1_train)
df.columns = classifiers

In [48]:
df.corr()

Unnamed: 0,ExtraTree,RandomForest,SVM,Bagging,NaiveBayes,KNN
ExtraTree,1.0,0.985149,0.957793,0.902756,0.917326,0.829722
RandomForest,0.985149,1.0,0.967799,0.913135,0.921977,0.845358
SVM,0.957793,0.967799,1.0,0.89505,0.915004,0.862289
Bagging,0.902756,0.913135,0.89505,1.0,0.858066,0.839184
NaiveBayes,0.917326,0.921977,0.915004,0.858066,1.0,0.816814
KNN,0.829722,0.845358,0.862289,0.839184,0.816814,1.0


Thanks to the two links below, I was able to kick start stacking.
<br>
https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
<br>
https://mlwave.com/kaggle-ensembling-guide/

#### Train the second layer

In [86]:
from xgboost import XGBClassifier
model_xgb = XGBClassifier(colsample_bytree = 0.6, max_depth = 3, n_estimators = 1000, reg_alpha = 0.02, subsample = 1)
model_xgb.fit(stack1_train, target)
model_xgb.score(stack1_train, target)

0.8338945005611672

In [87]:
pred = model_xgb.predict(stack1_test)
prediction = pd.DataFrame({"PassengerId": passengerid,
                           "Survived": pred
})
prediction.to_csv("submit.csv", index=False)