# Titanic

## Imports

In [2]:
import sys
!{sys.executable} -m pip install xgboost

Collecting xgboost
  Using cached xgboost-1.5.0-py3-none-macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64.whl (1.3 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0


In [304]:
import re

import random
random.seed(42)

import math

# Core
import numpy as np
import pandas as pd

# Transformers
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Import metrics
from sklearn.metrics import accuracy_score

# Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from xgboost import XGBClassifier

# Model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# K-Fold
from sklearn.model_selection import StratifiedKFold

# Visualisation
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

## Data import

In [233]:
# Load train data
train = pd.read_csv('./data/train.csv')

# Load test data
X_test = pd.read_csv('./data/test.csv')

submission_data = pd.read_csv('./data/gender_submission.csv')


X = train.drop("Survived",axis=1)
y = train["Survived"].astype(int)

## Verificatoin stratagy

In [234]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

## Cleaning data

In [6]:
titanic_df.isna().sum()

NameError: name 'titanic_df' is not defined

### Age

In [235]:
class AgeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        self.mean = X['Age'].mean() 
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Age'] = X_['Age'].fillna((self.mean))
        return X_

# Pipline for Age
age_clean_pipeline = Pipeline([
    ('immuter', AgeTransformer()),
])




### Cabin

In [236]:
class CabinTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Cabin'] = X_['Cabin'].fillna('N')
        return X_
    
# Pipline for Cabin
cabin_clean_pipeline = Pipeline([
    ('immuter', CabinTransformer()),
])




### Fare

In [237]:
class FareTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        self.mean = X['Fare'].mean() 
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Fare'] = X_['Fare'].fillna((self.mean))
        return X_

# Pipline for Fare
fare_clean_pipeline = Pipeline([
    ('imputer', FareTransformer()),
])




### Embarked

In [238]:
X['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [239]:
class EmbarkedTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Embarked'] = X_['Embarked'].fillna('N')
        return X_
    
# Pipline for Embarked
embarked_clean_pipeline = Pipeline([
    ('immuter', EmbarkedTransformer()),
])




### New features

#### Hase cabin

In [240]:
class HasCabinTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Has_Cabin'] = X_['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
        return X_
    
add_has_cabin_pipline = Pipeline([
    ('new', HasCabinTransformer()),
])




#### Cabin_Label

In [241]:
class CabinLabelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Cabin_Label'] = X_['Cabin'].str.get(0)
        return X_
    
add_cabin_label_pipline = Pipeline([
    ('new', CabinLabelTransformer()),
])




#### Family size

In [242]:
class FamilySizeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Family_Size'] = X_['SibSp'] + X_['Parch'] + 1
        return X_
    
add_family_size_pipline = Pipeline([
    ('new', FamilySizeTransformer()),
])




#### Alone

In [243]:
class IsAloneTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Is_Alone'] = 0
        X_.loc[X_['Family_Size'] == 1, 'Is_Alone'] = 1
        return X_
    
add_is_alone_pipline = Pipeline([
    ('new', IsAloneTransformer()),
])




#### Title

In [244]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""


class TitleTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X_ = X.copy()
        X_['Title'] = X_['Name'].apply(get_title)
        X_['Title'] = X_['Title'].replace('Mlle', 'Miss')
        X_['Title'] = X_['Title'].replace('Ms', 'Miss')
        X_['Title'] = X_['Title'].replace('Mme', 'Mrs')
        return X_
    
add_title_pipline = Pipeline([
    ('new', TitleTransformer()),
])




### Final features

In [245]:
add_before_clean_pipline = Pipeline([
    ('Has_Cabin', add_has_cabin_pipline),
])

clean_pipeline = Pipeline([
    ('Age', age_clean_pipeline),
    ('Cabin', cabin_clean_pipeline),
    ('Embarked', embarked_clean_pipeline),
    ('Fare', fare_clean_pipeline),
])

add_after_clean_pipline = Pipeline([
    ('Cabin_Label', add_cabin_label_pipline),
    ('Family_Size', add_family_size_pipline),
    ('Is_Alone', add_is_alone_pipline),
    ('Title', add_title_pipline),
])

cat_pipline = Pipeline([
    ('cat', OneHotEncoder(handle_unknown='ignore')),
])

num_pipline = Pipeline([
    ('scaler', StandardScaler()),
])

categoties = ['Has_Cabin', 'Sex', 'Cabin', 'Embarked', 'Pclass', 'Cabin_Label', 'Is_Alone', 'Title']
num = ['Age', 'Fare', 'SibSp', 'Parch', 'Family_Size']

feats = ColumnTransformer(transformers=[
    ('num', num_pipline, num),
    ('cat', cat_pipline, categoties),
])

feature_processing = Pipeline([
    ('add before', add_before_clean_pipline),
    ('clean', clean_pipeline),
    ('add after', add_after_clean_pipline),
    ('feats', feats),
])

## Models

### DecisionTreeClassifier

In [351]:
#Decision tree model
model=DecisionTreeClassifier(random_state=42)

param_grid = {
    'criterion':['gini','entropy'],
    'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],
    'min_samples_leaf': [1,2,3,4,5,10,20]
}

gscv = GridSearchCV(model, param_grid, cv=skf, scoring="accuracy", return_train_score=True)

# Transform data
X_t = feature_processing.fit_transform(X)


# Fit model
gscv.fit(X_t, y)

print("--------------------------------------------------------")
print("Best estimator:", gscv.best_estimator_)
print("--------------------------------------------------------")
print("Best score:", gscv.best_score_)
print("--------------------------------------------------------")
# print('Mean test score: {}'.format(gscv.cv_results_['mean_test_score']))
# print('Mean train score: {}'.format(gscv.cv_results_['mean_train_score']))

--------------------------------------------------------
Best estimator: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')
--------------------------------------------------------
Best score: 0.8260381593714928
--------------------------------------------------------


Base score: 

v1 = 0.8148148148148149
v3 = 0.8237934904601572
v4 = 0.8159371492704827
v5 = 0.8204264870931538
v6 = 0.8260381593714928

#### First submit to kagle

In [408]:
X_test_t = feature_processing.transform(X_test)

last_prediction = gscv.best_estimator_.predict(X_test_t)
submission_data['Survived'] = last_prediction.astype(int)
submission_data.to_csv('./submissions/submission_decision_tree.csv', index = False)

ValueError: Length of values does not match length of index

Result on test data on Kagle: 
v1 = 0.78708
v2 = 0.73684
v3 = 0.76315
v6 = 0.76555

### Bagging

#### Bagging using sklearn 

In [397]:
param_grid = {
    'n_estimators' : [50, 100, 150, 200, 300, 400, 500, 1000, 1500, 2000, 2500, 3000, 3500],
    'max_samples' : [100, 200, 250, 300, 350, 400, 450, 500, 550]
}

model = GridSearchCV(
    BaggingClassifier(
        DecisionTreeClassifier(
            random_state=42),
        bootstrap=True,
        n_jobs=-1,
        oob_score=True,
        random_state=42,
    ),
    param_grid,
    scoring="accuracy",
)

model.fit(X_t, y)

print("--------------------------------------------------------")
print("Best estimator:", gscv.best_estimator_)
print("--------------------------------------------------------")
print("Best score:", gscv.best_score_)
print("--------------------------------------------------------")

--------------------------------------------------------
Best estimator: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')
--------------------------------------------------------
Best score: 0.8260381593714928
--------------------------------------------------------


##### First submit to kagle

In [413]:
X_test_t = feature_processing.transform(X_test)
last_prediction = model.best_estimator_.predict(X_test_t)
submission_data['Survived'] = last_prediction.astype(int)
submission_data.to_csv('./submissions/submission_bagging_sklearn.csv', index = False)

Finul score on Kagel: 0.77751

In [464]:
param_grid = {
    'n_estimators' : [10, 20, 50, 100, 150, 200, 300, 400, 500, 1000, 2000, 3000],
    'max_samples' : [0.05, 0.1, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.7, 0.9],
    'max_features' : [0.6, 0.65, 0.7, 0.71, 0.72, 0.73, 0.8]
}


model = GridSearchCV(BaggingClassifier(
        DecisionTreeClassifier(random_state=42),
        bootstrap=True,
        n_jobs=-1,
        oob_score=True,
        random_state=42,
    ),
    param_grid,         
    scoring="accuracy",
)
model.fit(X_t, y)

print("--------------------------------------------------------")
print("Best estimator:", model.best_estimator_)
print("--------------------------------------------------------")
print("Best score:", model.best_score_)
print("--------------------------------------------------------")

  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have 

  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have 

  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have 

  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


--------------------------------------------------------
Best estimator: BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
  

Попробуем объединить лучшие результаты:

In [469]:
param_grid = {
    'n_estimators' : [90, 100, 120],
    'max_samples' : [0.6, 0.65, 0.69, 0.7, 0.71, 0.75, 0.8],
    'max_features' : [0.7, 0.71, 0.72, 0.73],
    'base_estimator__max_depth': [3, 4, 5, 6, 7],
    'base_estimator__min_samples_leaf': [1, 2, 3, 4, 5],
}


model = GridSearchCV(BaggingClassifier(
        DecisionTreeClassifier(
            random_state=42,
            criterion='gini',
            max_depth=4,
            min_samples_leaf=1,
        ),
        bootstrap=True,
        n_jobs=-1,
        oob_score=True,
        random_state=42,
    ),
    param_grid,         
    scoring="accuracy",
)
model.fit(X_t, y)

print("--------------------------------------------------------")
print("Best estimator:", model.best_estimator_)
print("--------------------------------------------------------")
print("Best score:", model.best_score_)
print("--------------------------------------------------------")

--------------------------------------------------------
Best estimator: BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=6,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=2,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
     

In [470]:
X_test_t = feature_processing.transform(X_test)
last_prediction = model.predict(X_test_t)
submission_data['Survived'] = last_prediction.astype(int)
submission_data.to_csv('./submissions/submission_bagging_sklearn.csv', index = False)

Kegle: 0.77990

#### Bagging with Custom Classifier

In [297]:
class CustomBaggingClassifier(BaseEstimator, ClassifierMixin):
    estimators = []
    
    def __init__(self, base_estimator, n_estimators=10, max_samples=1.0, max_features=1.0, random_state=None):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.random_state = random_state
        
    def __filter(self, X, y, features, samples):
        """ Helper which filter samples and features by list of indexes """
        new_X = []
        new_y = []
        
        X_filtered = np.take(X.toarray(), features, axis=1)
        
        for i in samples:
            new_X.append(X_filtered[i])
            new_y.append(y[i])
        
        new_X = np.array(new_X)
        new_y = np.array(new_y)
        
        return (new_X, new_y)
    
    
    def __filter_only_features(self, X, features):
        """ Helper which filter features by list of indexes """
        new_X = []
        
        X_filtered = np.take(X.toarray(), features, axis=1)
        
        return X_filtered
  

    def fit(self, X, y):
        estimators = []
        
        number_of_features = X.shape[1]
        number_of_samples = X.shape[0]
        
        number_of_features_to_use = math.floor(number_of_features * self.max_features)
        number_of_samples_to_use = math.floor(number_of_samples * self.max_samples)
        
        all_features = range(0, number_of_features)
        all_samples = range(0, number_of_samples)
        
        for i in range(0, self.n_estimators):
            features = random.sample(all_features, number_of_features_to_use)
            samples = random.choices(all_samples, k=number_of_samples_to_use)
            features.sort()
            
            (new_X, new_y) = self.__filter(X, y, features, samples)
            
            estimators.append({
                'features': features,
                'samples': samples,
                'estimator': self.base_estimator.fit(new_X, new_y),
            })
            
        self.estimators = estimators
        return self
        
        
    def predict(self, X):
        preds = []
        
        for estimator in self.estimators:
            X_with_droped_features = self.__filter_only_features(X, estimator['features'])
            preds.append(estimator['estimator'].predict(X_with_droped_features))
        
        return pd.DataFrame(preds).mode().transpose()[0]

In [309]:
# Transform data
X_t = feature_processing.fit_transform(X)

# Model
model = CustomBaggingClassifier(base_estimator=DecisionTreeClassifier(
            random_state=42,
            criterion='gini',
            max_depth=4,
            min_samples_leaf=1,
        ),
        n_estimators=90,
        max_samples=0.75,
        max_features=0.72,
)

Почему следующий код падает не понимаю. :(

In [308]:
# # Обучаем на тренировочном датасете
results = cross_val_score(model, X_t, y, cv=skf)

# # Оцениваем точность на тестовом датасете
print(f"CV accuracy score: {results.mean()}")

CV accuracy score: nan


KeyError: 335

KeyError: 281

KeyError: 51



In [307]:
model.fit(X_t, y)
X_test_t = feature_processing.transform(X_test)
last_prediction = model.predict(X_test_t)
last_prediction
submission_data['Survived'] = last_prediction.astype(int)
submission_data.to_csv('./submissions/submission_bagging_custom.csv', index = False)

Получил скор на кагле 0.46172 почему такой низки не понимаю :(

## Random Forest

In [557]:
parameters = {
    'n_estimators': [100, 200, 201, 202, 203, 204, 205, 1000],
    'max_features': [4, 7, 10, 13, 14, 15, 20], 
    'min_samples_leaf': [2, 3, 5, 7], 
    'max_depth': [5,10,15,20],
}

rfc = RandomForestClassifier(random_state=42, n_jobs=-1) 
model = GridSearchCV(rfc, parameters, n_jobs=-1, cv=skf, verbose=1)
model.fit(X_t, y)

print("--------------------------------------------------------")
print("Best estimator:", model.best_estimator_)
print("--------------------------------------------------------")
print("Best score:", model.best_score_)
print("--------------------------------------------------------")

Fitting 3 folds for each of 896 candidates, totalling 2688 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 2688 out of 2688 | elapsed:  7.0min finished


--------------------------------------------------------
Best estimator: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features=20,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
--------------------------------------------------------
Best score: 0.8417508417508417
--------------------------------------------------------


In [558]:
X_test_t = feature_processing.transform(X_test)
last_prediction = model.predict(X_test_t)
submission_data['Survived'] = last_prediction.astype(int)
submission_data.to_csv('./submissions/submission_random_forest.csv', index = False)

Kagle: 0.78229

## Extra Trees

In [560]:
parameters = {
    'n_estimators': [100, 200, 201, 202, 203, 204, 205, 1000],
    'max_features': [4, 7, 10, 13, 14, 15, 20], 
    'min_samples_leaf': [2, 3, 5, 7], 
    'max_depth': [5,10,15,20],
}

etc = ExtraTreesClassifier(random_state=42, n_jobs=-1) 
model = GridSearchCV(etc, parameters, n_jobs=-1, cv=skf, verbose=1)
model.fit(X_t, y)

print("--------------------------------------------------------")
print("Best estimator:", model.best_estimator_)
print("--------------------------------------------------------")
print("Best score:", model.best_score_)
print("--------------------------------------------------------")

Fitting 3 folds for each of 896 candidates, totalling 2688 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 2688 out of 2688 | elapsed:  6.2min finished


--------------------------------------------------------
Best estimator: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=10, max_features=20,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=3, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False)
--------------------------------------------------------
Best score: 0.8316498316498316
--------------------------------------------------------


In [561]:
X_test_t = feature_processing.transform(X_test)
last_prediction = model.predict(X_test_t)
submission_data['Survived'] = last_prediction.astype(int)
submission_data.to_csv('./submissions/submission_extra_tree.csv', index = False)

Kaggle: 0.78947

## XGBoost

In [26]:
# Transform data
X_t = feature_processing.fit_transform(X)

parameters = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "auc"
}


xgb_train = xgb.DMatrix(X_t, y)

results = xgb.cv(parameters, xgb_train, num_boost_round=100,
                 folds=skf, verbose_eval=10)

[0]	train-auc:0.90263+0.00594	test-auc:0.86225+0.01217
[10]	train-auc:0.93743+0.00270	test-auc:0.87136+0.00403
[20]	train-auc:0.95242+0.00242	test-auc:0.87927+0.00480
[30]	train-auc:0.96296+0.00405	test-auc:0.88059+0.00737
[40]	train-auc:0.96916+0.00215	test-auc:0.88217+0.00929
[50]	train-auc:0.97430+0.00225	test-auc:0.88163+0.00956
[60]	train-auc:0.97987+0.00314	test-auc:0.88249+0.00809
[70]	train-auc:0.98292+0.00267	test-auc:0.88219+0.00707
[80]	train-auc:0.98576+0.00253	test-auc:0.88013+0.00801
[90]	train-auc:0.98786+0.00223	test-auc:0.87945+0.00827
[99]	train-auc:0.98974+0.00172	test-auc:0.87990+0.00908


In [28]:
parameters = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "auc",
    
    # regularization parameters
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7
}


xgb_train = xgb.DMatrix(X_t, y)

results = xgb.cv(parameters, xgb_train, num_boost_round=100,
                 folds=skf, verbose_eval=10)

[0]	train-auc:0.87799+0.00466	test-auc:0.85208+0.01454
[10]	train-auc:0.91819+0.00959	test-auc:0.87631+0.01530
[20]	train-auc:0.93048+0.00548	test-auc:0.88023+0.01787
[30]	train-auc:0.94078+0.00352	test-auc:0.87892+0.01645
[40]	train-auc:0.94887+0.00462	test-auc:0.87972+0.01822
[50]	train-auc:0.95443+0.00438	test-auc:0.87858+0.01650
[60]	train-auc:0.96124+0.00294	test-auc:0.87949+0.01828
[70]	train-auc:0.96570+0.00354	test-auc:0.88092+0.01714
[80]	train-auc:0.97015+0.00353	test-auc:0.87838+0.01637
[90]	train-auc:0.97454+0.00329	test-auc:0.87760+0.01485
[99]	train-auc:0.97754+0.00241	test-auc:0.87947+0.01548


In [30]:
parameters = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "auc",
    
    # regularization parameters
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    
    #lightgbm approach
    "tree_method": "hist"
}


xgb_train = xgb.DMatrix(X_t, y)

results = xgb.cv(parameters, xgb_train, num_boost_round=100,
                 folds=skf, verbose_eval=10)

[0]	train-auc:0.88136+0.00727	test-auc:0.86125+0.00661
[10]	train-auc:0.92087+0.00658	test-auc:0.87688+0.01655
[20]	train-auc:0.93653+0.00504	test-auc:0.87954+0.01816
[30]	train-auc:0.94617+0.00414	test-auc:0.87993+0.01936
[40]	train-auc:0.95288+0.00563	test-auc:0.88136+0.01899
[50]	train-auc:0.96094+0.00447	test-auc:0.88262+0.01853
[60]	train-auc:0.96481+0.00527	test-auc:0.88234+0.01781
[70]	train-auc:0.96804+0.00502	test-auc:0.88195+0.01741
[80]	train-auc:0.97188+0.00446	test-auc:0.88088+0.01495
[90]	train-auc:0.97619+0.00462	test-auc:0.88208+0.01391
[99]	train-auc:0.97810+0.00413	test-auc:0.88076+0.01374


In [31]:
parameters = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "auc",
    
    # regularization parameters
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    
    #lightgbm approach
    "tree_method": "hist",
    "grow_policy": "lossguide"
}


xgb_train = xgb.DMatrix(X_t, y)

results = xgb.cv(parameters, xgb_train, num_boost_round=100,
                 folds=skf, verbose_eval=10)

[0]	train-auc:0.88136+0.00727	test-auc:0.86125+0.00661
[10]	train-auc:0.92087+0.00658	test-auc:0.87688+0.01655
[20]	train-auc:0.93653+0.00504	test-auc:0.87954+0.01816
[30]	train-auc:0.94617+0.00414	test-auc:0.87993+0.01936
[40]	train-auc:0.95288+0.00563	test-auc:0.88136+0.01899
[50]	train-auc:0.96094+0.00447	test-auc:0.88262+0.01853
[60]	train-auc:0.96481+0.00527	test-auc:0.88234+0.01781
[70]	train-auc:0.96804+0.00502	test-auc:0.88195+0.01741
[80]	train-auc:0.97188+0.00446	test-auc:0.88088+0.01495
[90]	train-auc:0.97619+0.00462	test-auc:0.88208+0.01391
[99]	train-auc:0.97810+0.00413	test-auc:0.88076+0.01374


In [32]:
# instantiate the classifier 
xgb_clf = XGBClassifier(**parameters)


# fit the classifier to the training data
xgb_clf.fit(X_t, y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.1, eval_metric='auc', gamma=0,
              gpu_id=-1, grow_policy='lossguide', importance_type=None,
              interaction_constraints='', learning_rate=0.100000001,
              max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4, nthread=4,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_seed=1, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, ...)

In [33]:
X_test_t = feature_processing.transform(X_test)
last_prediction = xgb_clf.predict(X_test_t)
submission_data['Survived'] = last_prediction.astype(int)
submission_data.to_csv('./submissions/submission_xgboost.csv', index = False)