In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from scipy import stats

%matplotlib inline

In [3]:
trainCsvFilepath = r'../input/titanic/train.csv'
trainData = pd.read_csv(trainCsvFilepath)

testCsvFilepath = r'../input/titanic/test.csv'
testData = pd.read_csv(testCsvFilepath)

# Custom Pipeline

In [5]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    def __init__(self, featuresToDrop):
        self.featuresToDrop = featuresToDrop
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop(columns=self.featuresToDrop)

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(trainData.drop(columns=['Survived']), trainData['Survived'], test_size=0.2)

# Wrangling

In [13]:
pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())

preprocessor = make_column_transformer(
    (pipeline, ['Embarked']),
    (OneHotEncoder(), ['Sex']),
    #(OrdinalEncoder(), ['Cabin']),
    remainder='passthrough'
)

In [27]:
models = [
    #RandomForestClassifier(n_estimators=1, random_state=42),
    XGBClassifier(n_jobs=-1),
    #SGDClassifier()
]

In [28]:
my_pipelines = []
for model in models:
    my_pipelines.append(Pipeline(steps=[
        ('drop', FeatureDropper(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Age'])),
        ('preprocessor', preprocessor),
        ('model', model)
    ]))

In [29]:
for idx, pipeline in enumerate(my_pipelines):
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_valid)
    print(accuracy_score(y_valid, pred))

0.7541899441340782


In [30]:
my_pipeline = my_pipelines[0]

In [31]:
my_pipeline.fit(X_train, y_train);

In [38]:
preds = gridSearch.predict(X_valid)

In [39]:
accuracy_score(y_valid, preds)

0.7597765363128491

In [34]:
param_grid = {'model__n_estimators':[50, 90, 100, 110, 200],
             'model__max_depth':[3, 4, 5],
             'model__learning_rate':[0.5, 0.1, 0.01, 0.001]}

gridSearch = GridSearchCV(my_pipeline, param_grid, scoring='accuracy', n_jobs=-1)
gridSearch.fit(X_train, y_train)
print(f'best score={gridSearch.best_score_}')
print(f'best params={gridSearch.best_params_}')



best score=0.8188202247191011
best params={'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 50}


In [37]:

# param_dist = {'model__n_estimators': stats.randint(100, 200),
#               'model__learning_rate': stats.uniform(0.01, 0.6),
#               'model__subsample': stats.uniform(0.3, 0.9),
#               'model__max_depth': [3, 4, 5, 6, 7, 8, 9],
#               'model__colsample_bytree': stats.uniform(0.5, 0.9),
#               'model__min_child_weight': [1, 2, 3, 4]
#              }

one_to_left = stats.beta(10, 1)  
from_zero_positive = stats.expon(0, 50)
param_dis = {  
    "model__n_estimators": stats.randint(40, 200),
    "model__max_depth": stats.randint(3, 6),
    "model__learning_rate": stats.uniform(0.05, 0.4),
    #"colsample_bytree": one_to_left,
    "model__subsample": one_to_left,
    "model__gamma": stats.uniform(0, 10),
    #'model__reg_alpha': from_zero_positive,
    "model__min_child_weight": from_zero_positive,
}

gridSearch = RandomizedSearchCV(my_pipeline, param_distributions=param_dis, scoring='accuracy', n_jobs=-1)
gridSearch.fit(X_train, y_train)
print(f'best score={gridSearch.best_score_}')
print(f'best params={gridSearch.best_params_}')



best score=0.824438202247191
best params={'model__gamma': 2.047218209284222, 'model__learning_rate': 0.07505267774542453, 'model__max_depth': 5, 'model__min_child_weight': 4.088648983867801, 'model__n_estimators': 188, 'model__subsample': 0.9841390352031275}


In [82]:
X_test = cleanupMakeCopy(testData)

In [83]:
y_preds = gridSearch.predict(X_test)

In [60]:
# Save predictions in format used for competition scoring
output = pd.DataFrame({'PassengerId': testData.PassengerId,
                       'Survived': y_preds})
output.to_csv('submission.csv', index=False)