In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

from xgboost import XGBClassifier

from scipy import stats

%matplotlib inline

In [3]:
trainCsvFilepath = r'../input/titanic/train.csv'
train = pd.read_csv(trainCsvFilepath)

testCsvFilepath = r'../input/titanic/test.csv'
test = pd.read_csv(testCsvFilepath)

In [34]:
merged = pd.concat([train, test], axis=0, sort=False)

In [35]:
merged.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,nRelatives,Ticket2,Cabin2,title
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,unknown,S,2.0,9.0,7.0,Mr
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2.0,8.0,3.0,Mrs
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,unknown,S,1.0,16.0,7.0,Miss
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2.0,6.0,4.0,Mrs
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,unknown,S,1.0,6.0,7.0,Mr


In [23]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1223 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          982 non-null object
Embarked       1309 non-null object
nRelatives     891 non-null float64
Ticket2        891 non-null float64
Cabin2         891 non-null float64
title          891 non-null object
dtypes: float64(6), int64(4), object(6)
memory usage: 173.9+ KB


In [5]:
merged.dtypes.sort_values()

PassengerId      int64
Pclass           int64
SibSp            int64
Parch            int64
Survived       float64
Age            float64
Fare           float64
Name            object
Sex             object
Ticket          object
Cabin           object
Embarked        object
dtype: object

In [6]:
nulls = merged.isnull().sum()
nulls[nulls > 0]

Survived     418
Age          263
Fare           1
Cabin       1014
Embarked       2
dtype: int64

# Create new features

# Feature Engineering

In [30]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    def __init__(self, featuresToDrop):
        self.featuresToDrop = featuresToDrop
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop(columns=self.featuresToDrop)

In [31]:
class ImputeAge(BaseEstimator, TransformerMixin):
    def __init__(self):
        return
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['title'] = X.Name.apply(lambda x: x.split('.')[0].split(',')[1].strip())

        newtitles={
            "Capt":       "Officer",
            "Col":        "Officer",
            "Major":      "Officer",
            "Jonkheer":   "Royalty",
            "Don":        "Royalty",
            "Sir" :       "Royalty",
            "Dr":         "Officer",
            "Rev":        "Officer",
            "the Countess":"Royalty",
            "Dona":       "Royalty",
            "Mme":        "Mrs",
            "Mlle":       "Miss",
            "Ms":         "Mrs",
            "Mr" :        "Mr",
            "Mrs" :       "Mrs",
            "Miss" :      "Miss",
            "Master" :    "Master",
            "Lady" :      "Royalty"
        }

        X.title = X.title.map(newtitles)

        meanAgeOfGroup = X.groupby(['title', 'Sex']).Age.mean()

        def newAge(row):
            title = row[0]
            Sex = row[1]
            Age = row[2]

            if pd.isnull(Age):
                return meanAgeOfGroup.loc[(title, Sex)]
            else:
                return Age

        X.Age = X[['title', 'Sex', 'Age']].apply(newAge, axis=1)

        return X

In [36]:
class CustomTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X:pd.DataFrame, y=None):
        X.Fare = X['Fare'].fillna(X['Fare'].mean())
        X.Cabin = X['Cabin'].fillna('unknown')
        X.Embarked = X['Embarked'].fillna(X['Embarked'].mode()[0])
        
        X['nRelatives'] = X.SibSp + X.Parch + 1
        X['Ticket2'] = X.Ticket.apply(lambda x: len(x))
        X['Cabin2'] = X.Cabin.apply(lambda x: x[0])
        
        return X

In [37]:
featureEngPipeline = Pipeline(steps=[
    ('customTransform', CustomTransform()),
    ('imputeAge', ImputeAge()),
    ('dropFeatures', FeatureDropper(['PassengerId','Name','SibSp','Parch','Ticket','Cabin', 'title'])),
])

featureEngPipeline.fit_transform(train).head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,nRelatives,Ticket2,Cabin2
0,0,3,male,22.0,7.25,S,2,9,u
1,1,1,female,38.0,71.2833,C,2,8,C
2,1,3,female,26.0,7.925,S,1,16,u
3,1,1,female,35.0,53.1,S,2,6,C
4,0,3,male,35.0,8.05,S,1,6,u


# Transform pipeline

In [44]:
transformPipeline = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked', 'Cabin2', 'Pclass']),
    remainder='passthrough'
)

# Try models

In [45]:
xTrain = train.drop(columns=['Survived'])
yTrain = train['Survived']
xTest = test

## Random Forest

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

fullPipeline = Pipeline(steps=[
    ('featureEngPipeline', featureEngPipeline),
    ('transformPipeline', transformPipeline),
    ('model', rf)
])

param_dist = {'model__n_estimators':[10, 100],
             'model__max_depth':[3, 4, 5, 6],
             'model__criterion':['gini','entropy']}

PRF=[{'model__n_estimators':[10,100],
      'model__max_depth':[3,6],
      'model__criterion':['gini','entropy']}]
RSRF=RandomizedSearchCV(estimator=fullPipeline, param_distributions=param_dist, scoring='accuracy', cv=2, n_jobs=-1, random_state=42)
scores = cross_val_score(RSRF, xTrain, yTrain, scoring='accuracy', cv=5);

scores.mean()

## XGBoost

In [48]:
xgb = XGBClassifier(random_state=42, n_jobs=-1)

fullPipeline = Pipeline(steps=[
    ('featureEngPipeline', featureEngPipeline),
    ('transformPipeline', transformPipeline),
    ('model', xgb)
])

one_to_left = stats.beta(10, 1)  
from_zero_positive = stats.expon(0, 50)
param_dis = {  
    "model__n_estimators": stats.randint(40, 200),
    "model__max_depth": stats.randint(3, 14),
    "model__learning_rate": stats.uniform(0.05, 0.4),
    #"colsample_bytree": one_to_left,
    "model__subsample": one_to_left,
    "model__gamma": stats.uniform(0, 10),
    #'model__reg_alpha': from_zero_positive,
    "model__min_child_weight": from_zero_positive,
}

RSXGB = RandomizedSearchCV(fullPipeline, param_dis, scoring='accuracy', random_state=42, n_jobs=-1, cv=5)
scores = cross_val_score(RSXGB, xTrain, yTrain, scoring='accuracy', cv=5);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#

In [49]:
scores.mean()

0.8350242057290144

## SVM

svm = SVC(random_state=42)

fullPipeline = Pipeline(steps=[
    ('featureEngPipeline', featureEngPipeline),
    ('transformPipeline', transformPipeline),
    ('standardScaler', StandardScaler()),
    ('model', svm)
])

r=[0.0001,0.001,0.1,1,10,50,100]

PSVM=[{'model__C':r, 'model__kernel':['linear']},
      {'model__C':r, 'model__gamma':r, 'model__kernel':['rbf']}]

GSSVM=GridSearchCV(estimator=fullPipeline, param_grid=PSVM, scoring='accuracy', n_jobs=-1, cv=2)
scores_svm=cross_val_score(GSSVM, xTrain, yTrain, scoring='accuracy', cv=5)

scores_svm.mean()

# Submission

In [None]:
model=RSXGB.fit(xTrain, yTrain)
pred=model.predict(xTest)
output=pd.DataFrame({'PassengerId':xTest['PassengerId'],'Survived':pred})
output.to_csv('submission.csv', index=False)