In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load the labelled training set and the unlabelled test set.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Stack both sets into one frame. Rows coming from `test` have no 'Survived'
# value, which is how later cells tell the two sets apart again.
# ignore_index=True gives the combined frame a fresh unique index; without it
# both inputs keep their own 0..n-1 labels and `data` ends up with duplicate
# index labels (label-based lookups would then return two rows).
data = pd.concat([train, test], ignore_index=True)

In [3]:
# Rows with a known 'Survived' value form the training set; the rest are the
# rows to predict.
has_label = data['Survived'].notna()

labelled_rows = data.loc[has_label]
X_train = labelled_rows.drop(columns='Survived')
y_train = labelled_rows['Survived']

X_test = data.loc[~has_label].drop(columns='Survived')

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class Feature_Enginnering(BaseEstimator, TransformerMixin):
    """Turn raw passenger rows into a one-hot-encoded, min-max-scaled matrix.

    Steps applied to every frame:

    1. derive a ``Title`` column from the name column and fold rare titles
       into common ones,
    2. fill missing ``Fare`` values with the median learned in ``fit``,
    3. one-hot encode the selected columns with ``pd.get_dummies``,
    4. align the encoded columns to the layout seen in ``fit``,
    5. rescale with the ``MinMaxScaler`` fitted in ``fit``.

    The fare median, the dummy-column layout and the scaler are learned once
    in ``fit`` and reused by ``transform``, so every frame passed through the
    same fitted instance is encoded and scaled identically.

    Parameters
    ----------
    feature : str, default 'Name'
        Column the title is extracted from. Each value must contain a comma
        followed by the title and a period (``"<surname>, <title>. <rest>"``).
    features_selected : list of str
        Columns kept for encoding; may include the derived ``'Title'``.
        The default list is never mutated by this class.

    Attributes
    ----------
    fare_median_ : float or None
        Median of ``Fare`` in the fitting frame (None if it had no ``Fare``).
    columns_ : pandas.Index
        Columns produced by ``pd.get_dummies`` on the fitting frame.
    scaler_ : MinMaxScaler
        Scaler fitted on the encoded fitting frame.
    """

    # Rare title -> common title.
    # NOTE(review): 'Lady' -> 'Ms' and 'the Countess' -> 'Mr' are kept exactly
    # as originally written; confirm these groupings are intentional.
    _TITLE_MAP = {
        'Mlle': 'Miss',
        'Mme': 'Mrs',
        'Dona': 'Mrs',
        'Lady': 'Ms',
        'Don': 'Mr',
        'Sir': 'Mr',
        'Major': 'Mr',
        'Col': 'Mr',
        'Capt': 'Mr',
        'the Countess': 'Mr',
        'Jonkheer': 'Mr',
    }

    def __init__(self, feature = 'Name', features_selected = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title']):
        self.feature = feature
        self.features_selected = features_selected

    def _encode(self, X, fare_fill):
        """Return the one-hot-encoded (unscaled) frame for ``X``."""
        X_ = X.copy()

        # extract title from name: the text between the first ',' and the next '.'
        titles = X_[self.feature].apply(lambda name: name.split(',')[1].split('.')[0].strip())
        X_['Title'] = titles.map(lambda title: self._TITLE_MAP.get(title, title))

        # The original called fillna() without assigning the result, so missing
        # fares were never actually filled.
        if fare_fill is not None and 'Fare' in X_.columns:
            X_['Fare'] = X_['Fare'].fillna(fare_fill)

        # One-Hot-Encoder
        return pd.get_dummies(X_[self.features_selected])

    def fit(self, X, y = None):
        """Learn the fare median, the dummy-column layout and the scaler from ``X``.

        Returns
        -------
        self
        """
        self.fare_median_ = X['Fare'].median() if 'Fare' in X.columns else None
        encoded = self._encode(X, self.fare_median_)
        self.columns_ = encoded.columns
        self.scaler_ = MinMaxScaler().fit(encoded)
        return self

    def transform(self, X, y = None):
        """Encode and scale ``X`` with the statistics learned in ``fit``.

        Returns
        -------
        numpy.ndarray
            One row per row of ``X``, one column per entry of ``columns_``.
        """
        if not hasattr(self, 'scaler_'):
            # Backward compatibility: transform() used to work on an unfitted
            # instance, so fall back to fitting on the frame we were given.
            self.fit(X)

        encoded = self._encode(X, self.fare_median_)
        # Categories unseen during fit are dropped; categories missing from
        # this frame become all-zero columns, keeping the layout stable.
        encoded = encoded.reindex(columns=self.columns_, fill_value=0)
        return self.scaler_.transform(encoded)

In [5]:
# Sanity-check the dimensions of the split before feature engineering.
print(f'Shape_X_train: {X_train.shape}')
print(f'Shape_y_train: {y_train.shape}')
print(f'Shape_X_test: {X_test.shape}')

Shape_X_train: (891, 11)
Shape_y_train: (891,)
Shape_X_test: (418, 11)


In [6]:
# Run both frames through the same transformer instance: fit_transform on the
# training rows first, then transform on the rows to predict.
# Note: this rebinds X_train / X_test to the transformed arrays, so the cell
# cannot be re-run without re-running the split cell first.
fe = Feature_Enginnering()
X_train, X_test = fe.fit_transform(X_train), fe.transform(X_test)

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier

# Candidate models keyed by display name; insertion order is evaluation order.
classifier = {
    'RandomForest': RandomForestClassifier(random_state=1),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=1),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=1),
    'XGBoostClassifer': XGBClassifier(random_state=1),
}
classifer_name = list(classifier)

# Mean 5-fold cross-validated accuracy of each model behind a KNN imputer.
acc = []
for name, model in classifier.items():
    pipe = make_pipeline(KNNImputer(n_neighbors=5), model)
    fold_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
    acc.append(fold_scores.mean())

# Best-scoring model first.
leaderboard = pd.DataFrame({'Classifer': classifer_name, 'Accuracy': acc})
leaderboard.sort_values(by='Accuracy', ascending=False)



Unnamed: 0,Classifer,Accuracy
1,AdaBoostClassifier,0.827217
2,GradientBoostingClassifier,0.821562
3,XGBoostClassifer,0.81708
0,RandomForest,0.802473


In [11]:
# Build the imputer + AdaBoost pipeline and list its parameters; the
# '<step>__<param>' keys are the names used in the search grid.
imputer_step = KNNImputer()
booster_step = AdaBoostClassifier(random_state=1)
pipe = make_pipeline(imputer_step, booster_step)
pipe.get_params()

{'memory': None,
 'steps': [('knnimputer', KNNImputer()),
  ('adaboostclassifier', AdaBoostClassifier(random_state=1))],
 'verbose': False,
 'knnimputer': KNNImputer(),
 'adaboostclassifier': AdaBoostClassifier(random_state=1),
 'knnimputer__add_indicator': False,
 'knnimputer__copy': True,
 'knnimputer__metric': 'nan_euclidean',
 'knnimputer__missing_values': nan,
 'knnimputer__n_neighbors': 5,
 'knnimputer__weights': 'uniform',
 'adaboostclassifier__algorithm': 'SAMME.R',
 'adaboostclassifier__base_estimator': None,
 'adaboostclassifier__learning_rate': 1.0,
 'adaboostclassifier__n_estimators': 50,
 'adaboostclassifier__random_state': 1}

In [12]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate values for the randomized hyper-parameter search, keyed by the
# pipeline's '<step>__<param>' names.
parameters = {
    'knnimputer__n_neighbors': [5, 11, 15],
    'adaboostclassifier__n_estimators': [100, 500, 1000],
    'adaboostclassifier__learning_rate': [0.01, 0.05, 0.1, 0.3]
}

pipe = make_pipeline(KNNImputer(), AdaBoostClassifier(random_state = 1))

# random_state seeds the sampling of candidate combinations; without it a
# different subset of the grid is tried on every run and best_params_ is not
# reproducible. cv and scoring are spelled out to match the other
# cross-validation cells in this notebook.
clf = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    cv=5,
    scoring='accuracy',
    random_state=1,
)
clf.fit(X_train, y_train)

RandomizedSearchCV(estimator=Pipeline(steps=[('knnimputer', KNNImputer()),
                                             ('adaboostclassifier',
                                              AdaBoostClassifier(random_state=1))]),
                   param_distributions={'adaboostclassifier__learning_rate': [0.01,
                                                                              0.05,
                                                                              0.1,
                                                                              0.3],
                                        'adaboostclassifier__n_estimators': [100,
                                                                             500,
                                                                             1000],
                                        'knnimputer__n_neighbors': [5, 11, 15]})

In [13]:
# Show the best hyper-parameter combination found by the fitted randomized search.
clf.best_params_

{'knnimputer__n_neighbors': 11,
 'adaboostclassifier__n_estimators': 1000,
 'adaboostclassifier__learning_rate': 0.3}

In [17]:
# Re-score the pipeline using the hyper-parameters selected by the search
# (learning_rate=0.3, n_estimators=1000, n_neighbors=11).
# random_state=1 matches every other estimator in this notebook; it was
# missing here, leaving the final model unseeded.
adb = AdaBoostClassifier(learning_rate=0.3, n_estimators = 1000, random_state = 1)
pipe = make_pipeline(KNNImputer(n_neighbors=11), adb)
cross_val_score(pipe, X_train, y_train, cv = 5, scoring = 'accuracy').mean()

0.8260686711443098

In [18]:
# Train the pipeline on all training rows, then predict the unlabelled rows.
fitted_pipe = pipe.fit(X_train, y_train)
pred = fitted_pipe.predict(X_test)

In [19]:
# Assemble the submission table: one row per test passenger with the
# predicted label cast to integer.
passenger_ids = test['PassengerId']
survived_int = pred.astype(int)
results = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': survived_int})
results

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [20]:
# Write the submission table to results.csv, leaving out the DataFrame index column.
results.to_csv('results.csv', index=False)