<a href="https://www.kaggle.com/code/handikanurichsan/titanic?scriptVersionId=289331373" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# The sub-directory list is bound to a throwaway name rather than `_`, which
# IPython uses for the most recent cell output.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Load Data

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Read both Kaggle CSVs, using the first column of each file as the row index.
xtrain = pd.read_csv("/kaggle/input/titanic/train.csv", index_col=0)
xtest = pd.read_csv("/kaggle/input/titanic/test.csv", index_col=0)

# Split the training frame into features (X) and target (y).
# drop() returns a new frame, so xtrain itself still contains 'Survived'.
X = xtrain.drop(columns='Survived')
y = xtrain['Survived']

# Split columns by data type

In [3]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
num_cols, cat_cols = (
    X.select_dtypes(exclude='object').columns,
    X.select_dtypes(include='object').columns,
)

# Numerical columns first, then categorical ones (used for inspection only).
fullcols = num_cols.append(cat_cols)

In [4]:
# Print the column partitions and the layout of the raw and feature frames,
# each section followed by a 50-character divider.
divider = '-' * 50

for column_group in (num_cols, cat_cols, fullcols):
    print(column_group)
    print(divider)

# The shape printed here is that of xtrain (which still includes the
# 'Survived' target column), so the label names xtrain rather than X.
print(f'\nxtrain shape: {xtrain.shape}\n')
print(divider)
print(f'xtrain columns: {xtrain.columns}')
print(divider)
print(f'x columns: {X.columns}')

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
--------------------------------------------------
Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')
--------------------------------------------------
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Name', 'Sex', 'Ticket',
       'Cabin', 'Embarked'],
      dtype='object')
--------------------------------------------------

x shape: (891, 11)

--------------------------------------------------
xtrain columns: Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')
--------------------------------------------------
x columns: Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')


In [5]:
# Show the numerical and categorical column groups, one per line.
print(num_cols, cat_cols, sep='\n')

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')


In [6]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Count the missing values in each feature column.
X.isna().sum()

Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [8]:
# Summarise each feature column's dtype and non-null count.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 76.6+ KB


# Pipeline and Transformer

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Numerical features: fill missing values with the column median.
numericalTransformer = SimpleImputer(strategy='median')

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode. Categories unseen during fit are ignored at transform.
# NOTE(review): cat_cols includes the near-unique 'Name', 'Ticket' and 'Cabin'
# columns, and 'Cabin' is mostly missing; consider dropping or engineering
# these instead of one-hot encoding them as-is.
categoricalTransformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numericalTransformer, num_cols),
        ('cat', categoricalTransformer, cat_cols),
    ]
)

# Gradient-boosted tree classifier; the seed is fixed for reproducibility.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
    eval_metric='logloss',
)

# Full pipeline: preprocessing followed by the classifier.
myPipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBClassifier(**xgb_params)),
    ]
)

# Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy of the full pipeline; the preprocessing is
# part of the estimator, so it is re-fitted on each training split.
scores = cross_val_score(
    estimator=myPipeline,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)

In [11]:
# Report the per-fold accuracies and their mean, each followed by a divider.
divider = '-' * 50

print("CV scores:", scores)
print(divider)
print("Mean CV Accuracy:", scores.mean())
print(divider)


CV scores: [0.78212291 0.80898876 0.87640449 0.82022472 0.85393258]
--------------------------------------------------
Mean CV Accuracy: 0.8283346933651371
--------------------------------------------------


# Create Prediction and Submission

In [12]:
# Fit the pipeline on every labelled row, then predict those same rows.
# NOTE: the score below is in-sample (training) accuracy, so it is an
# optimistic figure; the cross-validation mean is the out-of-sample estimate.
preds = myPipeline.fit(X, y).predict(X)

train_accuracy = accuracy_score(y, preds)
print('Accuracy:', train_accuracy)

Accuracy: 0.920314253647587


In [13]:
# Predict labels for the rows loaded from test.csv using the fitted pipeline.
prediction = myPipeline.predict(xtest)

# Submission

In [14]:
# Build the submission table: one row per test passenger, with the passenger
# id (the index of xtest) moved into a 'PassengerId' column beside the
# predicted 'Survived' label.
output = (
    pd.DataFrame({'Survived': prediction}, index=xtest.index)
    .rename_axis('PassengerId')
    .reset_index()
)

# Write the table to the working directory without the positional row index.
output.to_csv('submission.csv', index=False)