In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


# Import Data

### Trial 1: import from tfds

In [2]:
def getdata_fromtfds():
    
    import tensorflow_datasets as tfds
    ds = tfds.load('titanic', split='train', as_supervised=True)
    
    # Convert from tfds to df
    df_raw = pd.DataFrame()
    for ds_row in list(ds):
        row = ds_row[0]
        row['Survived'] = ds_row[1]
        df_raw = df_raw.append(row, ignore_index=True)

    # Convert from tensor object to numpy
    for col in df_raw:
        df_raw[col] = df_raw[col].apply(lambda x: x.numpy())
        if col in ['cabin', 'home.dest', 'name', 'ticket', 'boat']:
            df_raw[col] = df_raw[col].apply(lambda x: x.decode())

    df_raw.columns = df_raw.columns.str.capitalize()
    df_raw.Pclass += 1
    df_raw.Sex.replace([0,1], ['male', 'female'], inplace=True)
    df_raw.Cabin.replace('Unknown', np.nan, inplace=True)
    df_raw.Age.replace(-1, np.nan, inplace=True)
    df_raw.Fare.replace(-1, np.nan, inplace=True)
    df_raw.Embarked.replace([0,1,2,3], ['C', 'Q', 'S', None], inplace=True)
    df_raw.rename(columns={'Sibsp':'SibSp'}, inplace=True)

    df_raw.drop(['Boat', 'Home.dest', 'Body'], axis=1, inplace=True)
    
    return df_raw

In [3]:
# df_raw = getdata_fromtfds()
df_raw = pd.read_csv('/kaggle/input/titanic/train.csv')

In [4]:
df_raw.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Pre-process Data


In [5]:
def preprocessor_0(df_raw):
    
    df = df_raw.copy()
    
    if 'PassengerId' in df_raw:
        df.drop('PassengerId', axis=1, inplace=True)
    
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
    df['FamilyCat'] = np.where(df['FamilySize']==1, 'Singleton', None)
    df['FamilyCat'] = np.where((2 <= df['FamilySize']) & (df['FamilySize'] <= 4), 'SmallFamily', df['FamilyCat'])
    df['FamilyCat'] = np.where(5 <= df['FamilySize'], 'LargeFamily', df['FamilyCat'])
    
    df['Embarked'].fillna('S', inplace=True)
    
    df['Cabin'].fillna('T', inplace=True)
    
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    
    df['Cabin N'] = df.Cabin.str.count(' ') + 1.0
    df['Cabin Deck'] = df.Cabin.str.slice(0,1)
    df['Cabin Room'] = df.Cabin.str.slice(1,5).str.extract("([0-9]+)", expand=False).astype("float")
    
    df["Title"] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'].replace('Mlle', 'Miss', inplace=True)
    df['Title'].replace('Ms', 'Miss', inplace=True)
    df['Title'].replace('Mme', 'Mrs', inplace=True)    
    df['Title'].replace(["Capt","Don","Major","Dr","Rev", "Col"], 'Officer', inplace=True)
    df['Title'].replace(["Jonkheer","Don", "Dona", "Sir", "Countess", "Lady"], 'Royalty', inplace=True)
    
    grouped_median_train = df.groupby(['Sex', 'Pclass', 'Title']).median().reset_index()[['Sex', 'Pclass', 'Title', 'Age']]
    def fill_age(row):
        condition = (
            (grouped_median_train['Sex']==row['Sex']) \
            & (grouped_median_train['Title']==row['Title']) \
            & (grouped_median_train['Pclass']==row['Pclass'])
        )
        if np.isnan(grouped_median_train[condition]['Age'].values[0]):
            condition = (
                (grouped_median_train['Sex']==row['Sex']) \
                & (grouped_median_train['Pclass']==row['Pclass'])
            )
        return grouped_median_train[condition]['Age'].values[0]
    df['Age'] = df.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis = 1)
    
    df["Surname"] = df.Name.str.split(',').str.get(0)
    df['SurnameFreq']=df.groupby('Surname')['Surname'].transform('count')
    df['TicketFreq']=df.groupby('Ticket')['Ticket'].transform('count')
    df['CustomizedFare']=df.Fare/(df.TicketFreq*df.Pclass)
    
    df.drop(['Name', 'Cabin', 'Surname', 'Ticket'], axis=1, inplace=True)
    df = df[df.columns.sort_values()]
    
    return df

In [6]:
df = preprocessor_0(df_raw)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             891 non-null    float64
 1   Cabin Deck      891 non-null    object 
 2   Cabin N         891 non-null    float64
 3   Cabin Room      200 non-null    float64
 4   CustomizedFare  891 non-null    float64
 5   Embarked        891 non-null    object 
 6   FamilyCat       891 non-null    object 
 7   FamilySize      891 non-null    int64  
 8   Fare            891 non-null    float64
 9   Parch           891 non-null    int64  
 10  Pclass          891 non-null    int64  
 11  Sex             891 non-null    object 
 12  SibSp           891 non-null    int64  
 13  SurnameFreq     891 non-null    int64  
 14  Survived        891 non-null    int64  
 15  TicketFreq      891 non-null    int64  
 16  Title           891 non-null    object 
dtypes: float64(5), int64(7), object(5)


## Train-dev split

In [8]:
from sklearn.model_selection import train_test_split

df_X = df.drop('Survived', axis=True)
df_y = df['Survived']

X_train, X_dev, y_train, y_dev = train_test_split(df_X, df_y, test_size=0.1, random_state=9527, stratify=df_y)
print('df is split. Train X, y shapes = {}, {}. Dev X, y shapes = {}, {}'.format(X_train.shape, y_train.shape, X_dev.shape, y_dev.shape))

df is split. Train X, y shapes = (801, 16), (801,). Dev X, y shapes = (90, 16), (90,)


## sklearn pipeline building

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ['Age', 'Fare', 'SibSp', 'Parch', 'Pclass'
#                     , 'Cabin N', 'Cabin Room', 'SurnameFreq', 'TicketFreq', 'CustomizedFare'
                    , 'FamilySize']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
    , ('logtransformer', FunctionTransformer(np.log1p))
    , ('scaler', StandardScaler())
])

categorical_features = ['Embarked', 'Sex', 'Cabin Deck', 'Title', 'FamilyCat']
# categorical_features = ['Sex', 'Pclass', 'Cabin Deck', 'Title']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_1 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [10]:
X_train = preprocessor_1.fit_transform(X_train)
X_dev = preprocessor_1.transform(X_dev)

print('X is transformed by sklearn preprocessor. Train X, y shapes = {}, {}. Dev X, y shapes = {}, {}'.format(X_train.shape, y_train.shape, X_dev.shape, y_dev.shape))

X is transformed by sklearn preprocessor. Train X, y shapes = (801, 28), (801,). Dev X, y shapes = (90, 28), (90,)


# Model Lab

In [11]:
models = []

from xgboost import XGBClassifier

param = {'objective':'binary:logistic', 'n_estimators': 20, 'n_jobs': 4
         , 'max_depth': 50, 'learning_rate': 0.3, 'reg_lambda': 0.01
         , 'gamma': 2, 'max_delta_step': 1, 'min_child_weight': 1
         , 'colsample_bytree': 0.65, 'subsample': 0.9, 'base_score': 0.5
        }
models.append(XGBClassifier(**param))

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
param = {'n_estimators': 180, 'min_samples_split': 3
#          , 'min_samples_leaf': 4, 'max_depth': 32, 'bootstrap': True
         , 'max_features': 0.5
        }
models.append(RandomForestClassifier(**param))
models.append(AdaBoostClassifier())
models.append(GradientBoostingClassifier())

from sklearn.tree import DecisionTreeClassifier
models.append(DecisionTreeClassifier())

from sklearn.neighbors import KNeighborsClassifier
models.append(KNeighborsClassifier())

from sklearn.svm import SVC
models.append(SVC(probability=True))
 
from sklearn.naive_bayes import GaussianNB
models.append(GaussianNB())

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
models.append(LinearDiscriminantAnalysis())
models.append(QuadraticDiscriminantAnalysis())

from sklearn.linear_model import LogisticRegression
models.append(LogisticRegression())

print('Models list imported. with length = {}'.format(len(models)))

Models list imported. with length = 11


In [12]:
from sklearn.metrics import accuracy_score
df_acc = pd.DataFrame()

for model in models:
    model_name = model.__class__.__name__
    print('Running {}...'.format(model_name))
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    acc_train = accuracy_score(y_train, y_train_pred)
    y_dev_pred = model.predict(X_dev)
    acc_dev = accuracy_score(y_dev, y_dev_pred)
    
    df_acc = df_acc.append({'name': model_name, 'acc_dev': acc_dev, 'acc_train': acc_train}, ignore_index=True)
        
print(df_acc)

Running XGBClassifier...
Running RandomForestClassifier...
Running AdaBoostClassifier...
Running GradientBoostingClassifier...
Running DecisionTreeClassifier...
Running KNeighborsClassifier...
Running SVC...
Running GaussianNB...
Running LinearDiscriminantAnalysis...
Running QuadraticDiscriminantAnalysis...
Running LogisticRegression...
     acc_dev  acc_train                           name
0   0.833333   0.928839                  XGBClassifier
1   0.811111   0.981273         RandomForestClassifier
2   0.788889   0.850187             AdaBoostClassifier
3   0.822222   0.911361     GradientBoostingClassifier
4   0.777778   0.988764         DecisionTreeClassifier
5   0.788889   0.865169           KNeighborsClassifier
6   0.755556   0.841448                            SVC
7   0.766667   0.806492                     GaussianNB
8   0.766667   0.843945     LinearDiscriminantAnalysis
9   0.677778   0.685393  QuadraticDiscriminantAnalysis
10  0.766667   0.841448             LogisticRegression




# Modelling

In [13]:
def getModel():
    from xgboost import XGBClassifier
    
#     param = {'objective':'binary:logistic', 'n_estimators': 20, 'n_jobs': 4
#              , 'max_depth': 50, 'learning_rate': 0.3, 'reg_lambda': 0.01
#              , 'gamma': 2, 'max_delta_step': 1, 'min_child_weight': 1
#              , 'colsample_bytree': 0.65, 'subsample': 0.9, 'base_score': 0.5
#             }
#     model = XGBClassifier(**param)
    
    from sklearn.ensemble import RandomForestClassifier
    param = {'n_estimators': 180, 'min_samples_split': 3
    #          , 'min_samples_leaf': 4, 'max_depth': 32, 'bootstrap': True
             , 'max_features': 0.5
            }  
    model = RandomForestClassifier(**param)
    
#     from sklearn.tree import DecisionTreeClassifier
#     model = DecisionTreeClassifier()
    
    return model

model = getModel()

### Fine-tune the best parameter set

In [14]:
# from sklearn.model_selection import GridSearchCV

# grid = {'n_estimators':[20]
#         , 'max_depth':[50]
#         , 'learning_rate':[0.3]
#         , 'reg_lambda':[0.01]
#         , 'gamma':[2]
#         , 'max_delta_step':[0,1,2]
#         , 'min_child_weight':[1,2,3]
#         , 'colsample_bytree':[0.55,0.65]
#         , 'subsample':[1,0.9,0.8]
#         , 'base_score':[0.5]        
#        }

# search = GridSearchCV(estimator=model, param_grid=grid, scoring='accuracy', n_jobs=4, refit=True)
# search.fit(X_train, y_train)

# print(search.best_params_)
# print(search.best_score_)

### Model Conclusion

In [15]:
model.fit(X_train, y_train
#           , eval_set=[(X_train, y_train), (X_dev, y_dev)]
#           , eval_metric=['error', 'logloss'], verbose=True
         )

RandomForestClassifier(max_features=0.5, min_samples_split=3, n_estimators=180)

In [16]:
from sklearn.metrics import accuracy_score

y_train_pred = model.predict(X_train)
print('Accuracy of train data:', accuracy_score(y_train, y_train_pred))
y_dev_pred = model.predict(X_dev)
print('Accuracy of train data:', accuracy_score(y_dev, y_dev_pred))

Accuracy of train data: 0.9850187265917603
Accuracy of train data: 0.8222222222222222


# Predict and Output

In [17]:
df_submit_raw = pd.read_csv('/kaggle/input/titanic/test.csv')
df_submit = preprocessor_0(df_submit_raw)
X_submit = preprocessor_1.transform(df_submit)

df_submit_final = pd.DataFrame(df_submit_raw['PassengerId'])
df_submit_final['Survived'] = pd.Series(model.predict(X_submit))

In [18]:
df_submit_final.to_csv('/kaggle/working/submission.csv', index=False)