# Download Data & Initialize

In [21]:
import pandas as pd
import numpy as np

In [9]:
# Run this if you need to install kaggle API
# !pip install kaggle

In [None]:
kaggle_name = input('Input your Kaggle API Username:')
kaggle_key = input('Input your Kaggle API key:')

!set KAGGLE_USERNAME=$kaggle_name
!set KAGGLE_KEY=$kaggle_key

In [12]:
# try to download datasets
!kaggle competitions download -c titanic

Downloading titanic.zip to D:\kyalan\Documents\GitHub\kaggle\02_Titanic




  0%|          | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████| 34.1k/34.1k [00:00<00:00, 36.0MB/s]


In [18]:
import os
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

.\02-titanic.ipynb
.\titanic.zip
.\.ipynb_checkpoints\02-titanic-checkpoint.ipynb


In [19]:
# unzip to extract data
import zipfile
with zipfile.ZipFile('titanic.zip', 'r') as zip_ref:
    zip_ref.extractall('titanic/input')

# Import Data

### Data Source 1: import from Kaggle file

In [43]:
df_raw_1 = pd.read_csv(r'.\titanic\input\train.csv')
df_raw_1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Data Source 2: import from tfds

In [28]:
def getdata_fromtfds():
    
    import tensorflow_datasets as tfds
    ds = tfds.load('titanic', split='train', as_supervised=True)
    
    # Convert from tfds to df
    df_raw = pd.DataFrame()
    for ds_row in list(ds):
        row = ds_row[0]
        row['Survived'] = ds_row[1]
        df_raw = df_raw.append(row, ignore_index=True)

    # Convert from tensor object to numpy
    for col in df_raw:
        df_raw[col] = df_raw[col].apply(lambda x: x.numpy())
        if col in ['cabin', 'home.dest', 'name', 'ticket', 'boat']:
            df_raw[col] = df_raw[col].apply(lambda x: x.decode())

    df_raw.columns = df_raw.columns.str.capitalize()
    df_raw.Pclass += 1
    df_raw.Sex.replace([0,1], ['male', 'female'], inplace=True)
    df_raw.Cabin.replace('Unknown', np.nan, inplace=True)
    df_raw.Age.replace(-1, np.nan, inplace=True)
    df_raw.Fare.replace(-1, np.nan, inplace=True)
    df_raw.Embarked.replace([0,1,2,3], ['C', 'Q', 'S', None], inplace=True)
    df_raw.rename(columns={'Sibsp':'SibSp'}, inplace=True)

    df_raw.drop(['Boat', 'Home.dest', 'Body'], axis=1, inplace=True)
    
    return df_raw

In [41]:
df_raw_2 = getdata_fromtfds()
df_raw_2.head()

Unnamed: 0,Survived,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket
0,0,30.0,,S,13.0,"McCrie, Mr. James Matthew",0,2,male,0,233478
1,0,37.0,,S,7.925,"Gustafsson, Mr. Anders Vilhelm",0,3,male,2,3101276
2,1,28.0,,S,13.0,"Reynaldo, Ms. Encarnacion",0,2,female,0,230434
3,0,18.0,,S,73.5,"Davies, Mr. Charles Henry",0,2,male,0,S.O.C. 14879
4,0,,,C,7.8958,"Gheorgheff, Mr. Stanio",0,3,male,0,349254


In [44]:
df_raw = pd.concat([df_raw_1, df_raw_2])
df_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2.0,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3.0,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4.0,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5.0,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2200 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Survived     2200 non-null   int64  
 2   Pclass       2200 non-null   int64  
 3   Name         2200 non-null   object 
 4   Sex          2200 non-null   object 
 5   Age          1760 non-null   float64
 6   SibSp        2200 non-null   int64  
 7   Parch        2200 non-null   int64  
 8   Ticket       2200 non-null   object 
 9   Fare         2199 non-null   float64
 10  Cabin        499 non-null    object 
 11  Embarked     2196 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 223.4+ KB


# Pre-process Data


In [46]:
def preprocessor_0(df_raw):
    
    df = df_raw.copy()
    
    if 'PassengerId' in df_raw:
        df.drop('PassengerId', axis=1, inplace=True)
    
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
    df['FamilyCat'] = np.where(df['FamilySize']==1, 'Singleton', None)
    df['FamilyCat'] = np.where((2 <= df['FamilySize']) & (df['FamilySize'] <= 4), 'SmallFamily', df['FamilyCat'])
    df['FamilyCat'] = np.where(5 <= df['FamilySize'], 'LargeFamily', df['FamilyCat'])
    
    df['Embarked'].fillna('S', inplace=True)
    
    df['Cabin'].fillna('T', inplace=True)
    
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    
    df['Cabin N'] = df.Cabin.str.count(' ') + 1.0
    df['Cabin Deck'] = df.Cabin.str.slice(0,1)
    df['Cabin Room'] = df.Cabin.str.slice(1,5).str.extract("([0-9]+)", expand=False).astype("float")
    
    df["Title"] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'].replace('Mlle', 'Miss', inplace=True)
    df['Title'].replace('Ms', 'Miss', inplace=True)
    df['Title'].replace('Mme', 'Mrs', inplace=True)    
    df['Title'].replace(["Capt","Don","Major","Dr","Rev", "Col"], 'Officer', inplace=True)
    df['Title'].replace(["Jonkheer","Don", "Dona", "Sir", "Countess", "Lady"], 'Royalty', inplace=True)
    
    grouped_median_train = df.groupby(['Sex', 'Pclass', 'Title']).median().reset_index()[['Sex', 'Pclass', 'Title', 'Age']]
    def fill_age(row):
        condition = (
            (grouped_median_train['Sex']==row['Sex']) \
            & (grouped_median_train['Title']==row['Title']) \
            & (grouped_median_train['Pclass']==row['Pclass'])
        )
        if np.isnan(grouped_median_train[condition]['Age'].values[0]):
            condition = (
                (grouped_median_train['Sex']==row['Sex']) \
                & (grouped_median_train['Pclass']==row['Pclass'])
            )
        return grouped_median_train[condition]['Age'].values[0]
    df['Age'] = df.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis = 1)
    
    df["Surname"] = df.Name.str.split(',').str.get(0)
    df['SurnameFreq']=df.groupby('Surname')['Surname'].transform('count')
    df['TicketFreq']=df.groupby('Ticket')['Ticket'].transform('count')
    df['CustomizedFare']=df.Fare/(df.TicketFreq*df.Pclass)
    
    df.drop(['Name', 'Cabin', 'Surname', 'Ticket'], axis=1, inplace=True)
    df = df[df.columns.sort_values()]
    
    return df

In [47]:
df = preprocessor_0(df_raw)

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2200 entries, 0 to 1308
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             2200 non-null   float64
 1   Cabin Deck      2200 non-null   object 
 2   Cabin N         2200 non-null   float64
 3   Cabin Room      489 non-null    float64
 4   CustomizedFare  2200 non-null   float64
 5   Embarked        2200 non-null   object 
 6   FamilyCat       2200 non-null   object 
 7   FamilySize      2200 non-null   int64  
 8   Fare            2200 non-null   float64
 9   Parch           2200 non-null   int64  
 10  Pclass          2200 non-null   int64  
 11  Sex             2200 non-null   object 
 12  SibSp           2200 non-null   int64  
 13  SurnameFreq     2200 non-null   int64  
 14  Survived        2200 non-null   int64  
 15  TicketFreq      2200 non-null   int64  
 16  Title           2200 non-null   object 
dtypes: float64(5), int64(7), object(5

## Train-dev split

In [49]:
from sklearn.model_selection import train_test_split

df_X = df.drop('Survived', axis=True)
df_y = df['Survived']

X_train, X_dev, y_train, y_dev = train_test_split(df_X, df_y, test_size=0.1, random_state=9527, stratify=df_y)
print('df is split. Train X, y shapes = {}, {}. Dev X, y shapes = {}, {}'.format(X_train.shape, y_train.shape, X_dev.shape, y_dev.shape))

df is split. Train X, y shapes = (1980, 16), (1980,). Dev X, y shapes = (220, 16), (220,)


## sklearn pipeline building

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ['Age', 'Fare', 'SibSp', 'Parch', 'Pclass'
#                     , 'Cabin N', 'Cabin Room', 'SurnameFreq', 'TicketFreq', 'CustomizedFare'
                    , 'FamilySize']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
    , ('logtransformer', FunctionTransformer(np.log1p))
    , ('scaler', StandardScaler())
])

categorical_features = ['Embarked', 'Sex', 'Cabin Deck', 'Title', 'FamilyCat']
# categorical_features = ['Sex', 'Pclass', 'Cabin Deck', 'Title']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_1 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [51]:
X_train = preprocessor_1.fit_transform(X_train)
X_dev = preprocessor_1.transform(X_dev)

print('X is transformed by sklearn preprocessor. Train X, y shapes = {}, {}. Dev X, y shapes = {}, {}'.format(X_train.shape, y_train.shape, X_dev.shape, y_dev.shape))

X is transformed by sklearn preprocessor. Train X, y shapes = (1980, 28), (1980,). Dev X, y shapes = (220, 28), (220,)


# Model Lab

In [54]:
models = []

from xgboost import XGBClassifier

param = {'objective':'binary:logistic', 'n_estimators': 20, 'n_jobs': 4
         , 'max_depth': 50, 'learning_rate': 0.3, 'reg_lambda': 0.01
         , 'gamma': 2, 'max_delta_step': 1, 'min_child_weight': 1
         , 'colsample_bytree': 0.65, 'subsample': 0.9, 'base_score': 0.5
        }
models.append(XGBClassifier(**param))

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
param = {'n_estimators': 180, 'min_samples_split': 3
#          , 'min_samples_leaf': 4, 'max_depth': 32, 'bootstrap': True
         , 'max_features': 0.5
        }
models.append(RandomForestClassifier(**param))
models.append(AdaBoostClassifier())
models.append(GradientBoostingClassifier())

from sklearn.tree import DecisionTreeClassifier
models.append(DecisionTreeClassifier())

from sklearn.neighbors import KNeighborsClassifier
models.append(KNeighborsClassifier())

from sklearn.svm import SVC
models.append(SVC(probability=True))
 
from sklearn.naive_bayes import GaussianNB
models.append(GaussianNB())

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
models.append(LinearDiscriminantAnalysis())
models.append(QuadraticDiscriminantAnalysis())

from sklearn.linear_model import LogisticRegression
models.append(LogisticRegression())

print('Models list imported. with length = {}'.format(len(models)))

Models list imported. with length = 11


In [55]:
from sklearn.metrics import accuracy_score
df_acc = pd.DataFrame()

for model in models:
    model_name = model.__class__.__name__
    print('Running {}...'.format(model_name))
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    acc_train = accuracy_score(y_train, y_train_pred)
    y_dev_pred = model.predict(X_dev)
    acc_dev = accuracy_score(y_dev, y_dev_pred)
    
    df_acc = df_acc.append({'name': model_name, 'acc_dev': acc_dev, 'acc_train': acc_train}, ignore_index=True)
        
print(df_acc)

Running XGBClassifier...
Running RandomForestClassifier...




Running AdaBoostClassifier...
Running GradientBoostingClassifier...
Running DecisionTreeClassifier...
Running KNeighborsClassifier...
Running SVC...
Running GaussianNB...
Running LinearDiscriminantAnalysis...
Running QuadraticDiscriminantAnalysis...
Running LogisticRegression...
     acc_dev  acc_train                           name
0   0.922727   0.931818                  XGBClassifier
1   0.950000   0.973232         RandomForestClassifier
2   0.863636   0.826263             AdaBoostClassifier
3   0.900000   0.874747     GradientBoostingClassifier
4   0.950000   0.974242         DecisionTreeClassifier
5   0.881818   0.867172           KNeighborsClassifier
6   0.859091   0.828283                            SVC
7   0.840909   0.794444                     GaussianNB
8   0.859091   0.817677     LinearDiscriminantAnalysis
9   0.677273   0.680808  QuadraticDiscriminantAnalysis
10  0.854545   0.822222             LogisticRegression




# Modelling

In [56]:
def getModel():
    from xgboost import XGBClassifier
    
#     param = {'objective':'binary:logistic', 'n_estimators': 20, 'n_jobs': 4
#              , 'max_depth': 50, 'learning_rate': 0.3, 'reg_lambda': 0.01
#              , 'gamma': 2, 'max_delta_step': 1, 'min_child_weight': 1
#              , 'colsample_bytree': 0.65, 'subsample': 0.9, 'base_score': 0.5
#             }
#     model = XGBClassifier(**param)
    
    from sklearn.ensemble import RandomForestClassifier
    param = {'n_estimators': 180, 'min_samples_split': 3
    #          , 'min_samples_leaf': 4, 'max_depth': 32, 'bootstrap': True
             , 'max_features': 0.5
            }  
    model = RandomForestClassifier(**param)
    
#     from sklearn.tree import DecisionTreeClassifier
#     model = DecisionTreeClassifier()
    
    return model

model = getModel()

### Fine-tune the best parameter set

In [None]:
# from sklearn.model_selection import GridSearchCV

# grid = {'n_estimators':[20]
#         , 'max_depth':[50]
#         , 'learning_rate':[0.3]
#         , 'reg_lambda':[0.01]
#         , 'gamma':[2]
#         , 'max_delta_step':[0,1,2]
#         , 'min_child_weight':[1,2,3]
#         , 'colsample_bytree':[0.55,0.65]
#         , 'subsample':[1,0.9,0.8]
#         , 'base_score':[0.5]        
#        }

# search = GridSearchCV(estimator=model, param_grid=grid, scoring='accuracy', n_jobs=4, refit=True)
# search.fit(X_train, y_train)

# print(search.best_params_)
# print(search.best_score_)

### Model Conclusion

In [57]:
model.fit(X_train, y_train
#           , eval_set=[(X_train, y_train), (X_dev, y_dev)]
#           , eval_metric=['error', 'logloss'], verbose=True
         )

RandomForestClassifier(max_features=0.5, min_samples_split=3, n_estimators=180)

In [59]:
from sklearn.metrics import accuracy_score

y_train_pred = model.predict(X_train)
print('Accuracy of train data:', accuracy_score(y_train, y_train_pred))
y_dev_pred = model.predict(X_dev)
print('Accuracy of dev data:', accuracy_score(y_dev, y_dev_pred))

Accuracy of train data: 0.9732323232323232
Accuracy of dev data: 0.95


# Predict and Output

In [60]:
df_submit_raw = pd.read_csv(r'.\titanic\input\test.csv')
df_submit = preprocessor_0(df_submit_raw)
X_submit = preprocessor_1.transform(df_submit)

df_submit_final = pd.DataFrame(df_submit_raw['PassengerId'])
df_submit_final['Survived'] = pd.Series(model.predict(X_submit))

In [63]:
df_submit_final.to_csv(r'.\submission.csv', index=False)

# Submission

In [64]:
!kaggle competitions submit titanic -f .\submission.csv -m "Adding in tfds data for training"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|██████████| 3.18k/3.18k [00:05<00:00, 583B/s]


Final Score: 0.91148 ; Rank 637 / 35554

Remove the raw data and Closing file

In [69]:
import os, shutil

shutil.rmtree('titanic')
os.remove('titanic.zip')

In [70]:
import os
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

.\02-titanic.ipynb
.\submission.csv
.\.ipynb_checkpoints\02-titanic-checkpoint.ipynb
