### Overview of Data Set

Survived - Survival (0 = No; 1 = Yes)  
Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)  
Name - Name  
Sex - Sex (male; female)  
Age - Age  
SibSp - Number of Siblings/Spouses Aboard  
Parch - Number of Parents/Children Aboard  
Ticket - Ticket Number  
Fare - Passenger Fare  
Cabin - Cabin Number  
Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)  

### Import Libraries

In [None]:
import matplotlib.gridspec as grid_spec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from matplotlib.ticker import FuncFormatter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

### Importing and Preparing Data

In [None]:
# Read the raw train/test splits from the local data directory.
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

# Columns removed from both splits before analysis.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
# Embarkation codes and the readable port names that replace them.
port_codes = ['C', 'Q', 'S']
port_names = ['Cherbourg', 'Queenstown', 'Southampton']
# Numeric ticket class -> readable label.
class_labels = {1: 'First Class', 2: 'Second Class', 3: 'Third Class'}

data_train = df_train.drop(columns=unused_columns)
data_test = df_test.drop(columns=unused_columns)

# Apply the same relabelling to both splits.
for frame in (data_train, data_test):
    frame['Embarked'] = frame['Embarked'].replace(port_codes, port_names)
    frame['Pclass'] = frame['Pclass'].map(class_labels)

# Target column, taken from the untouched training frame.
data_label = df_train['Survived']

def titanic_children(passenger):
    """Classify a passenger as 'child' or by sex.

    Args:
        passenger: Two-item iterable that unpacks to ``(age, sex)``.

    Returns:
        ``'child'`` when ``age < 16``; otherwise ``sex`` unchanged. A NaN age
        fails the ``<`` comparison, so such passengers keep their sex label.
    """
    age, sex = passenger
    return 'child' if age < 16 else sex

# Person: 'child' for passengers under 16, otherwise the passenger's sex.
data_train['Person'] = data_train[['Age','Sex']].apply(titanic_children, axis=1)

# Alone: whether any relatives (parents/children + siblings/spouses) were aboard.
# The column is built in a single assignment; writing through
# ``data_train['Alone'].loc[...]`` is chained indexing, which pandas may apply
# to a temporary copy instead of the frame.
# NOTE(review): assumes Parch/SibSp are non-negative counts with no missing
# values, so every row is either > 0 or == 0 -- confirm against the data.
family_size = data_train['Parch'] + data_train['SibSp']
data_train['Alone'] = np.where(family_size > 0, 'With Family', 'Without Family')

### Pipeline

In [None]:
# ================= PipeLine FeatureUnion ==================

# Column names routed to the numeric and the categorical sub-pipelines.
num_attrs = data_train[['Age', 'Fare']].columns.tolist()
cat_attrs = data_train[['Sex', 'Embarked', 'Alone', 'Person']].columns.tolist()
# Feature frame: the prepared training data without the target column.
data_train_att = data_train.drop(columns=['Survived'])

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed set of columns from a DataFrame."""

    def __init__(self, attribute_names):
        """Remember which columns ``transform`` should extract.

        Args:
            attribute_names: Column label(s) used to index the input frame.
        """
        self.attribute_names = attribute_names

    def fit(self, x, y=None):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, x):
        """Return ``x[self.attribute_names].values``.

        Args:
            x: Object indexable by ``attribute_names`` whose result exposes
                ``.values`` (a DataFrame in this notebook).
        """
        selected = x[self.attribute_names]
        return selected.values

# Numeric branch: pick the numeric columns, fill NaNs with the column median,
# then standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attrs)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: pick the categorical columns, fill missing entries with
# the most frequent value, then one-hot encode. No ``fill_value`` is passed to
# the imputer because it is only consulted when strategy='constant'.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's ``sparse``
# argument to ``sparse_output`` -- confirm against the installed version.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attrs)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Concatenate the outputs of both branches column-wise.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])
  
# Fit the preprocessing on the training features and transform them.
train_prepared = full_pipeline.fit_transform(data_train_att)

# Recover the one-hot column names from the fitted encoder. The step is looked
# up by name rather than by position, and ``get_feature_names_out`` is preferred
# when the installed scikit-learn provides it (older releases only have
# ``get_feature_names``).
fitted_cat_pipeline = full_pipeline.transformer_list[1][1]
one_hot = fitted_cat_pipeline.named_steps['one_hot_encoder']
feature_names_fn = getattr(one_hot, 'get_feature_names_out', None) or one_hot.get_feature_names
cat_columns = feature_names_fn(cat_attrs)

# Numeric columns come first in the FeatureUnion output, then the one-hot ones.
columns = np.append(num_attrs, cat_columns)
train_prepared = pd.DataFrame(train_prepared, columns=columns)

### Exploratory Data Analysis

In [None]:
display(df_train.head())

display(data_train.head())
# DataFrame.info() prints its report and returns None, so it is called
# directly instead of being wrapped in display().
data_train.info()
display(data_train.isna().sum())
display(data_train.describe())
display(data_train.columns.values)

train_prepared.info()
display(train_prepared.isna().sum())

# Readable survival labels for the cross-tabulations below.
data_label_rec = data_label.replace([0,1],['Dead','Live'])


def survival_crosstab(column):
    """Cross-tabulate ``data_train[column]`` against Dead/Live, with totals."""
    return pd.crosstab(index=data_train[column], columns=data_label_rec,
                       margins=True, margins_name='total')


Sex_Sur_tab = survival_crosstab('Sex')
Pclass_Sur_tab = survival_crosstab('Pclass')
Embarked_Sur_tab = survival_crosstab('Embarked')
Person_Sur_tab = survival_crosstab('Person')
Alone_Sur_tab = survival_crosstab('Alone')

display(Sex_Sur_tab, Pclass_Sur_tab, Embarked_Sur_tab, Person_Sur_tab, Alone_Sur_tab)

In [None]:
# 4x2 grid of passenger counts per categorical feature.
fig, axes = plt.subplots(4, 2, figsize=(10, 10))
fig.suptitle('Plot of People')

# Sex is shown twice: absolute counts and percentage share.
sns.countplot(ax=axes[0, 0], x='Sex', data=data_train)
data_train.groupby('Sex').size().plot(ax=axes[0, 1], kind='pie', autopct='%.0f')

# Remaining panels are plain count plots, one feature per axis.
count_panels = {
    'Person': axes[1, 0],
    'Pclass': axes[1, 1],
    'Embarked': axes[2, 0],
    'SibSp': axes[2, 1],
    'Parch': axes[3, 0],
    'Alone': axes[3, 1],
}
for column, ax in count_panels.items():
    sns.countplot(ax=ax, x=column, data=data_train)

In [None]:
# Age histogram with a KDE overlay, followed by one box plot of Age per
# categorical feature.
box_kwargs = dict(y="Age", kind="box", data=data_train)

sns.histplot(data=data_train, x='Age', bins=70, kde=True)
sns.catplot(x="Sex", **box_kwargs)
sns.catplot(x="Pclass", **box_kwargs)
sns.catplot(x="Embarked", **box_kwargs)

In [None]:
# Split violins: Age by Sex, with the two halves coloured by Survived.
violin_kwargs = dict(x="Sex", y="Age", hue="Survived", split=True)
sns.violinplot(data=data_train, **violin_kwargs)

In [None]:
# 3x2 grid of survival counts, overall and split by each categorical feature.
fig, axes = plt.subplots(3, 2, figsize=(10, 10))
fig.suptitle('Plot of People')

sns.countplot(ax=axes[0, 0], x='Survived', data=data_train)

hue_panels = {
    'Sex': axes[0, 1],
    'Person': axes[1, 0],
    'Pclass': axes[1, 1],
    'Embarked': axes[2, 0],
    'Alone': axes[2, 1],
}
for hue_column, ax in hue_panels.items():
    sns.countplot(ax=ax, x='Survived', hue=hue_column, data=data_train)

# Separate figure: stacked survival counts per passenger class.
pd.crosstab(data_train['Pclass'], data_train['Survived']).plot(kind='bar', stacked=True)

In [None]:
# Mean survival per passenger class as point plots, split by family status and
# by person type. kind='point' is passed explicitly because catplot defaults
# to a strip plot.
sns.catplot(x='Pclass', y='Survived', hue='Alone', kind='point', data=data_train)
sns.catplot(x='Pclass', y='Survived', hue='Person', kind='point', data=data_train)

In [None]:
# Heatmap of passenger counts: rows are Pclass, columns are Survived.
group = data_train.groupby(['Pclass', 'Survived'])
counts = group.size()
pclass_survived = counts.unstack()
sns.heatmap(data=pclass_survived, annot=True, fmt="d")

In [None]:
# Divide Fare into 4 bins
# data_train['Fare_Range'] = pd.qcut(data_train['Fare'], 4)
# sns.barplot(x ='Fare_Range', y ='Survived', data=data_train)

In [None]:
# Horizontal box plots of Fare by survival, one row per passenger class.
# Only rows with Fare > 0 are plotted; the x axis is then switched to log scale.
paid_fares = data_train.query('Fare>0')
g = sns.catplot(
    data=paid_fares,
    x="Fare",
    y="Survived",
    row="Pclass",
    kind="box",
    orient="h",
    height=1.5,
    aspect=4,
)
g.set(xscale="log")

In [None]:
# Filled KDE of Age, one curve per Sex. ``fill=True`` is the current spelling
# of the deprecated ``shade=True`` option.
as_fig = sns.FacetGrid(data_train, hue='Sex', aspect=3)
as_fig.map(sns.kdeplot, 'Age', fill=True)
# Limit the x axis to the observed age range.
oldest = data_train['Age'].max()
as_fig.set(xlim=(0, oldest))
as_fig.add_legend()

In [None]:
# Filled KDE of Age, one curve per Pclass. ``fill=True`` is the current
# spelling of the deprecated ``shade=True`` option.
as_fig = sns.FacetGrid(data_train, hue='Pclass', aspect=3)
as_fig.map(sns.kdeplot, 'Age', fill=True)
# Limit the x axis to the observed age range.
oldest = data_train['Age'].max()
as_fig.set(xlim=(0, oldest))
as_fig.add_legend()

In [None]:
# Mean Fare per Sex, faceted by embarkation port (rows) and survival (columns).
# ``height`` replaces the deprecated ``size`` argument. An explicit ``order``
# is passed because each facet is drawn from its own subset of the data, and
# the bars could otherwise be ordered differently from facet to facet.
grid = sns.FacetGrid(data_train, row='Embarked', col='Survived', height=2.2, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', order=['male', 'female'], alpha=.5, ci=None)
grid.add_legend()

In [None]:
# Linear fit of Survived on Age: overall first, then split by each categorical
# feature. x and y are passed by keyword because newer seaborn releases no
# longer accept them positionally.
sns.lmplot(x='Age', y='Survived', data=data_train)
for hue_column in ('Pclass', 'Sex', 'Alone', 'Embarked'):
    sns.lmplot(x='Age', y='Survived', hue=hue_column, data=data_train)

In [None]:
# Ridge-style figure: one stacked Age density per passenger class, drawn on
# three vertically overlapping axes.
fig = plt.figure(figsize=(12, 8))
gs = fig.add_gridspec(3, 1)
gs.update(hspace=-0.55)  # negative spacing makes the rows overlap

axes = []
colors = ["#022133", "#5c693b", "#51371c"]
class_labels = sorted(data_train['Pclass'].unique())

for idx, (cls, c) in enumerate(zip(class_labels, colors)):
    ax = fig.add_subplot(gs[idx, 0])
    axes.append(ax)

    # you can also draw density plot with matplotlib + scipy.
    class_rows = data_train[data_train['Pclass'] == cls]
    sns.kdeplot(
        data=class_rows, x='Age', hue='Survived',
        multiple="stack", fill=True, cut=0, bw_method=0.25,
        palette='PuBu', alpha=0.7,
        lw=1.4, edgecolor='lightgray',
        ax=ax,
    )

    ax.set_ylim(0, 0.04)
    ax.set_xlim(0, 85)

    # Only the last row keeps its x ticks; y ticks and axis labels are removed.
    ax.set_yticks([])
    if idx != 2:
        ax.set_xticks([])
    ax.set_ylabel('')
    ax.set_xlabel('')

    for side in ("top", "right", "left", "bottom"):
        ax.spines[side].set_visible(False)

    # Transparent background so the overlapping rows stay visible.
    ax.patch.set_alpha(0)
    ax.text(-0.2, 0, f'Pclass {cls}', fontweight="light", fontfamily='serif', fontsize=11, ha="right")
    # Keep a single legend, on the middle row.
    if idx != 1:
        ax.get_legend().remove()

fig.text(0.13, 0.81, "Age distribution by Pclass in Titanic", fontweight="bold", fontfamily='serif', fontsize=16)

plt.show()

In [None]:
def age_band(num):
    """Return the ten-year band label for an age, e.g. 25 -> '20 ~ 30'.

    Args:
        num: Age to classify.

    Returns:
        ``'<lower> ~ <upper>'`` for the first multiple of ten (10 through 990)
        that ``num`` is strictly below, or ``None`` when no bound matches
        (values of 990 and above, and NaN, which fails every comparison).
    """
    for upper in range(10, 1000, 10):
        if num < upper:
            return f'{upper - 10} ~ {upper}'
    return None

# Survival counts per ten-year age band, plus the survival rate in percent.
data_train['age_band'] = data_train['Age'].apply(age_band)
titanic_age = data_train[['age_band', 'Survived']].groupby('age_band')['Survived'].value_counts().sort_index().unstack().fillna(0)
titanic_age['Survival rate'] = titanic_age[1] / (titanic_age[0] + titanic_age[1]) * 100

fig, ax = plt.subplots(1, 1, figsize=(10, 7))

# One colour per bar, sized from the data rather than a fixed count; the first
# and last bands are highlighted.
color_map = ['#d4dddd'] * len(titanic_age)
color_map[0] = color_map[-1] = '#244747' # color highlight

ax.bar(titanic_age['Survival rate'].index, titanic_age['Survival rate'], 
       color=color_map, width=0.55, 
       edgecolor='black', 
       linewidth=0.7)

for s in ["top","right","left"]:
    ax.spines[s].set_visible(False)


# Annotation Part: percentage label slightly above each bar
for i in titanic_age['Survival rate'].index:
    ax.annotate(f"{titanic_age['Survival rate'][i]:.02f}%", 
                   xy=(i, titanic_age['Survival rate'][i] + 2.3),
                   va = 'center', ha='center',fontweight='light', 
                   color='#4a4a4a')


# mean line + annotation (overall survival rate in percent)
mean = data_train['Survived'].mean() *100
ax.axhline(mean ,color='black', linewidth=0.4, linestyle='dashdot')
ax.annotate(f"mean : {mean :.4}%", 
            xy=('70 ~ 80', mean + 4),
            va = 'center', ha='center',
            color='#4a4a4a',
            bbox=dict(boxstyle='round', pad=0.4, facecolor='#efe8d1', linewidth=0))
    

# Title & Subtitle    
fig.text(0.06, 1, 'Age Band & Survival Rate', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.06, 0.96, 'It can be seen that the survival rate of young children and the elderly is high.', fontsize=12, fontweight='light', fontfamily='serif')

# Horizontal grid lines every 20 percentage points.
grid_y_ticks = np.arange(0, 101, 20)
ax.set_yticks(grid_y_ticks)
ax.grid(axis='y', linestyle='-', alpha=0.4)

plt.tight_layout()
plt.show()

In [None]:
# Mean of Survived per sex. The column is selected before aggregating so the
# mean is never attempted on the string-valued columns of data_train.
survival_rate = data_train.groupby(['Sex'])[['Survived']].mean()
male_rate = survival_rate.loc['male']
female_rate = survival_rate.loc['female']
display(survival_rate)

In [None]:
# Seed before drawing so the sampled heights are the same on every run.
np.random.seed(42)


def group_size(sex, survived):
    """Count the rows of data_train with the given Sex and Survived values."""
    matches = (data_train['Sex'] == sex) & (data_train['Survived'] == survived)
    return int(matches.sum())


# One random height per passenger: survivors fall in [0, rate), the others in
# [rate, 1), where rate is the survival rate of that sex.
male_pos = np.random.uniform(0, male_rate, group_size('male', 1))
male_neg = np.random.uniform(male_rate, 1, group_size('male', 0))
female_pos = np.random.uniform(0, female_rate, group_size('female', 1))
female_neg = np.random.uniform(female_rate, 1, group_size('female', 0))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(9, 7))
np.random.seed(42)  # fixes the horizontal jitter drawn below

# (x centre, heights, colour, alpha, legend label) for each strip, male first.
# alpha=None leaves the points fully opaque.
strip_groups = [
    (0, male_pos, '#004c70', None, 'Male(Survived=1)'),
    (0, male_neg, '#004c70', 0.2, 'Male(Survived=0)'),
    (1, female_pos, '#990000', None, 'Female(Survived=1)'),
    (1, female_neg, '#990000', 0.2, 'Female(Survived=0)'),
]
for x_centre, heights, colour, alpha, label in strip_groups:
    jitter = np.random.uniform(-0.3, 0.3, len(heights))
    ax.scatter(x_centre + jitter, heights, color=colour, edgecolor='lightgray',
               alpha=alpha, label=label)

# Set Figure & Axes
ax.set_xlim(-0.5, 2.0)
ax.set_ylim(-0.03, 1.1)

# Ticks
ax.set_xticks([0, 1])
ax.set_xticklabels(['Male', 'Female'], fontweight='bold', fontfamily='serif', fontsize=13)
ax.set_yticks([], minor=False)
ax.set_ylabel('')

# Spines
for s in ["top","right","left", 'bottom']:
    ax.spines[s].set_visible(False)


# Title & Explanation
fig.text(0.1, 1, 'Distribution of Survivors by Gender', fontweight='bold', fontfamily='serif', fontsize=15)    
fig.text(0.1, 0.96, 'As is known, the survival rate for female is high, with 19% of male and 74% of female.', fontweight='light', fontfamily='serif', fontsize=12)    

ax.legend(loc=(0.8, 0.5), edgecolor='None')
plt.tight_layout()
plt.show()

In [None]:
# Numeric copy of the training frame for the correlation matrix.
data_train2 = data_train.copy()
data_train2['Sex'] = data_train2['Sex'].map({'male':0, 'female':1})
# Embarked holds full port names at this point, so missing values are filled
# with 'Southampton' (a key of the map below) rather than the raw code 'S'.
data_train2['Embarked'] = data_train2['Embarked'].fillna('Southampton')
data_train2['Embarked'] = data_train2['Embarked'].map({'Southampton':0, 'Cherbourg':1, 'Queenstown':2})
data_train2['Family'] = data_train2['SibSp'] + data_train2['Parch']
# Correlate numeric columns only; the remaining string columns have no
# Pearson correlation.
corr = data_train2.select_dtypes(include='number').corr()
corr

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 7))

# Hide the upper triangle (including the diagonal). The builtin ``bool`` is
# used as dtype because the ``np.bool`` alias is not available in current NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(corr, 
            square=True, 
            mask=mask,
            linewidth=2.5, 
            vmax=0.4, vmin=-0.4, 
            cmap=cmap, 
            cbar=False, 
            ax=ax)

# Both axes reuse the x tick labels; the correlation matrix is square with the
# same labels on rows and columns.
ax.set_yticklabels(ax.get_xticklabels(), fontfamily='serif', rotation = 0, fontsize=11)
ax.set_xticklabels(ax.get_xticklabels(), fontfamily='serif', rotation=90, fontsize=11)

ax.spines['top'].set_visible(True)

fig.text(0.97, 1, 'Correlation Heatmap Visualization', fontweight='bold', fontfamily='serif', fontsize=15, ha='right')    
fig.text(0.97, 0.92, 'Dataset : Titanic\nAuthor : Subin An', fontweight='light', fontfamily='serif', fontsize=12, ha='right')    

plt.tight_layout()
plt.show()

### Model

In [None]:
# Engineer the same Person / Alone features on the test split.
data_test['Person'] = data_test[['Age','Sex']].apply(titanic_children, axis=1)

# Built in one assignment; writing through ``data_test['Alone'].loc[...]`` is
# chained indexing, which pandas may apply to a temporary copy.
# NOTE(review): assumes Parch/SibSp are non-negative counts with no missing
# values -- confirm against the data.
relatives_aboard = data_test['Parch'] + data_test['SibSp']
data_test['Alone'] = np.where(relatives_aboard > 0, 'With Family', 'Without Family')

# Apply the preprocessing already fitted on the training data. Re-fitting here
# would impute, scale and encode the test split with its own statistics and
# categories instead of the ones the model is trained on.
test_prepared = full_pipeline.transform(data_test)

# Column names come from the fitted encoder; ``get_feature_names_out`` is
# preferred when the installed scikit-learn provides it.
one_hot = full_pipeline.transformer_list[1][1].named_steps['one_hot_encoder']
feature_names_fn = getattr(one_hot, 'get_feature_names_out', None) or one_hot.get_feature_names
cat_columns = feature_names_fn(cat_attrs)
columns = np.append(num_attrs, cat_columns)
test_prepared = pd.DataFrame(test_prepared, columns=columns)

In [None]:
# Model inputs shared by both splits. Selecting them by name keeps train and
# test aligned and leaves out helper columns added to data_train for plotting
# (such as the string-valued 'age_band').
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


def encode_features(frame):
    """Return a copy of ``frame`` with Pclass, Sex and Embarked as integer codes.

    Labels missing from a mapping (including NaN) become NaN.
    """
    encoded = frame.copy()
    encoded['Pclass'] = encoded['Pclass'].map({'First Class':0, 'Second Class':1,'Third Class':2})
    encoded['Sex'] = encoded['Sex'].map({'male':0, 'female':1})
    encoded['Embarked'] = encoded['Embarked'].map({'Cherbourg':0, 'Queenstown':1,'Southampton':2})
    return encoded


# train_new keeps the target column for now; it is dropped in a later cell.
train_new = encode_features(data_train[['Survived'] + feature_columns])
test_new = encode_features(data_test[feature_columns])


In [None]:
# Remove the one-hot columns of the engineered Alone / Person features.
engineered_columns = [
    'Alone_With Family',
    'Alone_Without Family',
    'Person_child',
    'Person_female',
    'Person_male',
]
train_prepared = train_prepared.drop(columns=engineered_columns)
train_prepared.head()

In [None]:
# Remove the one-hot columns of the engineered Alone / Person features.
engineered_columns = [
    'Alone_With Family',
    'Alone_Without Family',
    'Person_child',
    'Person_female',
    'Person_male',
]
test_prepared = test_prepared.drop(columns=engineered_columns)
test_prepared.head()

In [None]:
# Logistic Regression

# Logistic Regression on the pipeline-prepared features.
logreg = LogisticRegression()
logreg.fit(train_prepared, data_label)
Y_pred = logreg.predict(test_prepared)

# Accuracy on the training data itself, in percent.
acc_log = round(logreg.score(train_prepared, data_label) * 100, 2)
print(acc_log)

# Pair every feature with its fitted coefficient. train_prepared holds only
# feature columns, so all of its column names are used, in order.
# The 'Correlation' column holds the logistic-regression coefficients.
coeff_df = pd.DataFrame({
    'Feature': train_prepared.columns,
    'Correlation': logreg.coef_[0],
})

coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
# Fill missing ages with the median age of train_new.
median_age = train_new['Age'].median()
train_new['Age'] = train_new['Age'].fillna(median_age)
# Fill missing ports with code 2.
# NOTE(review): 2 looks like the Southampton code from the Embarked mapping -- confirm.
embarked_fill = 2
train_new['Embarked'] = train_new['Embarked'].fillna(embarked_fill)

In [None]:
# Fill missing Age and Fare values with the median of the same test_new column.
for column in ('Age', 'Fare'):
    column_median = test_new[column].median()
    test_new[column] = test_new[column].fillna(column_median)

In [None]:
# Remove the target column so train_new holds features only.
train_new = train_new.drop(columns=['Survived'])

In [None]:
# Show the test frame, then the training frame.
display(test_new, train_new)


In [None]:
# Logistic Regression

# Logistic Regression on the manually encoded features.
logreg2 = LogisticRegression()
logreg2.fit(train_new, data_label)
Y2_pred = logreg2.predict(test_new)

# Accuracy on the training data itself, in percent.
acc_log2 = round(logreg2.score(train_new, data_label) * 100, 2)
print(acc_log2)

# Pair every feature of train_new with the coefficient fitted by logreg2,
# using all column names in order.
# The 'Correlation' column holds the logistic-regression coefficients.
coeff_df = pd.DataFrame({
    'Feature': train_new.columns,
    'Correlation': logreg2.coef_[0],
})

coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
# Split both frames into numeric columns and object-dtype (categorical) columns.
# NOTE(review): the original referenced an undefined ``data_test_att``;
# ``data_test`` is used here on the assumption that the test split has no
# target column to drop -- confirm.
cat_col_selector = selector(dtype_include=object)
cat_col_train = cat_col_selector(data_train_att)
data_train_num = data_train_att.drop(cat_col_train, axis=1)
cat_col_test = cat_col_selector(data_test)
data_test_num = data_test.drop(cat_col_test, axis=1)

# Encode only the categorical columns present in both splits, so the encoder
# fitted on train can be applied to test.
shared_cat_cols = [col for col in cat_col_train if col in cat_col_test]

# Fit on the training split only and reuse the fitted encoder for test, so both
# outputs have identical columns. Categories unseen during fit are ignored.
# The default sparse output is densified with .toarray().
encoder_1hot = OneHotEncoder(handle_unknown='ignore')
cat_train_tmp = encoder_1hot.fit_transform(data_train_att[shared_cat_cols]).toarray()
cat_test_tmp = encoder_1hot.transform(data_test[shared_cat_cols]).toarray()

# ``get_feature_names_out`` is preferred when the installed scikit-learn has it.
feature_names_fn = getattr(encoder_1hot, 'get_feature_names_out', None) or encoder_1hot.get_feature_names
onehot_columns = feature_names_fn(shared_cat_cols)

# Reuse the source indexes so the column-wise concat below aligns row by row.
train_cat_1hot = pd.DataFrame(cat_train_tmp, columns=onehot_columns, index=data_train_att.index)
test_cat_1hot = pd.DataFrame(cat_test_tmp, columns=onehot_columns, index=data_test.index)

data_train_final = pd.concat([data_train_num, train_cat_1hot], axis=1)
data_test_final = pd.concat([data_test_num, test_cat_1hot], axis=1)