In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from glob import glob
import os, random, time, gc, warnings

from tqdm import tqdm_notebook

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from pdpbox import pdp, info_plots

# Silence library warnings so cell outputs stay readable.
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings.
warnings.filterwarnings('ignore')

# Use the fully-qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match 'styler.render.*' in recent pandas and raise OptionError.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

%matplotlib inline

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)


## 1. Data Load

In [None]:
# Check which dataset files the competition organizers provide.
glob('../input/titanic/*.*')

In [None]:
# Data Load 
def load_dataset(path):
    """Read the three Titanic competition CSV files from a directory.

    Parameters
    ----------
    path : str or os.PathLike
        Directory containing ``train.csv``, ``test.csv`` and
        ``gender_submission.csv``. A trailing separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, sample_submission)`` in that order.
    """
    # os.path.join works with or without a trailing slash on `path`;
    # plain string concatenation silently built a wrong file name without one.
    train = pd.read_csv(os.path.join(path, 'train.csv'))
    test = pd.read_csv(os.path.join(path, 'test.csv'))
    sample_submission = pd.read_csv(os.path.join(path, 'gender_submission.csv'))
    return train, test, sample_submission

# Directory holding the competition CSV files.
path = '../input/titanic/'
# %time reports how long reading the three CSV files takes.
%time train, test, sample_submission = load_dataset(path)

## 2. Data Explore

In [None]:
# The test set has no 'Survived' (= target) column.
# Train and test are split by 'PassengerId'.
for frame in (train, test):
    display(frame.head(3))

In [None]:
# Report (rows, columns) of both sets.
print('-- Size --')
for set_label, frame in (('Train-set', train), ('Test-set', test)):
    print(f'{set_label} : {frame.shape}')

In [None]:
# Column names available in the training set.
train.columns

In [None]:
# Columns present only in the train set (set difference of the two column indexes).
train.columns.difference(test.columns)

## 3. Understanding How Train/Test Are Split

In [None]:
# Show the PassengerId range and row count of each set to see how they were split.
for position, (set_name, frame) in enumerate((('Train', train), ('Test', test))):
    if position:
        print()  # blank line between the two reports
    print(f'Mins/Max of PassengerId in {set_name}-Set')
    display(frame['PassengerId'].agg(['min', 'max']))
    print('=' * 80)
    print(len(frame))

# PassengerId carries no meaning for prediction, so it should be removed when modeling.

In [None]:
# List the features whose train/test distributions are compared in the next section.
train.columns

## 4. Compare Train vs. Test

In [None]:
# Pclass : almost same proportion
fig, ax = plt.subplots()
for frame, colour, tag in ((train, 'lightblue', 'train'), (test, 'lightcoral', 'test')):
    # Passenger counts per class, drawn on the shared axes.
    frame['Pclass'].value_counts().sort_index().plot(kind='bar', color=colour, label=tag, ax=ax)

ax.legend()
ax.set_xlabel('Pclass')
ax.set_ylabel('num of Passengers')
ax.set_title('Distribution of Pclass in Train/Test set')

plt.show()

In [None]:
# Sex : passenger counts per sex in both sets, drawn on the same axes.
fig, ax = plt.subplots()
sex_counts_train = train['Sex'].value_counts().sort_index()
sex_counts_test = test['Sex'].value_counts().sort_index()
sex_counts_train.plot(kind='bar', color='lightblue', label='train', ax=ax)
sex_counts_test.plot(kind='bar', color='lightcoral', label='test', ax=ax)

ax.legend()
ax.set(xlabel='Sex', ylabel='num of Passengers',
       title='Distribution of Sex in Train/Test set')

plt.show()

In [None]:
# Age : similar distribution
fig, ax = plt.subplots()
train['Age'].plot(kind='hist', ax=ax, color='lightblue', label='train')
# The test histogram is semi-transparent so the train bars stay visible underneath.
test['Age'].plot(kind='hist', ax=ax, color='lightcoral', alpha=0.3, label='test')

ax.legend()
ax.set(xlabel='Age', ylabel='num of Passengers',
       title='Distribution of Age in Train/Test set')

plt.show()

In [None]:
# SibSp : similar distribution
fig, ax = plt.subplots()
hist_styles = (
    (train, dict(color='lightblue', label='train')),
    (test, dict(color='lightcoral', alpha=0.3, label='test')),
)
for frame, style in hist_styles:
    frame['SibSp'].plot(kind='hist', ax=ax, **style)

ax.legend()
ax.set_xlabel('SibSp')
ax.set_ylabel('num of Passengers')
ax.set_title('Distribution of SibSp in Train/Test set')

plt.show()

In [None]:
# Parch : similar distribution
fig, ax = plt.subplots()
train['Parch'].plot(kind='hist', ax=ax, color='lightblue', label='train')
test['Parch'].plot(kind='hist', ax=ax, color='lightcoral', alpha=0.3, label='test')

ax.legend()
ax.set(xlabel='Parch', ylabel='num of Passengers',
       title='Distribution of Parch in Train/Test set')

plt.show()

In [None]:
# Share of passengers per Parch value in each set (count divided by number of rows).
for frame in (train, test):
    parch = frame['Parch']
    display(parch.value_counts().div(len(parch)))

In [None]:
# Embarked : almost same proportion
fig, ax = plt.subplots()
embarked_train = train['Embarked'].value_counts().sort_index()
embarked_test = test['Embarked'].value_counts().sort_index()
embarked_train.plot(kind='bar', ax=ax, color='lightblue', label='train')
embarked_test.plot(kind='bar', ax=ax, color='lightcoral', alpha=0.3, label='test')

ax.legend()
ax.set_xlabel('Embarked')
ax.set_ylabel('num of Passengers')
ax.set_title('Distribution of Embarked in Train/Test set')

plt.show()

In [None]:
# Fare : similar distribution
fig, ax = plt.subplots()
train['Fare'].plot(kind='hist', ax=ax, color='lightblue', label='train')
test['Fare'].plot(kind='hist', ax=ax, color='lightcoral', alpha=0.3, label='test')

ax.legend()
ax.set(xlabel='Fare', ylabel='num of Passengers',
       title='Distribution of Fare in Train/Test set')

plt.show()

# Some passengers have Fare == 0; what these cases are still needs to be checked.


In [None]:
# Cabin has too many distinct values, so it is explored later.

## 5. Missing Values  
- Age, Cabin 

In [None]:
# Draw the missingno matrix for the training set to visualise missing values.
msno.matrix(train)

In [None]:
# Number of missing values per column, for each set.
for position, (heading, frame) in enumerate((('Train Set', train), ('Test Set', test))):
    if position:
        print('=' * 80)
    print(heading)
    display(frame.isnull().sum())

### 5.1. Dealing with missing values in 'Age'
* Age 결측치를 가진 탑승자 정보 확인
* 비슷한 정보를 가진 사람들의 평균 나이 확인하여 결측치 채우기

In [None]:
# Stack train and test so missing values can be analysed and imputed once for both.
# ignore_index=True gives the combined frame unique row labels; without it the
# labels of `test` repeat those of `train`, which makes label-based operations ambiguous.
df_all = pd.concat([train, test], ignore_index=True)

pd.set_option('display.max_rows', 50)
# Mark missing ages with the sentinel 0 so they show up in value_counts().
# The result is assigned back to the column: fillna(inplace=True) on `df_all.Age`
# is a chained in-place update that does not modify df_all under pandas Copy-on-Write.
df_all['Age'] = df_all['Age'].fillna(0)
print('Train/Test set - values in [Age]')
df_all['Age'].value_counts()

# total 263 missing values

In [None]:
# Rows whose Age is missing (recognised by the Age == 0 sentinel).
df_miss_age = df_all.loc[df_all['Age'] == 0]

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 5))
fig.subplots_adjust(left=0.125, bottom=0.1, right=1.5, top=0.9, wspace=0.2, hspace=0.35)

# First panel: plain Pclass counts; the other panels are split by Pclass via hue.
sns.countplot(x='Pclass', palette='Set2', data=df_miss_age, ax=ax1)
for column, axis in (('Sex', ax2), ('SibSp', ax3)):
    sns.countplot(x=column, palette='Set2', data=df_miss_age, ax=axis, hue='Pclass')

plt.show()


In [None]:
# Parch and Embarked counts of the passengers with missing Age, split by Pclass.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
fig.subplots_adjust(left=0.125, bottom=0.1, right=1.5, top=0.9, wspace=0.2, hspace=0.35)

for column, axis in (('Parch', ax1), ('Embarked', ax2)):
    sns.countplot(x=column, palette='Set2', data=df_miss_age, ax=axis, hue='Pclass')

plt.show()

# Passengers without a recorded age differ markedly across Pclass, so Age will be
# filled with the mean? median? of the Pclass (+ Sex?) group.

In [None]:
# Missing ages are currently stored as the sentinel 0. Convert them back to NaN
# before aggregating, otherwise the zeros are counted as real ages and drag the
# group mean/median down.
age_by_group = (
    df_all.assign(Age=df_all['Age'].replace(0, np.nan))
    .groupby(['Sex', 'Pclass'])['Age']
)
display(age_by_group.agg('mean'))
print('='*80)
display(age_by_group.agg('median'))

# The mean and the median do not differ much, so simply fill with the median.

In [None]:
# Turn the 0 sentinel back into NaN, then fill each gap with the median age
# of the passenger's (Sex, Pclass) group.
age_with_nan = df_all['Age'].replace(0, np.nan)
df_all['Age'] = age_with_nan
group_median_age = df_all.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df_all['Age'] = age_with_nan.fillna(group_median_age)

In [None]:
# train vs. test: rows with PassengerId <= 891 are treated as the training set.
df_all_train = df_all[df_all.PassengerId <= 891]
# drop(columns=...) replaces the positional axis argument `drop('Survived', 1)`,
# which pandas 2.0 no longer accepts.
df_all_test = df_all[df_all.PassengerId > 891].drop(columns='Survived')

# Age distribution of both sets after imputation.
df_all_train['Age'].plot(kind='hist', color='lightblue', label='train')
df_all_test['Age'].plot(kind='hist', color='lightcoral', alpha=0.3, label='test')

# The plots carry labels, so show the legend that uses them.
plt.legend()
plt.xlabel('Age')
plt.show()

### 5.2 Dealing with missing values in 'Cabin'

In [None]:
# Count missing vs. present Cabin values separately for each passenger class.
for pclass in (1, 2, 3):
    cabin_in_class = df_all.loc[df_all['Pclass'] == pclass, 'Cabin']
    miss = cabin_in_class.isnull().sum(axis=0)
    notmiss = cabin_in_class.notnull().sum(axis=0)
    print(f'Pclass={pclass} .. missing values : {miss},  existing values : {notmiss}')
    if pclass != 3:
        print('=' * 80)

# Cabin values exist mostly for Pclass=1, so Cabin is presumed to have a meaningful
# influence on predicting survival for Pclass=1.

In [None]:
# Missing cabins get the arbitrary placeholder letter 'X'.
# Assign the result back: fillna(inplace=True) on `df_all['Cabin']` is a chained
# in-place update that does not modify df_all under pandas Copy-on-Write.
df_all['Cabin'] = df_all['Cabin'].fillna('X')
# Keep only the first character of Cabin (its letter) as the new feature [Ca].
df_all['Ca'] = df_all['Cabin'].str[:1]

In [None]:
# compare train vs. test
df_all_train = df_all[df_all.PassengerId <= 891]
# drop(columns=...) replaces the positional axis argument, which pandas 2.0 rejects.
df_all_test = df_all[df_all.PassengerId > 891].drop(columns='Survived')

# Both countplots must share one category order. Left to itself, seaborn orders
# each call by first appearance in its own data, so overlaid bars at the same
# x position could belong to different cabin letters.
ca_order = sorted(df_all['Ca'].unique(), key=str)

fig = plt.figure(figsize=(10,5))
sns.countplot(x=df_all_train['Ca'], order=ca_order, color='lightblue', label='train')
sns.countplot(x=df_all_test['Ca'], order=ca_order, color='lightcoral', alpha=0.3, label='test')
plt.legend()
plt.show();

# F and G will probably hardly be learned, so they need to be checked.

In [None]:
# Inspect the rows whose cabin letter is F or G.
df_all[df_all['Ca'].isin(['F', 'G'])]
# G: the cabin number is the same (G6), so this should be fine.
# F: not only the letter but also the number of the cabin seems to matter;
#    whether passengers shared a cabin (= same family) looks important.
# Whether passengers stayed in the same cabin can also be seen from Ticket, not only from Cabin.
# Label-encoding Ticket might be worth trying. (the original note ends unfinished)

In [None]:
# Rows whose Cabin was missing (placeholder letter 'X').
df_all[df_all['Ca']=='X']

In [None]:
# Cabin-letter counts split by passenger class.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=df_all['Ca'], hue=df_all['Pclass'], palette='Set2', ax=ax)
plt.show()


# Author's reading of the plot:
# A, B, C, T : only Pclass=1 passengers
# F : Pclass=3
# D : Pclass=2
# E, G : Pclass 2, 3
# X : Pclass=1,2,3

# The counts per letter vary a lot, so regroup Ca into 5 categories (1,2,3,4,5).

In [None]:
# Regroup the cabin letters into 5 numeric categories in a single pass.
# The previous version looped over every row and called a whole-column
# replace(inplace=True) each time: quadratic work, mutation of the column while
# iterating over it, and a chained in-place update that pandas Copy-on-Write ignores.
ca_letter_to_group = {
    'A': 1, 'B': 1, 'C': 1, 'T': 1,
    'E': 2, 'G': 2,
    'D': 3,
    'F': 4,
    'X': 5,
}
# Values not in the mapping (including already-converted codes when the cell is
# re-run) are passed through unchanged.
df_all['Ca'] = df_all['Ca'].map(lambda letter: ca_letter_to_group.get(letter, letter))

# Display Ca as strings. The cast is NOT assigned back, so the column itself keeps
# the numeric codes that the models later read through X.values.
df_all['Ca'].astype(str)

In [None]:
# Preview the combined frame after the Cabin-letter regrouping.
df_all.head()

### 5.3 filling the missing values - Embarked
There are two missing values in Embarked; following the notebook (https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial), fill them with 'S'.

In [None]:
# Fill the missing Embarked values with 'S'.
df_all.loc[df_all['Embarked'].isnull(), 'Embarked'] = 'S'

### 5.4 missing value - Fare

In [None]:
# Show the rows whose Fare is missing.
df_all[df_all['Fare'].isnull()]
# Although there is one null in 'Fare', I don't think Fare is an important predictor,
# so the Fare column is simply dropped later.

## 6 Testing Baseline Models

In [None]:
# Split back into train/test to evaluate the baseline models.
# .copy() makes both frames independent of df_all, because the label-encoding
# step assigns new column values into them.
df_all_train = df_all[df_all.PassengerId <= 891].copy()
# drop(columns=...) replaces the positional axis argument, which pandas 2.0 rejects.
df_all_test = df_all[df_all.PassengerId > 891].drop(columns='Survived').copy()

In [None]:
# Check dtypes and non-null counts of both splits (original note: "no null values").
# NOTE(review): the single missing Fare found earlier is never imputed, so the
# test split presumably still reports one null in Fare — confirm.
# info() prints its report and returns None, so it is called directly;
# wrapping it in display() only added a stray "None" to the output.
df_all_train.info()
print('='*80)
df_all_test.info()

In [None]:
# Label-encode the categorical columns.
# Ticket was originally dropped, but it can be evidence of passengers staying in
# the same cabin, so label-encoding it is tried here.
cat_cols = ['Sex','Embarked', 'Ticket']

lbl = LabelEncoder()
for col in tqdm_notebook(cat_cols):
    # Fit on train + test together so both splits share one set of codes.
    temp_df = pd.concat([df_all_train, df_all_test])
    lbl.fit(temp_df[col])

    for split_frame in (df_all_train, df_all_test):
        split_frame[col] = lbl.transform(split_frame[col])

In [None]:
# Columns excluded from the feature matrix, and the target column.
drop_cols=['PassengerId', 'Survived', 'Name', 'Cabin', 'Fare']
target_cols='Survived'

# drop(columns=...) replaces `drop(drop_cols, 1)`: the positional axis argument
# is no longer accepted by pandas 2.0.
X = df_all_train.drop(columns=drop_cols)
y = df_all_train[target_cols]

In [None]:
# Preview the feature matrix and show the target.
display(X.head(), y)

In [None]:
# 1st baseline: RandomForest, scored with stratified 5-fold cross-validation.

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)


val_scores=list()

for i , (trn_idx, val_idx) in tqdm_notebook(enumerate(cv.split(X, y))):
    # cv.split yields positional row numbers, so labels are selected with .iloc.
    # Plain y[trn_idx] is a label lookup and only works while y's index is exactly 0..n-1.
    trn_data, trn_label = X.values[trn_idx, :], y.iloc[trn_idx]
    val_data, val_label = X.values[val_idx, :], y.iloc[val_idx]

    rf_model=RandomForestClassifier(
        n_estimators=1000,
        random_state=11,
        class_weight='balanced').fit(trn_data,trn_label)

    # score() is mean accuracy on the given data.
    trn_acc = rf_model.score(trn_data, trn_label)
    val_acc = rf_model.score(val_data, val_label)

    print(f'{i} Fold, train Accuracy : {trn_acc},  validation Accuracy ; {val_acc} ')

    val_scores.append(val_acc)


# Mean validation accuracy over the folds.
print(f'Cross Validation Score : {np.mean(val_scores)}')

In [None]:
# Baseline: XGBoost, scored on the same folds as the RandomForest baseline.

val_scores=list()

for i , (trn_idx, val_idx) in tqdm_notebook(enumerate(cv.split(X, y))):
    # cv.split yields positional row numbers, so labels are selected with .iloc.
    # Plain y[trn_idx] is a label lookup and only works while y's index is exactly 0..n-1.
    trn_data, trn_label = X.values[trn_idx, :], y.iloc[trn_idx]
    val_data, val_label = X.values[val_idx, :], y.iloc[val_idx]

    xgb_model=xgb.XGBClassifier(
        n_estimators=1000,
        subsample=0.3,
        reg_alpha=10,
        random_state=11).fit(trn_data,trn_label)

    # score() is mean accuracy on the given data.
    trn_acc = xgb_model.score(trn_data, trn_label)
    val_acc = xgb_model.score(val_data, val_label)

    print(f'{i} Fold, train Accuracy : {trn_acc},  validation Accuracy ; {val_acc} ')

    val_scores.append(val_acc)


# Mean validation accuracy over the folds.
print(f'Cross Validation Score : {np.mean(val_scores)}')

## 7. Feature Engineering - think of more features. 
- Fam_size  = Sib + Parch 
- Title  
- SibSp and Parch: I think these are important features, since whether a passenger was travelling with parents (i.e. was a child) and whether they were older or younger did affect the survival rates.

In [None]:
# Following many other notebooks, create the Fam_size and Title columns.

# Family size = siblings/spouses + parents/children (the passenger is not counted).
df_all['Fam_size'] = df_all['SibSp'].add(df_all['Parch'])

# Title = the text between ', ' and the first '.' of Name.
after_comma = df_all['Name'].str.split(', ', expand=True)[1]
df_all['Title'] = after_comma.str.split('.', expand=True)[0]

In [None]:
# Re-split so both frames include the new Fam_size and Title columns.
df_all_train = df_all[df_all.PassengerId <= 891]
# drop(columns=...) replaces the positional axis argument, which pandas 2.0 rejects.
df_all_test = df_all[df_all.PassengerId > 891].drop(columns='Survived')

In [None]:
# Frequency of each extracted Title in the training rows.
df_all_train['Title'].value_counts()

In [None]:
# Frequency of each extracted Title in the test rows.
df_all_test['Title'].value_counts()

# The titles need to be narrowed down to Mr, Mrs, Miss:
# Master, Major, Rev, Col, Dr, Jonkheer, Sir, Don, Capt -> Mr
# Dona, Mme -> Mrs
# Mlle, Ms, Lady -> Miss

In [None]:
# Collapse the rare titles into Mr / Mrs / Miss with one mapping pass.
# The previous version looped over every row and called a whole-column
# replace(inplace=True) each time: quadratic work, mutation of the column while
# iterating over it, and a chained in-place update that pandas Copy-on-Write ignores.
title_groups = {
    'Mr': ['Master', 'Major', 'Rev', 'Col', 'Dr', 'Jonkheer', 'Sir', 'Don', 'Capt'],
    'Mrs': ['Dona', 'Mme', 'the Countess'],
    'Miss': ['Mlle', 'Ms', 'Lady'],
}
rare_to_common = {
    rare: common for common, rares in title_groups.items() for rare in rares
}
# Titles not listed above are kept as they are.
df_all['Title'] = df_all['Title'].map(lambda title: rare_to_common.get(title, title))

df_all['Title'].value_counts()

## 8.Testing Models with XGBoost 

In [None]:
# Split back into train/test to evaluate the models.
# .copy() makes both frames independent of df_all, because the label-encoding
# step assigns new column values into them.
df_all_train = df_all[df_all.PassengerId <= 891].copy()
# drop(columns=...) replaces the positional axis argument, which pandas 2.0 rejects.
df_all_test = df_all[df_all.PassengerId > 891].drop(columns='Survived').copy()

In [None]:
# Preview the training rows, including the engineered columns.
df_all_train.head(3)

In [None]:
# Label-encode the categorical columns used by the final model.
cat_cols = ['Sex','Embarked', 'Title']

lbl = LabelEncoder()
for col in tqdm_notebook(cat_cols):
    # Fit on train + test together so both splits share one set of codes.
    temp_df = pd.concat([df_all_train, df_all_test])
    lbl.fit(temp_df[col])

    for split_frame in (df_all_train, df_all_test):
        split_frame[col] = lbl.transform(split_frame[col])

In [None]:
# Training features and target.
drop_cols=['PassengerId','Ticket', 'Survived', 'Name', 'Cabin', 'Fare']
target_cols='Survived'

# drop(columns=...) replaces `drop(drop_cols, 1)`: the positional axis argument
# is no longer accepted by pandas 2.0.
X = df_all_train.drop(columns=drop_cols)
y = df_all_train[target_cols]

In [None]:
# Preview the features and show the target cast to integers.
display(X.head(3), y.astype(int))

In [None]:
# XGBoost on the engineered features, scored with the `cv` splitter defined earlier.
val_scores=list()

for i , (trn_idx, val_idx) in tqdm_notebook(enumerate(cv.split(X, y))):
    # cv.split yields positional row numbers, so labels are selected with .iloc.
    # Plain y[trn_idx] is a label lookup and only works while y's index is exactly 0..n-1.
    trn_data, trn_label = X.values[trn_idx, :], y.iloc[trn_idx]
    val_data, val_label = X.values[val_idx, :], y.iloc[val_idx]

    xgb_model=xgb.XGBClassifier(
        n_estimators=1000,
        subsample=0.3,
        reg_alpha=10,
        random_state=11).fit(trn_data,trn_label)

    # score() is mean accuracy on the given data.
    trn_acc = xgb_model.score(trn_data, trn_label)
    val_acc = xgb_model.score(val_data, val_label)

    print(f'{i} Fold, train Accuracy : {trn_acc},  validation Accuracy ; {val_acc} ')

    val_scores.append(val_acc)


# Mean validation accuracy over the folds.
print(f'Cross Validation Score : {np.mean(val_scores)}')

## 9. test data import, preprocessing

In [None]:
# test.csv is not re-read here: df_all_test was split out of df_all, so it has
# already been through the same preprocessing as the training rows.
# (An earlier draft reloaded test.csv and plotted its missingno matrix.)
df_all_test.head(3)

In [None]:
# Test features: drop the same non-feature columns as for training
# ('Survived' was already removed from df_all_test).
drop_cols=['PassengerId','Ticket', 'Name', 'Cabin', 'Fare']

# drop(columns=...) replaces `drop(drop_cols, 1)`: the positional axis argument
# is no longer accepted by pandas 2.0.
X_test = df_all_test.drop(columns=drop_cols)
# Preview only the first rows instead of dumping the whole frame.
X_test.head()

In [None]:
# Train and test feature columns must match before predicting.
display(X.columns, X_test.columns)

In [None]:
# The `xgb_model` left over from the cross-validation loop was fitted on the last
# fold's training split only (4 of the 5 folds). Refit the same configuration on
# all training rows before predicting the test set.
xgb_model = xgb.XGBClassifier(
    n_estimators=1000,
    subsample=0.3,
    reg_alpha=10,
    random_state=11).fit(X.values, y.astype(int))

# Select the columns in training order so the features line up positionally with X.values.
y_preds = xgb_model.predict(X_test[X.columns].values).astype(int)

In [None]:
# Build the submission table, write it without the index, and read it back as a check.
submission_path = 'submission.csv'
submission = pd.DataFrame({
    'PassengerId': df_all_test['PassengerId'],
    'Survived': y_preds,
})
submission.to_csv(submission_path, index=False)
pd.read_csv(submission_path)