In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old names, so use whichever spelling is installed.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
# Apply the seaborn theme with enlarged fonts for every figure below.
sns.set(font_scale=2.5)

import missingno as msno
import warnings
warnings.filterwarnings('ignore')

#matplotlib inline

In [None]:
# Read the training and test tables from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
df_train.describe()

In [None]:
for col in df_train.columns:
    ratio=100*(df_train[col].isnull().sum()/df_train[col].shape[0])
    msg='column:{:>10}\t Percent of NaN value: {:.2f}%'.format(col,ratio)
    print(msg)

In [None]:
for col in df_test.columns:
    ratio=100*(df_test[col].isnull().sum()/df_test[col].shape[0])
    msg='column:{:>10}\t Percent of NaN value: {:.2f}%'.format(col,ratio)
    print(msg)

In [None]:
msno.matrix(df=df_train.iloc[:,:],figsize=(8,8), color=(0.2,0.4,0.6))

In [None]:
msno.bar(df=df_train.iloc[:,:],figsize=(8,8))

In [None]:
# Left panel: share of each Survived value; right panel: raw counts.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0])
ax[0].set_title('Pie plt - Survived')
ax[0].set_ylabel('')
# The column is passed as `x=`: recent seaborn releases take `data` as the first
# positional parameter, so a positional column name collides with `data=`.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plt - Survived')
plt.show()

In [None]:
df_train[['Pclass','Survived']].groupby(['Pclass'], as_index=True).count()

In [None]:
df_train[['Pclass','Survived']].groupby(['Pclass']).sum()

In [None]:
pd.crosstab(df_train['Pclass'],df_train['Survived'],margins=True)

In [None]:
df_train[['Pclass','Survived']].groupby(['Pclass'],as_index=True).mean().sort_values(by='Survived',ascending=True).plot.bar()

In [None]:
# Left panel: passengers per class; right panel: survived/dead counts per class.
y_position = 1.04
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train['Pclass'].value_counts().plot.bar(ax=ax[0])
ax[0].set_title('Number of passengers by pclass', y=y_position)
ax[0].set_ylabel('Count')
# Column passed as `x=` because recent seaborn no longer accepts it positionally.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass:survived vs Dead', y=y_position)
plt.show()

## 3.EDA-Sex

In [None]:
# Left panel: mean of Survived per sex; right panel: survived/dead counts per sex.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
# Column passed as `x=` because recent seaborn no longer accepts it positionally.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()

In [None]:
df_train[['Sex','Survived']].groupby(['Sex'],as_index=True).mean().plot.bar()

In [None]:
pd.crosstab(df_train['Sex'], df_train['Survived'],margins=True).style.background_gradient(cmap='summer_r')

In [None]:
sns.factorplot('Pclass','Survived',hue='Sex', data=df_train,size=6,aspect=1.5)

In [None]:
sns.factorplot(x='Sex',y='Survived',col='Pclass',data=df_train,saturation=.5,size=9,aspect=1.5)

## 4. EDA - Age

In [None]:
# Print the maximum, minimum and mean of the Age column (messages are in Korean:
# oldest passenger / youngest passenger / average age).
train_age = df_train['Age']
oldest, youngest, average = train_age.max(), train_age.min(), train_age.mean()
print('제일 나이 많은 탑승객: {:.1f} years'.format(oldest))
print('제일 나이 어린 탑승객: {:.1f} years'.format(youngest))
print('평균 나이 : {:.1f} years'.format(average))


In [None]:
# fig,ax=plt.subplots(1,1,figsize=(9,5))
# sns.kdeplot(df_train[df_train['Survived']==1]['Age'],ax=ax)
# sns.kdeplot(df_train[df_train['Survived']==0]['Age'],ax=ax)
# plt.legend(['Survived==1','Survived==0'])
# plt.show()


In [None]:
# Overlay one Age density curve per passenger class on a single figure.
plt.figure(figsize=(8, 6))
for pclass in (1, 2, 3):
    df_train.loc[df_train['Pclass'] == pclass, 'Age'].plot(kind='kde')
plt.xlabel('Age')
plt.title('Age Distribution within classes')
# Legend entries follow the plotting order above.
plt.legend(['1st', '2nd', '3rd'])
plt.show()

In [None]:
# fig,ax=plt.subplots(1,1,figsize=(9,5))
# sns.kdeplot(df_train[(df_train['Survived']==1)& (df_train['Pclass']==1)]['Age'],ax=ax)
# sns.kdeplot(df_train[(df_train['Survived']==0)& (df_train['Pclass']==1)]['Age'],ax=ax)
# plt.legend(['Survived==1','Survived==0'])
# plt.title('1st class')
# plt.show()


In [None]:
# Survival rate among passengers younger than each age bound 1..79.
age_bounds = range(1, 80)
change_age_range_survival_ratio = []
# `i` is kept as a plain loop variable (not a comprehension) because the cell
# right after this one reads its final value.
for i in age_bounds:
    younger_than_i = df_train.loc[df_train['Age'] < i, 'Survived']
    # The mean of the 0/1 Survived flags is the survival rate of the group.
    change_age_range_survival_ratio.append(younger_than_i.mean())

plt.figure(figsize=(7, 7))
# Plot against the real bounds so the x axis reads as "age < x" instead of the
# zero-based list position (which was off by one).
plt.plot(age_bounds, change_age_range_survival_ratio)
plt.title('Survival rate change depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()

In [None]:
# Rows whose Age is below `i`. `i` is not defined in this cell: it is the loop
# variable left over from the `for i in range(1,80)` cell above, so this cell
# only works after that cell has run.
df_train[df_train['Age']<i]

## 5. Age, Sex, Pclass (violinplot)

In [None]:
# Split violins of Age by Survived: per passenger class (left) and per sex (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x/y are passed by keyword because recent seaborn no longer accepts them positionally.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))

sns.violinplot(x='Sex', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))
plt.show()

##  6. EDA - Embarked

In [None]:
# Mean of Survived per embarkation port, highest first.
f, ax = plt.subplots(1, 1, figsize=(7, 7))
survival_rate_by_port = df_train.groupby('Embarked')[['Survived']].mean()
survival_rate_by_port.sort_values(by='Survived', ascending=False).plot.bar(ax=ax)

In [None]:
# 2x2 grid of Embarked counts: overall, split by Sex, by Survived and by Pclass.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
# The column is passed as `x=` because recent seaborn no longer accepts it positionally.
sns.countplot(x='Embarked', data=df_train, ax=ax[0, 0])
ax[0, 0].set_title('(1) No. of Passengers Board')
sns.countplot(x='Embarked', hue='Sex', data=df_train, ax=ax[0, 1])
ax[0, 1].set_title('(2) male-Female split for embarked')
sns.countplot(x='Embarked', hue='Survived', data=df_train, ax=ax[1, 0])
ax[1, 0].set_title('(3) Embarked vs Survived')
sns.countplot(x='Embarked', hue='Pclass', data=df_train, ax=ax[1, 1])
ax[1, 1].set_title('(4) Embarked vs Pclass')

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

## 7.EDA-FamilySize

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger.
# It is built on BOTH frames: it stays in the training features after the column
# drop further down, so a test frame without it would have one feature fewer
# than the fitted model expects.
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
print('Maximum size of Family: ', df_train['FamilySize'].max())
print('Minimum size of Family: ', df_train['FamilySize'].min())

In [None]:
# FamilySize: passenger counts, survived/dead counts, and mean of Survived per size.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# The column is passed as `x=` because recent seaborn no longer accepts it positionally.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('(1) No of Passenger Boarded', y=1.02)
sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on FamilySize', y=1.02)
df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize', y=1.02)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

## 8.EDA-Fare,Cabin,Ticket

In [None]:
# Distribution of Fare, labelled with its skewness.
f, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness:{:.2f}'.format(df_train['Fare'].skew()), ax=ax)
# Without a legend the skewness label above is never drawn.
g.legend(loc='best')

In [None]:
def _log_fare(i):
    """Natural log of a positive fare; anything else (0, negative, NaN) becomes 0."""
    return np.log(i) if i > 0 else 0


# Fill missing test fares with the raw training median BEFORE the transform, so
# they are not silently turned into log-fare 0 by the `i > 0` guard.
raw_fare_median = df_train['Fare'].median()
df_test['Fare'] = df_test['Fare'].fillna(raw_fare_median)

# Both frames get the same transform; the model is later fit on the train Fare
# and must see test fares on the same scale.
# NOTE: this cell is not idempotent - re-running it applies the log again.
df_train['Fare'] = df_train['Fare'].map(_log_fare)
df_test['Fare'] = df_test['Fare'].map(_log_fare)

In [None]:
# Distribution of Fare again (after the transform cell above), labelled with its skewness.
f, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness:{:.2f}'.format(df_train['Fare'].skew()), ax=ax)
# Without a legend the skewness label above is never drawn.
g.legend(loc='best')


## 9. Feature Engineering

In [None]:
# Frequency of each distinct Ticket value in the training frame.
df_train['Ticket'].value_counts()

In [None]:
# Shape of the Age column, then the number of null entries in it.
print(df_train['Age'].shape)
df_train['Age'].isnull().sum()

In [None]:
# Title = the run of letters immediately followed by a '.' in the Name string.
# Raw string so `\.` is a regex escape rather than a string escape; the stray
# ')' that sat inside the character class is removed.
title_pattern = r'([A-Za-z]+)\.'
# expand=False returns a Series (one capture group) instead of a one-column frame.
df_train['Initial'] = df_train['Name'].str.extract(title_pattern, expand=False)
# The test titles come from the TEST names (they were taken from df_train before).
df_test['Initial'] = df_test['Name'].str.extract(title_pattern, expand=False)

In [None]:
# Initial x Sex contingency table of the training frame, shaded by value.
pd.crosstab(df_train['Initial'],df_train['Sex']).style.background_gradient(cmap='summer_r')

In [None]:
# Collapse the rarer titles into Miss / Mr / Mrs / Other (titles not listed here
# are left unchanged). One shared mapping keeps train and test consistent.
# NOTE(review): 'Dona' is mapped to 'Mr' exactly as before - confirm that is intended.
title_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Dona': 'Mr',
}
# Assign the result back instead of `inplace=True` on a column selection, which
# is chained assignment and does not update the frame under pandas Copy-on-Write.
df_train['Initial'] = df_train['Initial'].replace(title_groups)
df_test['Initial'] = df_test['Initial'].replace(title_groups)

In [None]:
df_train.groupby('Initial').mean()

In [None]:
df_train.groupby(['Initial'])['Survived'].mean().plot.bar()


In [None]:
df_all=pd.concat([df_train,df_test])

In [None]:
df_all.groupby('Initial').mean()

In [None]:
# Fill missing ages with one fixed value per title, identically on both frames.
imputed_age_by_title = {'Mr': 33, 'Mrs': 37, 'Master': 5, 'Miss': 22, 'Other': 45}
for frame in (df_train, df_test):
    for title, imputed_age in imputed_age_by_title.items():
        # Only rows that still lack an age and carry this title are touched.
        needs_age = frame['Age'].isnull() & (frame['Initial'] == title)
        frame.loc[needs_age, 'Age'] = imputed_age


## 10. Feature Engineering - Fill Null in Embarked

In [None]:
df_train['Embarked'].isnull().sum()

In [None]:
df_train['Embarked'].fillna('S',inplace=True)

In [None]:
df_train['Age_cat']=0

In [None]:
df_train.loc[df_train['Age']<10,'Age_cat']=0
df_train.loc[(df_train['Age']>=10)&(df_train['Age']<20),'Age_cat']=1
df_train.loc[(df_train['Age']>=20)&(df_train['Age']<30),'Age_cat']=2
df_train.loc[(df_train['Age']>=30)&(df_train['Age']<40),'Age_cat']=3
df_train.loc[(df_train['Age']>=40)&(df_train['Age']<50),'Age_cat']=4
df_train.loc[(df_train['Age']>=50)&(df_train['Age']<60),'Age_cat']=5
df_train.loc[(df_train['Age']>=60)&(df_train['Age']<70),'Age_cat']=6
df_train.loc[(df_train['Age']>=70),'Age_cat']=7

In [None]:
# Per-bucket mean of every numeric column. `numeric_only=True` is required on
# pandas >= 2.0, where averaging the remaining string columns raises.
df_train.groupby(['Age_cat']).mean(numeric_only=True)

In [None]:
def category_age(x):
    """Return the decade bucket code (0-7) for an age ``x``.

    Ages below 10 map to 0, [10, 20) to 1, ..., [60, 70) to 6. Any value that
    is not below 70 - including one that compares False against every bound,
    such as NaN - maps to 7.
    """
    for code, upper_bound in enumerate((10, 20, 30, 40, 50, 60, 70)):
        if x < upper_bound:
            return code
    return 7

In [None]:
# Bucket the ages of each frame with category_age.
df_train['Age_cat'] = df_train['Age'].apply(category_age)
# The test buckets come from the TEST ages (they were computed from df_train
# before, which assigned training passengers' ages to test rows by index).
df_test['Age_cat'] = df_test['Age'].apply(category_age)

## 11. Feature Engineering - Change string to categorical and Pearson Coefficient

In [None]:
df_train.Initial.unique()

In [None]:
df_train['Initial']=df_train['Initial'].map({'Master':0,'Miss':1,'Mr':2,'Mrs':3,'Other':4})
df_test['Initial']=df_test['Initial'].map({'Master':0,'Miss':1,'Mr':2,'Mrs':3,'Other':4})

In [None]:
df_train.Embarked.unique()

In [None]:
df_train['Embarked'].value_counts()

In [None]:
df_train['Embarked']=df_train['Embarked'].map({'C':0,'Q':1,'S':2})
df_test['Embarked']=df_test['Embarked'].map({'C':0,'Q':1,'S':2})

In [None]:
df_train.head()

In [None]:
df_train.Embarked.isnull().any()

In [None]:
df_train['Sex'].unique()

In [None]:
df_train['Sex']=df_train['Sex'].map({'female':0,'male':1})
df_test['Sex']=df_test['Sex'].map({'female':0,'male':1})

In [None]:
df_train.head()

In [None]:
heatmap_data = df_train[['Survived','Pclass','Sex','Fare','Embarked','FamilySize','Initial','Age_cat']]

In [None]:
colormap=plt.cm.BuGn
plt.figure(figsize=(19,12))
plt.title('Pearson Correlation of Features',y=1.05,size=15)
sns.heatmap(heatmap_data.astype(float).corr(),linewidths=0.1,vmax=1.0,
            square=True, cmap=colormap,annot=True,annot_kws={'size':16})

In [None]:
df_train.corr()

In [None]:
df_test.head()

## 12. Feature Engineering - One Hot encoding on the Initial and Embarked

In [None]:
#pd.get_dummies(df_train,columns=['Initial'],prefix='Initial')

In [None]:
df_train=pd.get_dummies(df_train,columns=['Initial'],prefix='Initial')
df_test=pd.get_dummies(df_test,columns=['Initial'],prefix='Initial')

In [None]:
df_train=pd.get_dummies(df_train,columns=['Embarked'],prefix='Embarked')
df_test=pd.get_dummies(df_test,columns=['Embarked'],prefix='Embarked')

In [None]:
df_train.head()

In [None]:
# Remove the columns that are not used as model features from both frames.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Age']
df_train = df_train.drop(columns=unused_columns)
df_test = df_test.drop(columns=unused_columns)

## 13. Model development - Machine learning (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Every training column except the label is a feature.
feature_columns = df_train.columns.drop('Survived')
x_train = df_train[feature_columns].values
target_label = df_train['Survived'].values
# Select the test features BY NAME in the training order. `.values` discards the
# column labels, so relying on the two frames happening to share a column order
# can silently feed the model misaligned features; a missing column now raises
# a KeyError here instead.
x_test = df_test[feature_columns].values

In [None]:
x_tr,x_vld,y_tr,y_vld=train_test_split(x_train,target_label,test_size=0.3,random_state=2018)


In [None]:
model=RandomForestClassifier()
model.fit(x_tr,y_tr)

In [None]:
prediction=model.predict(x_vld)

In [None]:
print('총 {}명 중 {:.2f}%'.format(y_vld.shape[0],100*metrics.accuracy_score(prediction,y_vld)))

In [None]:
# Labels the model predicted for the validation split.
prediction

In [None]:
# True labels of the validation split.
y_vld

In [None]:
# Shapes of the full training feature matrix and its label vector.
x_train.shape,target_label.shape

In [None]:
# Feature matrix built from the test frame.
x_test

## 14. Machine learning prediction - feature importance and prediction on test set

In [None]:
# Importance scores of the fitted model, one per column of the training matrix.
model.feature_importances_


In [None]:
from pandas import Series

In [None]:
feature_importance=model.feature_importances_
Series_feat_lmp=Series(feature_importance)

In [None]:
feature_importance

In [None]:
plt.figure(figsize=(8,8))
Series_feat_lmp.sort_values().plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()

In [None]:
df_train.columns

In [None]:
# Load the submission template.
# NOTE(review): this path points at ../input/ while train.csv and test.csv are
# read from the working directory - confirm where the file actually lives.
submission=pd.read_csv('../input/gender_submission.csv')


In [None]:
submission.head()

In [None]:
# Predict on the test feature matrix. This rebinds `prediction`, which held the
# validation-split predictions until now.
prediction=model.predict(x_test)

In [None]:
# Overwrite the template's Survived column with the model output.
submission['Survived']=prediction

In [None]:
# Write the submission without the row index. Python's boolean literal is
# `False`; the lowercase `false` raised NameError.
submission.to_csv('./my_first_submission.csv',index=False)