In [1]:
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

# Render matplotlib figures directly inside the notebook output cells.
%matplotlib inline
# Seed NumPy's global random number generator so any NumPy-based randomness
# in later cells is repeatable across runs.
np.random.seed(1)

In [2]:
# Load the training and test splits from disk.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where this directory layout exists.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('C://erdongguo/titanic/train.csv',
                     'C://erdongguo/titanic/test.csv')
)
# `combine` lets later cells apply the same transformation to both splits.
combine = [train, test]

In [3]:
# Derive a 'Title' column from each passenger's name: the run of letters that
# immediately precedes a full stop (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped literal dot; in a plain string `\.` is an invalid escape sequence
# that Python warns about.
for dataset in combine:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
# Cross-tabulate the extracted titles against sex for the training split.
pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [4]:
# Normalise the extracted titles in a single pass:
#   * uncommon honorifics are pooled into one 'Rare' bucket;
#   * 'Mlle' and 'Ms' are folded into 'Miss', and 'Mme' into 'Mrs'.
# None of the replacement values is itself a key, so one combined mapping
# gives the same result as applying the replacements one after another.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
title_replacements = {title: 'Rare' for title in rare_titles}
title_replacements.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_replacements)
# Mean survival per normalised title on the training split.
train[['Title', 'Survived']].groupby('Title', as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [5]:
# Encode each title as an integer code.
title_mapping = {'Mr': 1, "Miss": 2, 'Mrs': 3, "Master": 4, 'Rare': 5}
for dataset in combine:
    # Titles missing from the mapping become NaN under `.map`; they are then
    # given code 1, i.e. the same code as 'Mr'.
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(1)

In [6]:
# The raw 'Name' column is dropped from both splits. `drop` returns new
# frames, so `combine` is rebuilt to point at them.
train, test = (frame.drop(['Name'], axis=1) for frame in (train, test))
combine = [train, test]
train.shape, test.shape

((891, 12), (418, 11))

In [7]:
for dataset in combine:
    # Missing ages are replaced by the median age of the same split.
    age_missing = dataset['Age'].isnull()
    dataset.loc[age_missing, 'Age'] = dataset['Age'].median()
    # Turn 'Cabin' into a presence flag: 1 where a value was recorded,
    # 0 where it was missing. The mask is taken before either write.
    has_cabin = dataset['Cabin'].notnull()
    dataset.loc[has_cabin, 'Cabin'] = 1
    dataset.loc[~has_cabin, 'Cabin'] = 0

In [8]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [9]:
# Interaction feature: element-wise product of age and passenger class.
for dataset in combine:
    dataset['Age * Class'] = dataset['Age'].mul(dataset['Pclass'])

In [10]:
# Print how often each embarkation port occurs in each split.
for dataset in combine:
    port_counts = dataset['Embarked'].value_counts()
    print(port_counts)

S    644
C    168
Q     77
Name: Embarked, dtype: int64
S    270
C    102
Q     46
Name: Embarked, dtype: int64


In [11]:
# Integer-encode 'Sex' and 'Embarked' for every split.
# Each encoder is fitted ONCE on the values pooled from all splits and then
# applied to each split. Fitting a fresh encoder per split (as before) lets
# the same category receive different codes in train and test whenever the
# two splits do not contain exactly the same set of categories.
# Missing 'Embarked' values are treated as 'S' before encoding.
sex_binarizer = LabelBinarizer().fit(
    pd.concat([dataset['Sex'] for dataset in combine]))
embarked_encoder = LabelEncoder().fit(
    pd.concat([dataset['Embarked'].fillna('S') for dataset in combine]))

sex_label_data_list = []
embarked_label_data_list = []
for dataset in combine:
    sex_label_data = sex_binarizer.transform(dataset['Sex'])
    sex_label_data_list.append(sex_label_data)
    embarked_label_data = embarked_encoder.transform(dataset['Embarked'].fillna('S'))
    embarked_label_data_list.append(embarked_label_data)

In [12]:
# New list holding the same DataFrame objects as `combine` (no copies are
# made, so later in-place edits are visible through both lists).
filled_data_list = list(combine)

In [13]:
def get_most_common(series):
    """Return the value that occurs most often in ``series``.

    The answer is the first label of ``series.value_counts()``, which is
    ordered from most to least frequent and, by default, ignores missing
    values. An empty result (no non-missing values) raises ``IndexError``.
    """
    frequency_table = series.value_counts()
    return frequency_table.index[0]

In [14]:
# Fill missing embarkation ports with the most frequent port of the same split.
for dataset in filled_data_list:
    embarked_missing = dataset['Embarked'].isnull()
    most_common = get_most_common(dataset['Embarked'])
    dataset.loc[embarked_missing, 'Embarked'] = most_common

In [15]:
# One-hot encode the categorical columns of every split. Each list holds one
# indicator frame per entry of `filled_data_list`, in the same order.
dummy_cabin_list = [pd.get_dummies(dataset['Cabin'], prefix='Cabin')
                    for dataset in filled_data_list]
dummy_sex_list = [pd.get_dummies(dataset['Sex'], prefix='Sex')
                  for dataset in filled_data_list]
dummy_embarked_list = [pd.get_dummies(dataset['Embarked'], prefix='Embarked')
                       for dataset in filled_data_list]

In [16]:
X_train_origin, X_test_origin = filled_data_list[0], filled_data_list[1]

# The test split's missing fares are filled with that split's own median.
fare_missing = X_test_origin['Fare'].isnull()
X_test_origin.loc[fare_missing, 'Fare'] = X_test_origin['Fare'].median()

# Standardise each continuous column. The scaler is fitted on the training
# split only and then applied to both splits, so the test data is
# transformed with the training mean and standard deviation.
for column in ['Fare', 'Age', 'Age * Class']:
    scaler = StandardScaler().fit(X_train_origin[[column]])
    X_train_origin[column] = scaler.transform(X_train_origin[[column]])
    X_test_origin[column] = scaler.transform(X_test_origin[[column]])

In [17]:
# Attach the one-hot indicator frames to each split, then remove the source
# categorical columns plus the identifier/ticket columns.
columns_to_drop = ['Ticket', 'Cabin', 'Sex', 'Embarked', 'PassengerId']
prepared_data_X_list = []
for base, cabin, sex, embarked in zip(filled_data_list, dummy_cabin_list,
                                      dummy_sex_list, dummy_embarked_list):
    widened = pd.concat([base, cabin, sex, embarked], axis=1)
    prepared_data_X_list.append(widened.drop(columns_to_drop, axis=1))

In [18]:
# Print column names, non-null counts and dtypes of the prepared training frame.
prepared_data_X_list[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
Survived       891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Title          891 non-null int64
FamilySize     891 non-null int64
Age * Class    891 non-null float64
Cabin_0        891 non-null uint8
Cabin_1        891 non-null uint8
Sex_female     891 non-null uint8
Sex_male       891 non-null uint8
Embarked_C     891 non-null uint8
Embarked_Q     891 non-null uint8
Embarked_S     891 non-null uint8
dtypes: float64(3), int64(6), uint8(7)
memory usage: 68.8 KB


In [19]:
# Print column names, non-null counts and dtypes of the prepared test frame.
prepared_data_X_list[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 15 columns):
Pclass         418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Title          418 non-null int64
FamilySize     418 non-null int64
Age * Class    418 non-null float64
Cabin_0        418 non-null uint8
Cabin_1        418 non-null uint8
Sex_female     418 non-null uint8
Sex_male       418 non-null uint8
Embarked_C     418 non-null uint8
Embarked_Q     418 non-null uint8
Embarked_S     418 non-null uint8
dtypes: float64(3), int64(5), uint8(7)
memory usage: 29.1 KB


In [20]:
# Separate the target from the training features; the test frame is used
# unchanged.
train_prepared = prepared_data_X_list[0]
X_test = prepared_data_X_list[1]
Y_train_origin = train_prepared['Survived']
X_train = train_prepared.drop('Survived', axis='columns')

In [22]:
# Pair every training label with its passenger id.
# The id column is spelled 'PassengerId' to match the source data; the
# previous key 'PassegnerId' was a typo that ended up as the header of the
# exported label file.
# NOTE(review): anything that already reads the exported labels by the
# misspelled header needs updating — confirm with consumers.
Y_train = pd.DataFrame({'PassengerId': train['PassengerId'], 'Survived': Y_train_origin})

In [23]:
# Write the prepared features and labels to disk without the index column.
# NOTE(review): absolute, machine-specific output paths.
for frame, destination in (
    (X_train, 'C://erdongguo/titanic/X_train_v2.csv'),
    (Y_train, 'C://erdongguo/titanic/Y_train_v2.csv'),
    (X_test, 'C://erdongguo/titanic/X_test_v2.csv'),
):
    frame.to_csv(destination, index=False)