## Titanic v1.0

  1.2 [Load data](#load-data)



## 1 - Preliminaries

<a id='libraries'></a>
### 1.1 - Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost as xgb
from sklearn import cross_validation
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV





### 1.2 - Load data

In [2]:
df_train_raw = pd.read_csv('train.csv')
df_test_raw = pd.read_csv('test.csv')

print("df_train_raw shape =", df_train_raw.shape)
print("df_test_raw shape =", df_test_raw.shape)

df_train_raw shape = (891, 12)
df_test_raw shape = (418, 11)


<a id='combine'></a> 
### 1.3 - Combine data 

In [3]:
# Save the survived column as y for later and drop it from the train data
y_train = df_train_raw.Survived
x_train_raw = df_train_raw.drop(['Survived'], axis = 1)

# Join the X data from the train and test files, for feature processing
X_all = pd.concat((x_train_raw, df_test_raw), axis = 0, ignore_index=True)

# Save a copy of X_all for later comparison
X_all_original = X_all

print('data shape = ', X_all.shape)
X_all.head()

data shape =  (1309, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<a id='check-nan'></a> 
### 1.4 - Check for NaN's

In [4]:
X_all.isnull().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

<a id='feature-eng'></a> 
## 2 - Feature engineering
<a id='name'></a> 
### 2.1 - Column "Name"

In [5]:
# Analysis and preparation of column "Name":

names = X_all.Name

print("There are " + str(pd.isnull(names).sum()) + " nan's in the name column")

There are 0 nan's in the name column


No missing names. The next step is to split each name into title, first name and surname.

In [6]:
for i in range(len(names)):
    
    full_name = str(names[i])
    
    # First split: separate the surname from the rest of the name
    name_split_1 = full_name.split(sep = ', ', maxsplit=1)
    surname = name_split_1[0]
    title_and_name = name_split_1[1]
    
    # Second split: separate the title from the rest of the name
    name_split_2 = title_and_name.split(sep = '. ', maxsplit=1)
    
    title = name_split_2[0]
    first_name = name_split_2[1]
    
    
    if(i==0):
        titles = np.array(title).reshape(1,1)
        first_names = np.array(first_name).reshape(1,1)
        surnames = np.array(surname).reshape(1,1)
        
    else:
        titles = np.append(titles, np.array(title).reshape(1,1), axis = 0)
        first_names = np.append(first_names, np.array(first_name).reshape(1,1), axis = 0)
        surnames = np.append(surnames, np.array(surname).reshape(1,1), axis = 0)
        
print("There are ", np.unique(titles).shape[0], " unique titles: ")
print(str(np.unique(titles)))

# add the title, first and last name to X_all
X_all['Title'] = titles
X_all['FirstName'] = first_names
X_all['Surname'] = surnames

X_all.head()

There are  18  unique titles: 
['Capt' 'Col' 'Don' 'Dona' 'Dr' 'Jonkheer' 'Lady' 'Major' 'Master' 'Miss'
 'Mlle' 'Mme' 'Mr' 'Mrs' 'Ms' 'Rev' 'Sir' 'the Countess']


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FirstName,Surname
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Owen Harris,Braund
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,John Bradley (Florence Briggs Thayer),Cumings
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Laina,Heikkinen
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Jacques Heath (Lily May Peel),Futrelle
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,William Henry,Allen


Having 18 unique titles could cause trouble for the model if some of them are very infrequent.

Let's analyse of the frequency of the titles and respective survival in the training data

In [7]:
title_df = X_all[0:891]
title_df['Survived'] = y_train
title_df.head()

#plt.figure(figsize=(10,5))
sns.factorplot('Title',data=title_df,hue='Survived',kind='count', size = 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


NameError: name 'sns' is not defined

In [None]:
# to make the job easier for the model, some titles will be grouped together with the most frequent ones

X_all['Title'] = X_all['Title'].map({'Don': 'Mr',
                                     'Rev': 'Mr', 
                                     'Jonkheer': 'Mr', 
                                     'Capt': 'Mr', 
                                     'Mme': 'Mrs', 
                                     'Ms': 'Mrs', 
                                     'Lady': 'Miss', 
                                     'Sir': 'Mr', 
                                     'Mlle': 'Mrs', 
                                     'Mr': 'Mr',
                                     'Mrs': 'Mrs',
                                     'Miss': 'Miss',
                                     'Master': 'Master',
                                     'Dr': 'Dr',
                                     'Major': 'Major',
                                     'Col': 'Col',
                                     'the Countess': 'Mrs',
                                     'Dona': 'Mrs',
                                    })

In [None]:
X_all.head()

In [None]:
# Analysis and preparation of column "Age":
# Passenger's age

age = X_all.Age
print("There are " + str(pd.isnull(age).sum()) + " nan's in the age column")

#age = age.reshape(age.shape[0],1)

There are a lot of nan's in the age column and this could be a problem. These nan's will be replaced by -4 and where a value is available for the age, it will be separated in ranges.

In [None]:
# The nan's will be replaced with -4
age_ = np.zeros_like(age, dtype='float64')

for i in range(len(age)):
    #print(age[i])
    if (pd.isnull(age[i])):
        age_[i] = np.float64(-4.0)
    else:
        age_[i] = np.float64(age[i])

# Split the ages into 10 bins
bins = np.array([-10.0, 0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0]) 

age_ = age_.reshape(age_.shape[0],)
age_bins = np.digitize(age_, bins)

# Convert to one hot
#age_bins = age_bins.reshape(age_bins.shape[0],)
#age_bins_onehot = pd.get_dummies(age_bins)
#age_bins_onehot = age_bins_onehot.values

# Put the age column back into the dataset
X_all.Age = age_bins

<a id='ticket'></a> 
### 2.2 - Column "Ticket"

In [None]:
tickets = X_all.Ticket

print("There are " + str(pd.isnull(tickets).sum()) + " nan's in the ticket column")

No missing values in the ticket column. 
The values will be split into prefix (where available) and number.

In [None]:
for i in range(len(tickets)):
    
    # get entry i of column 'ticket'
    full_ticket = str(tickets[i])
    
    # Separate the prefix from the ticket number
    ticket_split_1 = full_ticket.split(sep = ' ', maxsplit=1)
    
    # if there's no prefix, use -999
    if(len(ticket_split_1) == 1):
        # if there's no prefix, fill with 'nan' so it matches with the dtype of the elemenst in the column (str)
        prefix = 'nan'
        number = ticket_split_1[0]
    else:
        prefix = ticket_split_1[0]
        number = ticket_split_1[1]
    
    # append both strings to the respective arrays
    if(i==0):
        ticket_prefix = np.array(prefix).reshape(1,1)
        ticket_number = np.array(number).reshape(1,1)
    else:
        ticket_prefix = np.append(ticket_prefix, np.array(prefix).reshape(1,1), axis = 0)
        ticket_number = np.append(ticket_number, np.array(number).reshape(1,1), axis = 0)

# add the ticket prefix and number to X_all
X_all['TicketPrefix'] = ticket_prefix
X_all['TicketNumber'] = ticket_number

<a id='cabin'></a> 
### 2.3 - Column "Cabin"

In [None]:
#Analisys of the 'cabin' column

cabins = X_all.Cabin
print("There are " + str(pd.isnull(cabins).sum()) + " nan's in the Cabin column")

In [None]:
for i in range(len(cabins)):
    
    # get entry from column 'cabin'
    full_cabin = str(cabins[i])
    
    #print(full_cabin)
    
    # First, check if the entry i is NaN
    if(full_cabin == 'nan'):
    # no cabin
        reserve = 0
        multiple = 0
        letter = 'Z'
    else:
        # passenger has a cabin
        reserve = 1
        
        # check if there is more than one cabin reserved
        cabin_split_1 = full_cabin.split(sep = ' ', maxsplit=1)
        
        if(len(cabin_split_1) > 1):
            multiple = 1
        else:
            multiple = 0
        
        # check the letter, corresponding to the level in the ship
        
        #cabin_split_2 = full_cabin.split(sep = '', maxsplit=1)
        letter = full_cabin[0]
        
    #print(full_cabin, ' || reserve = ', reserve, 'multiple = ', multiple, 'letter = ', letter)
    #print(cabin_split_1)
    #print('======')
    
    # append both strings to the respective arrays
    if(i==0):
        cabin_reserve = np.array(reserve).reshape(1,1)
        cabin_multiple = np.array(multiple).reshape(1,1)
        cabin_letter = np.array(letter).reshape(1,1)
    else:
        cabin_reserve = np.append(cabin_reserve, np.array(reserve).reshape(1,1), axis = 0)
        cabin_multiple = np.append(cabin_multiple, np.array(multiple).reshape(1,1), axis = 0)
        cabin_letter = np.append(cabin_letter, np.array(letter).reshape(1,1), axis = 0)

# add the ticket prefix and number to X_all
X_all['CabinReserve'] = cabin_reserve
X_all['CabinMultiple'] = cabin_multiple
X_all['CabinLetter'] = cabin_letter

<a id='encoding'></a> 
### 2.2 - Variable encoding

In [None]:
# Save a copy of X_all before encoding
X_all_before_encoding = X_all

# Convert categorical features into ordinal numbers.


def convert_to_int(column):
    # convert -column- to integer
    
    # replace nan's with -999
    column[pd.isnull(column)]  = 'NaN'
    
    # encode the column to integers
    le = LabelEncoder()
    le.fit(column.unique())
    column_int = le.transform(column)
    return column_int

In [None]:
X_all.head()

In [None]:
aaa = convert_to_int(X_all.CabinLetter)

In [None]:
aaa

In [None]:
# Name, Ticket might not be very useful, consider removing later
X_all.Sex = convert_to_int(X_all.Sex)
X_all.Embarked = convert_to_int(X_all.Embarked)
X_all.Title = convert_to_int(X_all.Title)
X_all.Surname = convert_to_int(X_all.Surname)
X_all.TicketPrefix = convert_to_int(X_all.TicketPrefix)
X_all.CabinLetter = convert_to_int(X_all.CabinLetter)

#X_all.Name = convert_to_int(X_all.Name)
#X_all.Name = convert_to_int(X_all.Name)

# Drop Name, Ticket, Cabin, FirstName, TicketNumber
X_all = X_all.drop(['Name'], axis = 1)
X_all = X_all.drop(['Ticket'], axis = 1)
X_all = X_all.drop(['Cabin'], axis = 1)
X_all = X_all.drop(['FirstName'], axis = 1)
X_all = X_all.drop(['TicketNumber'], axis = 1)

#X_all = X_all.drop(['PassengerId'], axis = 1)

<a id='before-after'></a> 
### 2.2 - Before and after

The original data, before being processed:

In [None]:
X_all_original.head()

After preprocessing, before variable encoding:

In [None]:
X_all_before_encoding.head()

After variable encoding and dropping some of the columns:

In [None]:
X_all.head()

<a id='split-data'></a> 
### 2.2 - Split data

In [None]:
X_train = X_all[0:891]
X_submission = X_all[891:]

<a id='model'></a> 
## 3 - Model

<a id='cv'></a>
### 3.1 - Cross Validation

In [None]:
regr = xgb.XGBClassifier()

tuned_parameters = [{'eval_metric': ['logloss'],
                     'objective': ["binary:logistic"],
                     'max_depth': [8, 9, 10, 11], 
                     'n_estimators': [1000], 
                     'eta': [0.1, 0.2, 0.3],
                     'reg_lambda': [0.5, 1.5, 2.5],
                     'nthread': [4],
                     'subsample': [1],
                     'colsample_bytree': [0.3, 0.5, 0.7],
                     'alpha': [0.5, 1.5, .5],
                     'min_child_weight': [2, 4, 6, 8]}]
n_folds = 5

In [None]:
regr_grid = GridSearchCV(regr, tuned_parameters, cv=n_folds, refit=False, verbose = 1)
#regr_grid.fit(X_train, y_train)

In [None]:
print("Best parameters set found on development set:")
print(regr_grid.best_params_)
print("Best score:")
print(regr_grid.best_score_ )

<a id='fitting'></a>
### 3.2 - Fitting

In [None]:
print("Grid scores on development set:")
print()
means = regr_grid.cv_results_['mean_test_score']
stds = regr_grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, regr_grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

In [None]:
# fit using the optimal settings

regr_optimum = xgb.XGBClassifier(eval_metric= 'logloss', 
                         objective= "binary:logistic",
                         max_depth= 9,
                         n_estimators = 1000
                         eta= 0.1,
                         nthread = 4,
                         subsample= 1,
                         colsample_bytree= 0.3,
                         reg_lambda= 2.5,
                         alpha= 0.5,
                         min_child_weight= 8
                         )

regr_optimum.fit(X_train, y_train)

<a id='submission'></a>
## 4 - Submission

<a id='sub-file'></a>
### 4.1 - File preparation

In [None]:
y_pred_subm = regr_optimum.predict(X_submission)

submission_PassengerId = pd.DataFrame(df_test_raw.PassengerId)
submission_Survived = pd.DataFrame(y_pred_subm)

submission = pd.concat((submission_PassengerId, submission_Survived), axis = 1)
submission.columns = ['PassengerId', 'Survived']
submission.head()

In [None]:
submission.to_csv('submission.csv', index = False)

<a id='score'></a>
### 4.2 - Score

![Logistic Regression score](https://raw.githubusercontent.com/jgamboias/Titanic/master/logreg_submission.png)