# Titanic Dataset 

Using decision Tree to solve Titanic survival datset 

Import the necessary libraries 


In [73]:
import numpy as np 
import pandas as pd 
import re
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [74]:
train_data = pd.read_csv('./data/train.csv')
test_data  = pd.read_csv('./data/test.csv')

complete_data = [train_data,test_data]
passenger_id = test_data['PassengerId']

#Keep the original data
original_train = train_data.copy()
original_train = test_data.copy()

train_data['Cabin'] = train_data['Cabin'].apply(lambda x:0 if type(x)==float else 1)
test_data['Cabin']  = test_data['Cabin'].apply(lambda x:0 if type(x)==float else 1)

for data in complete_data:
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    
    data['Embarked'] = data['Embarked'].fillna('S')
    
    data['Fare'] = data['Fare'].fillna(train['Fare'].median())

for data in complete_data:
    avg_age = data['Age'].mean()
    std_age = data['Age'].std()
    null_age_count = data['Age'].isnull().sum()
    random_age_list = np.random.randint(avg_age-std_age,avg_age+std_age,null_age_count)
    data.loc[np.isnan(data['Age']), 'Age'] = random_age_list
    data['Age'] = data['Age'].astype(int)

def get_title(name):
    title_search = re.search('([A-Za-z]+)\.',name)
    if title_search:
        return title_search.group(1)
    else :
        return ""
    
for data in complete_data:
    data['Title'] = data['Name'].apply(get_title)
    
for dataset in complete_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in complete_data:
    dataset['Sex'] = dataset['Sex'].map({"male":1,"female":0})
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare']                               = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age']                          = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] ;
    
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    

In [78]:
print 
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train_data.drop(drop_elements, axis = 1)
test  = test_data.drop(drop_elements, axis = 1)

# Finding best depth with the help of Cross Validation 


This is the important part of building decision tree. We need to find the tree with the depth, such that it 
fits the data without overfitting. We will try depths from 1 to max features and use the tree with the depth 
that gives best validation score

In [98]:
cv =KFold(n_splits=10)
accuracies = list()
max_attributes = len(list(test_data))
depth_range = range(1,max_attributes+1)

for depth in depth_range:
    fold_accuraacy= []
    tree_model = tree.DecisionTreeClassifier(max_depth=depth)
    for train_fold, valid_fold in cv.split(train):
        f_train = train.loc[train_fold] # Extract train data with cv indices
        f_valid = train.loc[valid_fold]
        
        model = tree_model.fit(X = f_train.drop(['Survived'], axis=1), 
                               y = f_train["Survived"]) 
        val_score = model.score(X = f_train.drop(['Survived'], axis=1), 
                               y = f_train["Survived"]) 
        
        fold_accuraacy.append(val_score)
        
    avg = sum(fold_accuraacy)/len(fold_accuraacy)
    accuracies.append(avg)
df =pd.DataFrame({"Max Depth":depth_range,"Accuracy":accuracies})  
df = df[["Max Depth","Accuracy"]]
print (df.to_string())    

    Max Depth  Accuracy
0           1  0.782267
1           2  0.806583
2           3  0.829405
3           4  0.836139
4           5  0.847363
5           6  0.861081
6           7  0.873551
7           8  0.881906
8           9  0.887767
9          10  0.890760
10         11  0.893129
11         12  0.894625
12         13  0.896122
