In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are available in the input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the test CSV into a DataFrame.
test_set = pd.read_csv('../input/test.csv')

In [None]:
# Load the training CSV into a DataFrame.
train_set = pd.read_csv('../input/train.csv')

In [None]:
# Data cleaning and feature engineering, applied the same way to train and test.
full_data = [train_set, test_set]

# Dedicated, seeded generator so the random age imputation below gives the same
# result on every Restart & Run All without touching NumPy's global random state.
age_rng = np.random.RandomState(42)

for dataset in full_data:
    # FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone is 1 exactly when FamilySize is 1, otherwise 0.
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    # Fill missing ages with random integers drawn from [mean - std, mean + std).
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    age_null_random_list = age_rng.randint(
        int(age_avg - age_std),   # randint expects integer bounds
        int(age_avg + age_std),
        size=int(age_missing.sum()),
    )
    # One .loc assignment instead of chained indexing (df['Age'][mask] = ...):
    # the chained form may write to a temporary copy and leave the NaNs in place.
    dataset.loc[age_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Five equal-width age intervals, computed on the training set only and before
# Age is overwritten with bin codes below.
train_set['CategoricalAge'] = pd.cut(train_set['Age'], 5)

for dataset in full_data:
    # Replace Age by an ordinal bin code: 0 (<=16), 1 (17-32), 2 (33-48),
    # 3 (49-64), 4 (>64). The codes 0-3 are all <= 16, so an already recoded row
    # can never match one of the later range conditions.
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4


In [None]:
# Re-key both frames by PassengerId; the column is moved into the index.
train_set = train_set.set_index(keys='PassengerId', drop=True)
test_set = test_set.set_index(keys='PassengerId', drop=True)

In [None]:
# SexBinary: 1 for 'male', 0 for any other value.
train_set['SexBinary'] = train_set['Sex'].apply(lambda x: 1 if x == 'male' else 0)
# Family: 1 when SibSp is greater than zero, otherwise 0.
train_set['Family'] = train_set['SibSp'].apply(lambda x: 1 if x > 0 else 0)
# Recode the label from {0, 1} to {-1, +1}, the encoding the tree code expects.
# The replace is applied to the Survived column only: calling replace on the
# whole frame returns a multi-column DataFrame, which cannot be assigned to a
# single column and would also rewrite zeros in every other column.
train_set['Survived'] = train_set['Survived'].replace(to_replace=0, value=-1)

In [None]:
# Preview the first rows of the prepared training frame.
train_set.head()

In [None]:
# Columns the decision tree is allowed to split on.
# NOTE(review): classify() only tests whether a feature value equals 0, so
# multi-valued columns such as Age (binned to 0-4) and FamilySize (presumably
# always >= 1) are reduced to "zero vs. non-zero" at prediction time - confirm
# this is intended.
features = ['Age','Pclass','SexBinary','FamilySize','IsAlone']

In [None]:
# Partition the training rows by label: +1 rows and -1 rows.
survivor = train_set.loc[train_set['Survived'].eq(1)]
dead = train_set.loc[train_set['Survived'].eq(-1)]

In [None]:
def intermediate_node_num_mistakes(labels_in_node):
    """Return how many labels a majority-vote prediction would get wrong.

    Only labels equal to +1 or -1 are counted; the result is the size of the
    smaller of those two groups. An empty collection yields 0.
    """
    # Nothing in the node means nothing can be misclassified.
    if len(labels_in_node) == 0:
        return 0

    positives = sum(1 for label in labels_in_node if label == 1)
    negatives = sum(1 for label in labels_in_node if label == -1)

    # Predicting the majority class misclassifies every minority-class label.
    return min(positives, negatives)

In [None]:
def best_splitting_feature(data, features, target):
    """Pick the feature whose zero / non-zero split has the lowest error.

    For every candidate feature the rows are partitioned into a left group
    (feature value equal to 0) and a right group (every other value). This is
    the same routing rule `classify` applies at prediction time, so every row
    of `data` lands in exactly one group and the error is computed over all of
    them. For strictly 0/1 features the partition is identical to splitting on
    ``== 0`` and ``== 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows at the current node; must contain `target` and every feature.
    features : iterable of str
        Candidate column names.
    target : str
        Name of the label column (labels are +1 / -1).

    Returns
    -------
    str or None
        The feature with the smallest classification error (the first one wins
        on ties), or None when `features` or `data` is empty.
    """
    # Convert to float to make sure the error is computed with true division.
    num_data_points = float(len(data))
    if num_data_points == 0:
        # No rows: every split is equally meaningless and the error below would
        # divide by zero.
        return None

    best_feature = None
    best_error = float('inf')  # any real error (always <= 1) beats this

    for feature in features:
        goes_left = data[feature] == 0
        left_split = data[goes_left]
        # Everything that is not exactly 0 goes right, so no row is silently
        # dropped from the mistake count while still inflating the denominator.
        right_split = data[~goes_left]

        left_mistakes = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])

        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        error = (left_mistakes + right_mistakes) / num_data_points

        if error < best_error:
            best_error = error
            best_feature = feature

    return best_feature

In [None]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority class of `target_values`.

    `target_values` must support boolean-mask indexing (for example a pandas
    Series or NumPy array). The prediction is +1 only when +1 labels strictly
    outnumber -1 labels; ties and empty input predict -1.
    """
    positive_count = len(target_values[target_values == +1])
    negative_count = len(target_values[target_values == -1])

    majority_label = +1 if positive_count > negative_count else -1

    # A leaf has no split and no children, only a prediction.
    return {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True,
            'prediction': majority_label}

In [None]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10):
    """Recursively build a binary decision tree represented as nested dicts.

    At every internal node, rows whose splitting feature equals 0 go to the
    left child and all other rows go to the right child. This matches the
    routing rule `classify` uses at prediction time, so each subtree is trained
    on exactly the rows that would reach it. For strictly 0/1 features this is
    the same as splitting on ``== 0`` and ``== 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows at the current node; must contain `target` and every feature.
    features : sequence of str
        Features still available for splitting. The sequence is not modified.
    target : str
        Name of the label column (labels are +1 / -1).
    current_depth : int
        Depth of the node being built (0 for the root).
    max_depth : int
        Nodes at this depth are always turned into leaves.

    Returns
    -------
    dict
        Either a leaf produced by `create_leaf`, or an internal node with the
        keys 'is_leaf', 'prediction', 'splitting_feature', 'left' and 'right'.
    """
    # Work on a copy so the caller's feature collection is never mutated.
    remaining_features = list(features)

    target_values = data[target]
    print ("--------------------------------------------------------------------")
    print ("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))

    # Stopping condition 1: the node is already pure (a majority vote makes no mistakes).
    if intermediate_node_num_mistakes(target_values) == 0:
        print ("Stopping condition 1 reached.")
        return create_leaf(target_values)

    # Stopping condition 2: no features left to split on.
    if not remaining_features:
        print ("Stopping condition 2 reached.")
        return create_leaf(target_values)

    # Additional stopping condition: limit the tree depth.
    if current_depth >= max_depth:
        print ("Reached maximum depth. Stopping for now.")
        return create_leaf(target_values)

    # Find the best feature among those still available.
    splitting_feature = best_splitting_feature(data, remaining_features, target)
    if splitting_feature is None:
        # The selector could not choose a feature; fall back to a leaf.
        return create_leaf(target_values)

    # Split on the best feature: zero goes left, everything else goes right.
    goes_left = data[splitting_feature] == 0
    left_split = data[goes_left]
    right_split = data[~goes_left]
    remaining_features.remove(splitting_feature)
    print ("Split on feature %s. (%s, %s)" % (
                      splitting_feature, len(left_split), len(right_split)))

    # If one side received every row the split separates nothing: make a leaf.
    if len(left_split) == len(data):
        print ("Creating leaf node.")
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print ("Creating leaf node.")
        return create_leaf(right_split[target])

    # Repeat (recurse) on left and right subtrees.
    left_tree = decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth)
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth + 1, max_depth)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree,
            'right'            : right_tree}

In [None]:
def count_nodes(tree):
    """Return the total number of nodes (internal nodes plus leaves) in `tree`."""
    total = 0
    pending = [tree]  # explicit stack instead of recursion
    while pending:
        node = pending.pop()
        total += 1
        if not node['is_leaf']:
            pending.append(node['left'])
            pending.append(node['right'])
    return total

In [None]:
# Fit the tree on the full training frame, limiting it to depth 5.
titanic_decision_tree = decision_tree_create(train_set, features, 'Survived', max_depth = 5)

In [None]:
def classify(tree, x, annotate = False):
    """Walk `tree` from the root to a leaf for sample `x` and return the leaf's prediction.

    At each internal node the value ``x[splitting_feature]`` decides the
    branch: exactly 0 goes left, anything else goes right. With
    ``annotate=True`` every visited split and the final leaf are printed.
    """
    node = tree
    while not node['is_leaf']:
        feature_name = node['splitting_feature']
        split_feature_value = x[feature_name]
        if annotate:
            print ("Split on %s = %s" % (feature_name, split_feature_value))
        node = node['left'] if split_feature_value == 0 else node['right']

    if annotate:
        print ("At leaf, predicting %s" % node['prediction'])
    return node['prediction']

In [None]:
# Spot check: classify the training row at position 56.
print ('Predicted class: %s ' % classify(titanic_decision_tree, train_set.iloc[56]))

In [None]:
# Display the test row at position 56.
test_set.iloc[56]

In [None]:
# Add the same 0/1 indicator columns to the test frame:
# SexBinary is 1 for 'male', Family is 1 when SibSp is greater than zero.
test_set['SexBinary'] = test_set['Sex'].map(lambda sex: int(sex == 'male'))
test_set['Family'] = test_set['SibSp'].map(lambda sib_sp: int(sib_sp > 0))

In [None]:
# fillna returns a new frame, so the result has to be assigned back; as a bare
# expression it filled nothing and only dumped the whole frame as cell output.
test_set = test_set.fillna(0)

In [None]:
# Spot check: classify the test row at position 56.
print ('Predicted class: %s ' % classify(titanic_decision_tree, test_set.iloc[56]))

In [None]:
def evaluate_classification_error(tree, data, target):
    """Return the fraction of rows in `data` that `tree` misclassifies.

    Every row is classified with `classify`. As a side effect the predictions
    are written into `data` as a 'prediction' column. The result is the number
    of rows whose prediction differs from `data[target]`, divided by the
    number of rows.
    """
    # Classify row by row (positional access) and keep the result on the frame.
    data['prediction'] = [classify(tree, data.iloc[row_pos]) for row_pos in range(len(data))]

    num_wrong = sum(data['prediction'] != data[target])
    return 1.0 * num_wrong / len(data)

In [None]:
# Fixed random_state so the 80/20 partition is the same on every run.
xtrain, xtest = train_test_split(train_set, test_size=0.2, random_state=42)

In [None]:
# Classification error of the fitted tree on the xtrain portion.
# NOTE(review): titanic_decision_tree was fit on all of train_set, which
# includes these rows, so this is a training error rather than a held-out
# estimate; xtest is not evaluated in the visible cells.
evaluate_classification_error(titanic_decision_tree, xtrain, 'Survived')

In [None]:
# Start the submission from a NaN-free copy of the test frame (fillna returns a
# new frame, so test_set itself is left untouched).
submission = test_set.fillna(value=0)

In [None]:
# Classify every test row with the fitted tree and store the +1 / -1 label.
submission['Survived'] = test_set.apply(
    func=lambda passenger: classify(tree=titanic_decision_tree, x=passenger),
    axis='columns',
)

In [None]:
# Map the tree's -1 label back to 0 for the submission file.
submission['Survived'] = submission['Survived'].replace({-1: 0})

In [None]:
# Turn the PassengerId index back into a regular column.
decision_submission = submission.reset_index()

In [None]:
# Preview the first rows of the submission frame.
decision_submission.head()

In [None]:
# Write only PassengerId and Survived to submission.csv, without the row index.
decision_submission[['PassengerId','Survived']].to_csv('submission.csv', index=False)