In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the Titanic passenger table; the path is relative to the notebook's
# working directory.
ds = pd.read_csv('./titanic.csv')

# A parenthesised single-argument print behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(ds.columns)

# Last expression of the cell: displayed as the table preview.
ds.head(n=10)

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Fill missing ages with the column mean truncated to a whole number
# (int() drops the fractional part, so the fill value is not the exact mean).
mean_Age = ds['Age'].mean()
ds['Age'] = ds['Age'].fillna(int(mean_Age))

# Numeric encoding used for the 'Sex' column.
sex_mapping = {
    'male': 0,
    'female': 1
}

sex = ds['Sex'].values
# Parenthesised single-argument print works on both Python 2 and 3.
print(sex.shape)

# Encode every label in one pass. Direct dict indexing is kept on purpose:
# an unexpected label raises KeyError instead of being silently defaulted.
quant_sex = np.array([sex_mapping[label] for label in sex], dtype=float)

(891,)


In [4]:
# Feature matrix with one row per passenger. The columns follow `cols` below
# and the label ('Survived') is last, which is where the tree code reads it.
data = np.asarray([
    ds['Age'], ds['Pclass'], ds['SibSp'],
    ds['Parch'], ds['Fare'], quant_sex,
    ds['Survived'],
]).astype("float").T
Y = ds['Survived'].values
cols = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Sex', 'Survived']

# Parenthesised single-argument print works on both Python 2 and 3.
print(data[0])

[ 22.     3.     1.     0.     7.25   0.     0.  ]


In [5]:
def data_split(dataset, col_id, value):
    """Partition the rows of ``dataset`` on a threshold in one column.

    Args:
        dataset: 2-D NumPy array with one row per sample.
        col_id: index of the column that is compared against ``value``.
        value: threshold; rows with ``dataset[row, col_id] <= value`` go to
            the left partition, every other row goes to the right one.

    Returns:
        ``(data_right, data_left)`` -- note that the right partition comes
        first. Both are new 2-D arrays that keep the original row order and
        column count, including when a side receives no rows (its shape is
        then ``(0, n_cols)``).
    """
    goes_left = dataset[:, col_id] <= value
    # Negate the mask rather than testing ``> value`` so that rows holding NaN
    # in this column (both comparisons are False) still end up on the right.
    return dataset[~goes_left], dataset[goes_left]

In [8]:
# NOTE: entropy() and information_gain() are defined in a cell that sits
# further down in this notebook; on a fresh kernel that cell has to be run
# before this one, otherwise the calls below raise NameError.

# Split the first 100 passengers on column 1 (Pclass) at threshold 1.
# data_split returns (right, left), so x holds the rows with Pclass > 1 and
# y the rows with Pclass <= 1.
x, y = data_split(data[:100], 1, 1)
print(x.shape)
print(y.shape)

print(cols)
# Mean of the last column (Survived) on each side of the split.
print(x[:, -1].mean())
print(y[:, -1].mean())

print('-----------------')

p1 = entropy(x, -1)
i1 = information_gain(p1)

p2 = entropy(y, -1)
i2 = information_gain(p2)

# '%s %s' reproduces the space-separated output of the old two-argument
# print statement and is valid on both Python 2 and 3.
print('%s %s' % (i1, i2))
print(i1 + i2)

(79, 7)
(21, 7)
['Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Sex', 'Survived']
0.392405063291
0.47619047619
-----------------
-0.966334094296 -0.998363672594
-1.96469776689


In [7]:
def entropy(dataset, col=-1):
    """Return the mean of one column of ``dataset``.

    Despite the name, no entropy is computed here. For a column of 0/1
    labels the mean is the fraction of ones, and callers pass that value on
    to information_gain() as ``p``.

    Args:
        dataset: 2-D NumPy array with one row per sample.
        col: index of the column to average; defaults to the last column.

    Returns:
        The arithmetic mean of the selected column.
    """
    column = dataset[:, col]
    return np.mean(column)

def information_gain(p):
    """Score a binary node by its negative Shannon entropy.

    Despite the name, this is not an information gain (no parent/child
    comparison is made); it is a purity score where larger means purer.

    Args:
        p: fraction of positive labels in the node, expected in [0, 1].

    Returns:
        ``-H(p)`` in bits, with ``H(p) = -p*log2(p) - (1-p)*log2(1-p)``.
        The result ranges from -1.0 (p == 0.5) up to 0.0 (p == 0 or p == 1).
    """
    if p == 0 or p == 1:
        # A pure node has zero entropy (p*log2(p) -> 0 as p -> 0). It is
        # handled explicitly because log2(0) is -inf and 0 * -inf is NaN.
        # Returning 0.0 keeps pure nodes on the same scale as impure ones, so
        # scores of the two sides of a split can be added meaningfully.
        return 0.0
    return p * np.log2(p) + (1.0 - p) * np.log2(1.0 - p)

In [9]:
# Penalty magnitude: a side of a split that receives no rows is scored -INF,
# so a column that fails to separate the data is never preferred.
INF = 100000


class DT(object):
    """Binary decision-tree node grown greedily on mean-value thresholds.

    A node records the feature column and threshold it splits on and links to
    its two children. Rows with ``feature <= value`` belong to ``left``, all
    other rows to ``right``. The label is read from the last column of the
    dataset handed to get_best_gain().

    Attributes:
        left, right: child ``DT`` nodes, or None when no child was created.
        col_id: index of the column chosen for the split; stays None for a
            node that was never fitted or that sits at ``max_depth``.
        value: threshold applied to ``col_id`` (the mean of that column over
            the rows seen by this node).
        depth: depth of this node, the root being 0.
        max_depth: depth at which growth stops.
    """

    def __init__(self, depth=0, max_depth=10):
        self.left = None
        self.right = None
        self.col_id = None
        self.value = None
        self.depth = depth
        self.max_depth = max_depth

    @staticmethod
    def _side_score(rows):
        """Purity score of one side of a split; -INF when the side is empty."""
        if rows.shape[0] > 0:
            return information_gain(entropy(rows))
        return -1.0 * INF

    def get_best_gain(self, dataset):
        """Pick this node's split and recursively grow both children.

        Every feature column is tried with its own mean as threshold; the
        column whose two sides have the highest summed purity score wins
        (ties go to the lowest column index).

        Args:
            dataset: 2-D NumPy array; every column except the last is treated
                as a feature, the last column is the label.

        Returns:
            None. The result is stored on ``self`` (``col_id``, ``value``,
            ``left``, ``right``). Nothing is changed when the node is already
            at ``max_depth``.
        """
        if self.depth == self.max_depth:
            return

        # Every column but the last (the label) is a candidate feature.
        n_cols = dataset.shape[1] - 1
        if n_cols < 1:
            # Nothing to split on.
            return

        thresholds = []
        all_gains = []
        for cx in range(n_cols):
            threshold = dataset[:, cx].mean()
            right, left = data_split(dataset, cx, threshold)
            thresholds.append(threshold)
            all_gains.append(self._side_score(left) + self._side_score(right))

        self.col_id = np.asarray(all_gains).argmax()
        # Reuse the mean computed in the loop instead of recomputing it.
        self.value = thresholds[self.col_id]

        data_right, data_left = data_split(dataset, self.col_id, self.value)

        if data_left.shape[0] > 0:
            self.left = DT(depth=self.depth + 1, max_depth=self.max_depth)
            self.left.get_best_gain(data_left)

        if data_right.shape[0] > 0:
            self.right = DT(depth=self.depth + 1, max_depth=self.max_depth)
            self.right.get_best_gain(data_right)
        return


In [15]:
# Grow the tree on the full dataset.
dt = DT(max_depth=10)
dt.get_best_gain(data)

# Show the split (column index, threshold) chosen at the root, then at its
# left and right children. '%s %s' reproduces the space-separated output of
# the old two-argument print statement and is valid on Python 2 and 3.
for node in (dt, dt.left, dt.right):
    print('%s %s' % (node.col_id, node.value))

5 0.35241301908
0 30.3555805893
1 2.15923566879
