In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [12]:
data=pd.read_csv("titanic.csv")

In [13]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
col_toDrop=["PassengerId","Name","Ticket","Cabin","Embarked"]



In [15]:
data_clean=data.drop(col_toDrop,axis=1)

In [16]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [17]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data_clean["Sex"]=le.fit_transform(data_clean["Sex"])

In [18]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [21]:
data_clean=data_clean.fillna(data_clean["Age"].mean())

In [22]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


In [25]:
X=data_clean[["Pclass","Sex","Age","SibSp","Parch","Fare"]]
Y=data_clean["Survived"]

In [42]:
def entropy(col):
    uniq=np.unique(col,return_counts=True)
    N=float(col.shape[0])
    entropy=0.0
    for ix in uniq[1]:
        p=ix/N
        entropy+=-1.0*p*np.log2(p)
    
    return entropy

def divide_data(x,fkey,fval):
    left=pd.DataFrame(data=[],columns=x.columns)
    right=pd.DataFrame(data=[],columns=x.columns)
    
    for ix in range(x.shape[0]):
        val=x[fkey].loc[ix]
        if val > fval:
            right=right.append(x.loc[ix])
        else:
            left=left.append(x.loc[ix])
    
    return left,right


def information_gain(x,fkey,fval):
    
    left,right=divide_data(x,fkey,fval)
    l=float(left.shape[0])/x.shape[0]
    r=float(right.shape[0])/x.shape[0]
    
    if left.shape[0]==0 or right.shape[0]==0:
        return -100000
    
    i_gain=entropy(x.Survived)-l*(entropy(left.Survived))-r*(entropy(right.Survived))
    
    return i_gain


In [85]:
class DecisionTree:
    
    def __init__(self,depth=0,max_depth=5):
        
        self.left=None
        self.right=None
        self.fkey=None
        self.fval=None
        self.depth=depth
        self.max_depth=max_depth
        self.target=None
        
    def train(self,X_train):
        
        features=["Pclass","Sex","Age","SibSp","Parch","Fare"]
        i_gain=[]
        
        for ix in features:
            ig=information_gain(X_train,ix,X_train[ix].mean())
            
            i_gain.append(ig)
        
        self.fkey=features[np.argmax(i_gain)]
        self.fval=X_train[self.fkey].mean()
        
        print("making tree, feature is ",self.fkey)
        
        data_left,data_right=divide_data(X_train,self.fkey,self.fval)
        data_left=data_left.reset_index(drop=True)
        data_right=data_right.reset_index(drop=True)
        
        if data_left.shape[0]==0 or data_right.shape[0]==0:
            if X_train.Survived.mean()>0.5:
                self.target=1
            
            else:
                self.target=0
        
            return
        if(self.depth>self.max_depth):
            
            if X_train.Survived.mean()>0.5:
                self.target=1
            
            else:
                self.target=0
        
            return
 
        self.left=DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        
        self.right=DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right)
        
        if X_train.Survived.mean()>0.5:
            self.target=1
            
        else:
            self.target=0
        
        return
    
    
    def predict(self,test):
        if test[self.fkey]>self.fval:
            if self.right is None:
                return self.target
            
            return self.right.predict(test)
        
        else:
            if self.left is None:
                return self.target
            
            return self.left.predict(test)
            

In [86]:
split = int(0.7*data_clean.shape[0])
train_data = data_clean[:split]
test_data = data_clean[split:]
test_data = test_data.reset_index(drop=True)

In [87]:
dt = DecisionTree()

In [88]:
dt.train(train_data)

making tree, feature is  Sex
making tree, feature is  Pclass
making tree, feature is  Age
making tree, feature is  SibSp
making tree, feature is  Pclass
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  SibSp
making tree, feature is  Parch
making tree, feature is  Pclass
making tree, feature is  Pclass
making tree, feature is  Pclass
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  SibSp
making tree, feature is  Fare
making tree, feature is  Parch
making tree, feature is  Pclass
making tree, feature is  Pclass
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Pclass
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making tree, feature is  Age
making t

In [89]:
print(dt.fkey)
print(dt.fval)
print(dt.left.fkey)
print(dt.left.right.right.target)

Sex
0.6292134831460674
Pclass
0


In [90]:
y_pred = []
for ix in range(test_data.shape[0]):
    y_pred.append(dt.predict(test_data.loc[ix]))

In [95]:
y_actual = test_data["Survived"]

In [102]:

y_pred = np.array(y_pred).reshape((-1,))
print(y_pred.shape)

acc = np.sum(np.array(y_pred)==np.array(y_actual))/y_pred.shape[0]

(268,)


In [103]:
acc

0.8134328358208955