In [48]:
import pandas as pd
import numpy as np

In [49]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [50]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [51]:
dropped = ['PassengerId' ,'Embarked' , 'Ticket' , 'Name' , 'Cabin']
data_clean = data.drop(dropped , axis=1)

In [52]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [53]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data_clean["Sex"] = le.fit_transform(data_clean["Sex"])

In [54]:
data_clean = data_clean.fillna(data_clean['Age'].mean())

In [55]:
input_cols = ['Age' , 'Parch' , 'Fare' , 'SibSp' , 'Pclass' ,'Sex']
output_cols = ['Survived']
x = data_clean[input_cols]
y = data_clean[output_cols]


In [56]:
def entropy(col):
    counts = np.unique(col , return_counts=True)
    ent = 0.0
    n = float(col.shape[0])
    for ix in counts[1]:
        p = ix/n
        ent += -1*p*np.log2(p)
    return ent
        
        

In [57]:
col = np.array([1 ,1,1,0,0,2])
entropy(col)

1.4591479170272448

In [58]:
def divide(x_data,fkey,fval):
    x_right = pd.DataFrame([],columns=x_data.columns)
    x_left = pd.DataFrame([],columns=x_data.columns)
    
    for ix in range(x_data.shape[0]):
        val =x_data[fkey].loc[ix]
        
        if val > fval:
            x_right = x_right.append(x_data.loc[ix])
        else:
            x_left = x_left.append(x_data.loc[ix])
            
    return x_left , x_right   


In [59]:
divide(data_clean[:10],'Sex',0.5)

(   Survived  Pclass  Sex   Age  SibSp  Parch     Fare
 1       1.0     1.0  0.0  38.0    1.0    0.0  71.2833
 2       1.0     3.0  0.0  26.0    0.0    0.0   7.9250
 3       1.0     1.0  0.0  35.0    1.0    0.0  53.1000
 8       1.0     3.0  0.0  27.0    0.0    2.0  11.1333
 9       1.0     2.0  0.0  14.0    1.0    0.0  30.0708,
    Survived  Pclass  Sex        Age  SibSp  Parch     Fare
 0       0.0     3.0  1.0  22.000000    1.0    0.0   7.2500
 4       0.0     3.0  1.0  35.000000    0.0    0.0   8.0500
 5       0.0     3.0  1.0  29.699118    0.0    0.0   8.4583
 6       0.0     1.0  1.0  54.000000    0.0    0.0  51.8625
 7       0.0     3.0  1.0   2.000000    3.0    1.0  21.0750)

In [60]:
def information_gain(x_data,fkey,fval):
    left,right=divide(x_data,fkey,fval)
    l=float(left.shape[0])/x_data.shape[0]
    r=float(right.shape[0])/x_data.shape[0]
    if left.shape[0] ==0 or right.shape[0]==0:
        return -100000000000000
    i_gain = entropy(x_data.Survived)-(l*entropy(left.Survived)+r*entropy(right.Survived))
    return i_gain

In [61]:
for fx in x.columns:
    print(fx)
    print(information_gain(data_clean,fx,data_clean[fx].mean()))

Age
0.001158644038169343
Parch
0.015380754493137694
Fare
0.042140692838995464
SibSp
0.009584541813400071
Pclass
0.07579362743608165
Sex
0.2176601066606142


In [69]:
class decisiontree:
    def __init__(self , depth=0 , max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
        
    def train(self , x_train):
        features = ['Age' , 'Parch' , 'Fare' , 'SibSp' , 'Pclass' ,'Sex']
        info_gain =[]
        for ix in features:
            i_gain = information_gain(x_train,ix,x_train[ix].mean())
            info_gain.append(i_gain)
            
        self.fkey = features[np.argmax(info_gain)]
        self.fval = x_train[self.fkey].mean()
        data_left,data_right = divide(x_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
        
        if data_left.shape[0]==0 or data_right.shape[0]==0:
            if x_train.Survived.mean() >= 0.5:
                self.target = 'Survive'
            else:
                self.target = 'dead'
            return
        if self.depth>=self.max_depth:
            if x_train.Survived.mean() >= 0.5:
                self.target = 'Survive'
            else:
                self.target = 'dead'
            return
                
        self.left = decisiontree(depth=self.depth+1,max_depth=self.max_depth+1)
        self.left.train(data_left)
        self.right = decisiontree(depth=self.depth+1,max_depth=self.max_depth+1)
        self.right.train(data_right)
        if x_train.Survived.mean() >= 0.5:
                self.target = 'Survive'
        else:
                self.target = 'dead'
        return
    def predict(self,test):
        if test([self.fkey])>self.fval:
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)
           
            
            

In [63]:
from sklearn.model_selection import train_test_split

In [64]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.3)

In [65]:
x_train.shape , x_test.shape

((623, 6), (268, 6))

In [73]:
x_test = x_test.reset_index(drop = True)

In [75]:
dt = decisiontree()

In [76]:
dt.train(x_train)

KeyError: 'the label [1] is not in the [index]'