### Decision Trees

#### Preprocessing

In [1]:
import numpy as np
import pandas as pd 

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
col_to_remove = ['PassengerId','Name','Ticket','Cabin','Embarked']

In [6]:
data_clean = df.drop(col_to_remove,axis = 1)

In [7]:
print(data_clean)

     Survived  Pclass     Sex   Age  SibSp  Parch     Fare
0           0       3    male  22.0      1      0   7.2500
1           1       1  female  38.0      1      0  71.2833
2           1       3  female  26.0      0      0   7.9250
3           1       1  female  35.0      1      0  53.1000
4           0       3    male  35.0      0      0   8.0500
..        ...     ...     ...   ...    ...    ...      ...
886         0       2    male  27.0      0      0  13.0000
887         1       1  female  19.0      0      0  30.0000
888         0       3  female   NaN      1      2  23.4500
889         1       1    male  26.0      0      0  30.0000
890         0       3    male  32.0      0      0   7.7500

[891 rows x 7 columns]


In [143]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [144]:
data_clean["Sex"] = le.fit_transform(data_clean["Sex"])

In [145]:
print(data_clean.head())

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare
0         0       3    1  22.0      1      0   7.2500
1         1       1    0  38.0      1      0  71.2833
2         1       3    0  26.0      0      0   7.9250
3         1       1    0  35.0      1      0  53.1000
4         0       3    1  35.0      0      0   8.0500


In [146]:
#dataImputation
data_clean["Age"] = data_clean["Age"].fillna(data_clean["Age"].mean()) 

In [147]:
print(data_clean.head())
print(data_clean.loc[0])
x_data = data_clean

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare
0         0       3    1  22.0      1      0   7.2500
1         1       1    0  38.0      1      0  71.2833
2         1       3    0  26.0      0      0   7.9250
3         1       1    0  35.0      1      0  53.1000
4         0       3    1  35.0      0      0   8.0500
Survived     0.00
Pclass       3.00
Sex          1.00
Age         22.00
SibSp        1.00
Parch        0.00
Fare         7.25
Name: 0, dtype: float64


In [148]:
input_cols = ['Pclass','Sex','Age','SibSp','Parch','Fare']
output_cols = ['Survived']
X = data_clean[input_cols]
Y = data_clean[output_cols]

In [149]:
print(X.shape,Y.shape)
print(type(x_data))
print(type(x_data.Survived))

(891, 6) (891, 1)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


#### Algorithm

In [150]:
#entropy
def entropy(col):
    elements = np.unique(col,return_counts = "True")
    n = col.shape[0]
    etr = 0
    for i in elements[1]:
        pi = i/n
        etr += (-1*pi*np.log2(pi))
        
    return etr

In [151]:
c = np.array([0,1,0,0,1,0,1,1])
print(entropy(c))

1.0


In [152]:
#node -> 2 children node
def divide(x_data,fkey,fval):
    left_data = pd.DataFrame([],columns=x_data.columns)
    right_data = pd.DataFrame([],columns=x_data.columns)
    for idx in range(x_data.shape[0]):
        if x_data.loc[idx][fkey] < fval:                 
            left_data = left_data.append(x_data.loc[idx])
        else:
            right_data = right_data.append(x_data.loc[idx])
    left_data = left_data.reset_index(drop = "True")
    right_data = right_data.reset_index(drop = "True")

    return left_data,right_data
    

In [153]:
left_data,right_data = divide(x_data,'Sex',x_data['Sex'].mean())
print(left_data.head(),"\n\n",right_data.head())

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare
0       1.0     1.0  0.0  38.0    1.0    0.0  71.2833
1       1.0     3.0  0.0  26.0    0.0    0.0   7.9250
2       1.0     1.0  0.0  35.0    1.0    0.0  53.1000
3       1.0     3.0  0.0  27.0    0.0    2.0  11.1333
4       1.0     2.0  0.0  14.0    1.0    0.0  30.0708 

    Survived  Pclass  Sex        Age  SibSp  Parch     Fare
0       0.0     3.0  1.0  22.000000    1.0    0.0   7.2500
1       0.0     3.0  1.0  35.000000    0.0    0.0   8.0500
2       0.0     3.0  1.0  29.699118    0.0    0.0   8.4583
3       0.0     1.0  1.0  54.000000    0.0    0.0  51.8625
4       0.0     3.0  1.0   2.000000    3.0    1.0  21.0750


In [154]:
#infoGain
def info_gain(x_data,fkey):
    fval = x_data[fkey].mean()
    left_data,right_data = divide(x_data,fkey,fval)
    
    l = float(left_data.shape[0]/x_data.shape[0])
    r = float(right_data.shape[0]/x_data.shape[0])
    etr_node = entropy(np.array(x_data.Survived))
    etr_left = entropy(np.array(left_data.Survived))
    etr_right = entropy(np.array(right_data.Survived))
    
    if left_data.shape[0] == 0 or right_data.shape[0] == 0:
        return -1000000
    i_gain = etr_node - (l*etr_left + r*etr_right)
    
    return i_gain
    

In [155]:
for ix in X.columns:
    print(info_gain(x_data,ix))


0.07579362743608165
0.2176601066606142
0.001158644038169343
0.009584541813400071
0.015380754493137694
0.042140692838995464


In [167]:
class DecisionTree:
    def __init__ (self,depth = 0,max_depth = 5):       #Attributes a particular node stores.
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.depth = depth
        self.max_depth = max_depth
        self.target = None
        
    def train(self,x_data):
        i_gain = []
        for f in x_data.columns[1:]:
            i_gain.append(info_gain(x_data,f))
        self.fkey = x_data.columns[np.argmax(i_gain)+1]
        self.fval = x_data[fkey].mean()
        
        left_data,right_data = divide(x_data,self.fkey,self.fval)
        print("created nodes about current node ",fkey,'\n')
        #node truly a leaf
        if left_data.shape[0] == 0 or right_data.shape[0] == 0:       #possible only when all values of that featur are equal.
            if x_data.Survived.mean() > 0.5:
                self.target = 'Survived'
            else:
                self.target = 'Died'
            return
        
        #countering Overfitting
        if self.depth > self.max_depth:
            if x_data.Survived.mean() > 0.5:
                self.target = 'Survived'
            else:
                self.target = 'Died'
            return
        
        #Recursive Case
        self.left = DecisionTree(depth = self.depth+1,max_depth = self.max_depth)    #Attempt without max_depth parameter
        self.left.train(left_data)
        self.right = DecisionTree(depth = self.depth+1,max_depth = self.max_depth)    #Attempt without max_depth parameter
        self.right.train(right_data)
        
        if x_data.Survived.mean() > 0.5:
            self.target = 'Survived'
        else:
            self.target = 'Died'
        return
    
    def predict(self,test):
        if test[self.fkey] > self.fval:
        #go right
            if self.right == None:
                return self.target
            else:
                return self.right.predict(test)
        else:
            if self.left == None:
                return self.target
            else:
                return self.left.predict(test)


In [168]:
#Training
dt = DecisionTree()
dt.train(x_data)

created nodes about current node  Sex 

created nodes about current node  Sex 

created nodes about current node  Sex 

created nodes about current node  Sex 

created nodes about current node  Sex 



In [169]:
#prediction
dt.predict(x_data.loc[2])

'Survived'

In [162]:
from sklearn.preprocessing import LabelEncoder

In [163]:
#Accuracy Computaion
def score(x_train,x_test):
    dt = DecisionTree()
    dt.train(x_train)
    y_pred = []
    for i in range(x_test.shape[0]):
        y_pred.append(dt.predict(x_test.loc[i]))
    le = LabelEncoder()
    y_pred = le.fit_transform(y_pred)
    y_act = np.array(x_test.Survived)
    
    cp = sum(y_pred == y_act)
    
    return cp/x_test.shape[0]
    

In [164]:
n = x_data.shape[0]
t = int(0.7*n)

In [165]:
x_train = x_data[:t]
x_test = x_data[t:]
x_test = x_test.reset_index(drop = 'True')

In [166]:
print(score(x_train,x_test))

created nodes about current node  Sex 

created nodes about current node  Sex 

created nodes about current node  Sex 

created nodes about current node  Sex 

created nodes about current node  Sex 

0.7798507462686567
