In [1]:
import numpy as np
import pandas as pd
import re
import random
from random import choices
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
accuracies = []
PassengerId = test['PassengerId']

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


# Cleaning Dataset

In [3]:
original_train = train.copy() 
full_data = [train, test]


train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)


for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())


for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)


def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:
    
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] ;

In [4]:
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
test  = test.drop(drop_elements, axis = 1)

In [5]:
test.head(3)

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,3,1,2,0,0,2,0,1,1,1
1,3,0,2,0,0,0,0,2,0,3
2,2,1,3,0,1,2,0,1,1,1


In [6]:
class Node:
    def __init__(self, data,f):
        self.t = data
        self.c = f
        self.o = None
        self.left = None
        self.right = None

In [7]:
class doublyLinkedList:
    def __init__(self):
        self.start_node = None
        
    def InsertToEmptyList(self, data):
        if self.start_node is None:
            new_node = data
            self.start_node = new_node
        else:
            print("The list is empty")


# Decision Trees 

In [8]:
class DecisionTree:
    def __init__(self,criterion,max_depth,min_samples_split,min_samples_leaf):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.dt = None
        self.p = None
        self.l = []
        k = 0
        for i in range(self.max_depth+1):
            n = [0] * (2**i)
            self.l.append(n)
            for j in range(2**i):
                self.l[i][j] = k
                k = k+1
        
    def fit(self,train,sample_weight=None):
        self.sample_weights = None
        if isinstance(sample_weight,list)  and len(sample_weight) == len(train["Survived"]):
            self.sample_weights = sample_weight
            train["weights"] = sample_weight 
        self.tree(train,0,0)
            
    def find(self,f,r):
        for i in range(self.max_depth+1):
            for j in range(2**i):
                if self.l[i][j] == f:
                    self.l[i][j] = r
                
    def printtree(self):
        for i in range(self.max_depth+1):
            print(self.l[i])
            
    def predict(self,start,y):
        
        if start.o != None:
            y["output"] = start.o
            self.p.at[y.index,'output'] = start.o
            return
        ls = y[y[start.c]<=start.t]
        rs = y[y[start.c]>start.t]
        if start.left != None:
            self.predict(start.left,ls.drop(start.c,axis=1))
        if start.right != None:
            self.predict(start.right,rs.drop(start.c,axis=1))
        
    def tree(self,train,l,m,n=None,d=None):
        minimum = 999999
        cri = self.criterion
        for c in train:
            if c == "Survived" or c == "weights" or c == "output":
                continue
            i = train[c].unique()
            for j in i:
                ls = train[train[c]<=j]
                rs = train[train[c]>j]
                if len(ls[c])==0 or len(rs[c])==0:
                        continue
                ls1 = ls[c]
                rs1 = rs[c]
                if cri == "Gini impurity":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    if isinstance(self.sample_weights,list):
                        p = train[train["Survived"] == 1]['weights'].sum()/train["weights"].sum()
                        q = train[train["Survived"] == 0]['weights'].sum()/train["weights"].sum()
                    gini_start = 1-(p**2 + q**2)
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    if isinstance(self.sample_weights,list):
                        lp = ls[ls["Survived"] == 1]['weights'].sum()/ls["weights"].sum()
                        lq = ls[ls["Survived"] == 0]['weights'].sum()/ls["weights"].sum()
                    gini_l = 1-(lp**2 + lq**2)
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    if isinstance(self.sample_weights,list):
                        rp = rs[rs["Survived"] == 1]['weights'].sum()/rs["weights"].sum()
                        rq = rs[rs["Survived"] == 0]['weights'].sum()/rs["weights"].sum()
                    gini_r = 1-(rp**2 + rq**2)
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (gini_l * lw) + (gini_r * rw)
                    bf = bf - gini_start
                if cri == "entropy":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    if isinstance(self.sample_weights,list):
                        p = train[train["Survived"] == 1]['weights'].sum()/train["weights"].sum()
                        q = train[train["Survived"] == 0]['weights'].sum()/train["weights"].sum()
                    if p==0:
                        ent_start = (-q * (np.log2(q)))
                    elif q==0:
                        ent_start = (-p * (np.log2(p)))
                    else:
                        ent_start = (-p * (np.log2(p))) + (-q * (np.log2(q)))
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    if isinstance(self.sample_weights,list):
                        lp = ls[ls["Survived"] == 1]['weights'].sum()/ls["weights"].sum()
                        lq = ls[ls["Survived"] == 0]['weights'].sum()/ls["weights"].sum()
                    if lp==0:
                        ent_l = (-lq * (np.log2(lq)))
                    elif lq==0:
                        ent_l = (-lp * (np.log2(lp)))
                    else:
                        ent_l = (-lp * (np.log2(lp))) + (-lq * (np.log2(lq)))
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    if isinstance(self.sample_weights,list):
                        rp = rs[rs["Survived"] == 1]['weights'].sum()/rs["weights"].sum()
                        rq = rs[rs["Survived"] == 0]['weights'].sum()/rs["weights"].sum()
                    if rp==0:
                        ent_r = (-rq * (np.log2(rq)))
                    elif rq==0:
                        ent_r = (-rp * (np.log2(rp)))
                    else:
                        ent_r = (-rp * (np.log2(rp))) + (-rq * (np.log2(rq)))
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (ent_l * lw) + (ent_r * rw)
                    bf = bf - ent_start
                if cri == "misclassification rate":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    if isinstance(self.sample_weights,list):
                        p = train[train["Survived"] == 1]['weights'].sum()/train["weights"].sum()
                        q = train[train["Survived"] == 0]['weights'].sum()/train["weights"].sum()
                    ent_start = 1 - max(p,q)
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    if isinstance(self.sample_weights,list):
                        lp = ls[ls["Survived"] == 1]['weights'].sum()/ls["weights"].sum()
                        lq = ls[ls["Survived"] == 0]['weights'].sum()/ls["weights"].sum()
                    ent_l = 1 - max(lp,lq)
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    if isinstance(self.sample_weights,list):
                        rp = rs[rs["Survived"] == 1]['weights'].sum()/rs["weights"].sum()
                        rq = rs[rs["Survived"] == 0]['weights'].sum()/rs["weights"].sum()
                    ent_r = 1 - max(rp,rq)
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (ent_l * lw) + (ent_r * rw)
                    bf = bf - ent_start
                if bf<minimum:
                    minimum = bf
                    b = c
                    t = j
                    ld = ls
                    rd = rs
        if minimum==999999:
            n.o = np.around(train["Survived"].mean())
            
            return
        ld = ld.drop(b,axis=1)
        rd = rd.drop(b,axis=1)
        if l==0:
            node = Node(t,b)
            self.find(m,[b,t])
            self.dt = doublyLinkedList()
            self.dt.InsertToEmptyList(node)
            self.tree(ld,l+1,(m*2)+1,node,"left")
            self.tree(rd,l+1,(m*2)+2,node,"right")
        else:
            node = Node(t,b)
            
            if d=="left":
                n.left = node
            else:
                n.right = node
            if l!=self.max_depth:
                self.find(m,[b,t])
            if(l==self.max_depth):
                node.o = np.around(train["Survived"].mean())
                self.find(m,[len(train),node.o])
                return 
            self.tree(ld,l+1,(m*2)+1,node,"left")
            self.tree(rd,l+1,(m*2)+2,node,"right")
    

In [9]:
model = DecisionTree("Gini impurity",3,5,5)
model.fit(train)
model.printtree()
test["output"] = 0
model.p = test
model.predict(model.dt.start_node,test)
pred = model.p['output']

[['Title', 1]]
[['Has_Cabin', 0], ['FamilySize', 4]]
[['Pclass', 1], ['Fare', 0], ['Pclass', 2], ['Pclass', 2]]
[[22, 0.0], [402, 0.0], [8, 0.0], [85, 0.0], [196, 1.0], [127, 1.0], [6, 1.0], [45, 0.0]]


In [10]:
a = pd.read_csv('submission.csv')
target = a["Survived"]
s = 0
for i in pred.index:
    if target.iloc[i]==pred.iloc[i]:
        s +=1
print("Accuracy = ",(s/len(pred))*100)
accuracies.append((s/len(pred))*100)

Accuracy =  100.0


# Random Forests - Select a random subset of features when splitting

In [11]:
class RandomForest:
    def __init__(self,criterion,max_depth,num_trees,min_features,replacement,min_samples_split,min_samples_leaf):
        self.criterion = criterion
        self.max_depth = max_depth
        self.num_trees = num_trees
        self.min_features = min_features
        self.replacement = replacement
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.t = []
        self.dt = []
        self.p = []
        self.l = []
        k = 0
        for p in range(self.num_trees):
            k = 0
            self.l = []
            for i in range(self.max_depth+1):
                n = [0] * (2**i)
                self.l.append(n)
                for j in range(2**i):
                    self.l[i][j] = k
                    k = k+1
            self.t.append(self.l)
        
    def fit(self,train):
        f = ["Pclass","Sex","Age","Parch","Fare","Embarked","Has_Cabin","FamilySize","IsAlone","Title"]
        for i in range(self.num_trees):
            if self.replacement == "without":
                n = random.randint(self.min_features, 10)
                m = random.sample(f,n)
            else:
                m = choices(f,k=self.min_features)
            m.append("Survived")
            self.tree(train[m],0,0,tl=i)
            
    def find(self,f,r,tl):
        for i in range(self.max_depth+1):
            for j in range(2**i):
                if self.t[tl][i][j] == f:
                    self.t[tl][i][j] = r
                
    def printtree(self):
        for k in range(self.num_trees):
            for i in range(self.max_depth+1):
                print(self.t[k][i])
            print("\n")
                
    def predict(self,x):
        x["output"] = 0
        for j in range(len(self.dt)):
            self.p.append(x)
        for i in range(len(self.dt)):
            self.predict1(self.dt[i].start_node,self.p[i],i)
            
    def predict1(self,start,y,k):
        if start.o != None:
            y["output"] = start.o
            self.p[k].at[y.index,'output'] = start.o
            return
        ls = y[y[start.c]<=start.t]
        rs = y[y[start.c]>start.t]
        if start.left != None:
            self.predict1(start.left,ls.drop(start.c,axis=1),k)
        if start.right != None:
            self.predict1(start.right,rs.drop(start.c,axis=1),k)
        
    def tree(self,train,l,m,n=None,d=None,tl=None):
        minimum = 999999
        cri = self.criterion
        for c in train:
            if c == "Survived":
                continue
            i = train[c].unique()
            for j in i:
                ls = train[train[c]<=j]
                rs = train[train[c]>j]
                if len(ls[c])==0 or len(rs[c])==0:
                        continue
                ls1 = ls[c]
                rs1 = rs[c]
                if cri == "Gini impurity":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    gini_start = 1-(p**2 + q**2)
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    gini_l = 1-(lp**2 + lq**2)
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    gini_r = 1-(rp**2 + rq**2)
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (gini_l * lw) + (gini_r * rw)
                    bf = bf - gini_start
                if cri == "entropy":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    if p==0:
                        ent_start = (-q * (np.log2(q)))
                    elif q==0:
                        ent_start = (-p * (np.log2(p)))
                    else:
                        ent_start = (-p * (np.log2(p))) + (-q * (np.log2(q)))
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    if lp==0:
                        ent_l = (-lq * (np.log2(lq)))
                    elif lq==0:
                        ent_l = (-lp * (np.log2(lp)))
                    else:
                        ent_l = (-lp * (np.log2(lp))) + (-lq * (np.log2(lq)))
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    if rp==0:
                        ent_r = (-rq * (np.log2(rq)))
                    elif rq==0:
                        ent_r = (-rp * (np.log2(rp)))
                    else:
                        ent_r = (-rp * (np.log2(rp))) + (-rq * (np.log2(rq)))
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (ent_l * lw) + (ent_r * rw)
                    bf = bf - ent_start
                if cri == "misclassification rate":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    ent_start = 1 - max(p,q)
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    ent_l = 1 - max(lp,lq)
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    ent_r = 1 - max(rp,rq)
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (ent_l * lw) + (ent_r * rw)
                    bf = bf - ent_start
                if bf<minimum:
                    minimum = bf
                    b = c
                    t = j
                    ld = ls
                    rd = rs
        if minimum==999999:
            n.o = np.around(train["Survived"].mean())
            return
        ld = ld.drop(b,axis=1)
        rd = rd.drop(b,axis=1)
        if l==0:
            node = Node(t,b)
            self.find(m,[b,t],tl)
            self.dt.append(doublyLinkedList())
            self.dt[-1].InsertToEmptyList(node)
            self.tree(ld,l+1,(m*2)+1,node,"left",tl)
            self.tree(rd,l+1,(m*2)+2,node,"right",tl)
        else:
            node = Node(t,b)
            
            if d=="left":
                n.left = node
            else:
                n.right = node
            if l!=self.max_depth:
                self.find(m,[b,t],tl)
            if(l==self.max_depth):
                node.o = np.around(train["Survived"].mean())
                self.find(m,[len(train),node.o],tl)
                return 
            self.tree(ld,l+1,(m*2)+1,node,"left",tl)
            self.tree(rd,l+1,(m*2)+2,node,"right",tl)
    

In [12]:
model1 = RandomForest("Gini impurity",3,10,6,"without",5,5)
model1.fit(train)
model1.printtree()
model1.predict(test)

[['Pclass', 2]]
[['FamilySize', 1], ['Embarked', 0]]
[['Has_Cabin', 0], ['Parch', 0], ['Has_Cabin', 0], ['FamilySize', 4]]
[[124, 0.0], [89, 1.0], [84, 1.0], [103, 1.0], [343, 0.0], [10, 0.0], [133, 0.0], [5, 0.0]]


[['Title', 1]]
[['Has_Cabin', 0], ['FamilySize', 4]]
[['Pclass', 1], ['Age', 2], ['Pclass', 2], ['Pclass', 2]]
[[22, 0.0], [402, 0.0], [72, 0.0], [21, 0.0], [196, 1.0], [127, 1.0], [6, 1.0], [45, 0.0]]


[['Title', 1]]
[['Has_Cabin', 0], ['Pclass', 2]]
[['Pclass', 1], ['Fare', 0], ['Sex', 0], ['Fare', 2]]
[[22, 0.0], [402, 0.0], [8, 0.0], [85, 0.0], [170, 1.0], [32, 1.0], [146, 1.0], [26, 0.0]]


[['Title', 1]]
[['Pclass', 1], ['FamilySize', 4]]
[['Fare', 0], ['Fare', 2], ['Pclass', 2], ['Pclass', 2]]
[7, [102, 0.0], [386, 0.0], [24, 0.0], [196, 1.0], [127, 1.0], [6, 1.0], [45, 0.0]]


[['Title', 1]]
[['Has_Cabin', 0], ['Has_Cabin', 0]]
[['Age', 1], ['Age', 2], ['Sex', 0], ['Age', 3]]
[[287, 0.0], [137, 0.0], [72, 0.0], [21, 0.0], [217, 1.0], [46, 0.0], [110, 1.0], 14]


[

In [13]:
pred = []
for j in (model1.p[0].index):
    ones = 0
    zero = 0
    for k in range(model1.num_trees):
        if model1.p[k].iloc[j]["output"] == 1:
            ones += 1
        else:
            zero += 1
    if ones > zero:
        pred.append(1)
    else:
        pred.append(0)
        
s = 0
for i in range(len(pred)):
    if target.iloc[i]==pred[i]:
        s +=1
print("Accuracy = ",(s/len(pred))*100)
accuracies.append((s/len(pred))*100)
        

Accuracy =  100.0


# Random Forests - Sample  samples with replacement

In [14]:
class RandomForest1:
    def __init__(self,criterion,max_depth,num_trees,min_features,replacement,min_samples_split,min_samples_leaf):
        self.criterion = criterion
        self.max_depth = max_depth
        self.num_trees = num_trees
        self.min_features = min_features
        self.replacement = replacement
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.t = []
        self.dt = []
        self.p = []
        self.l = []
        k = 0
        for p in range(self.num_trees):
            k = 0
            self.l = []
            for i in range(self.max_depth+1):
                n = [0] * (2**i)
                self.l.append(n)
                for j in range(2**i):
                    self.l[i][j] = k
                    k = k+1
            self.t.append(self.l)
        
    def fit(self,train):
        f = ["Pclass","Sex","Age","Parch","Fare","Embarked","Has_Cabin","FamilySize","IsAlone","Title"]
        for i in range(self.num_trees):
            cols = []
            count=0
            f = ["Pclass","Sex","Age","Parch","Fare","Embarked","Has_Cabin","FamilySize","IsAlone","Title"]
            m = choices(f,k=6)
            testing = train[m]
            for column in testing.columns:
                t = column+str(count)
                testing.rename(columns={column : t})
                cols.append(column+str(count))
                count+=1
            testing.columns = cols
            a = train["Survived"]
            testing["Survived"] = a
            self.tree(testing,0,0,tl=i)
            
    def find(self,f,r,tl):
        for i in range(self.max_depth+1):
            for j in range(2**i):
                if self.t[tl][i][j] == f:
                    self.t[tl][i][j] = r
                
    def printtree(self):
        for k in range(self.num_trees):
            for i in range(self.max_depth+1):
                print(self.t[k][i])
            print("\n")
                
    def predict(self,x):
        x["output"] = 0
        for j in range(len(self.dt)):
            self.p.append(x)
        for i in range(len(self.dt)):
            self.predict1(self.dt[i].start_node,self.p[i],i)
            
    def predict1(self,start,y,k):
        if start.o != None:
            y["output"] = start.o
            self.p[k].at[y.index,'output'] = start.o
            return
        ls = y[y[start.c]<=start.t]
        rs = y[y[start.c]>start.t]
        if start.left != None:
            self.predict1(start.left,ls,k)
        if start.right != None:
            self.predict1(start.right,rs,k)
        
    def tree(self,train,l,m,n=None,d=None,tl=None):
        minimum = 999999
        cri = self.criterion
        for c in train:
            if c == "Survived":
                continue
            i = train[c].unique()
            for j in i:
                ls = train[train[c]<=j]
                rs = train[train[c]>j]
                if len(ls[c])==0 or len(rs[c])==0:
                        continue
                ls1 = ls[c]
                rs1 = rs[c]
                if cri == "Gini impurity":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    gini_start = 1-(p**2 + q**2)
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    gini_l = 1-(lp**2 + lq**2)
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    gini_r = 1-(rp**2 + rq**2)
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (gini_l * lw) + (gini_r * rw)
                    bf = bf - gini_start
                if cri == "entropy":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    if p==0:
                        ent_start = (-q * (np.log2(q)))
                    elif q==0:
                        ent_start = (-p * (np.log2(p)))
                    else:
                        ent_start = (-p * (np.log2(p))) + (-q * (np.log2(q)))
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    if lp==0:
                        ent_l = (-lq * (np.log2(lq)))
                    elif lq==0:
                        ent_l = (-lp * (np.log2(lp)))
                    else:
                        ent_l = (-lp * (np.log2(lp))) + (-lq * (np.log2(lq)))
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    if rp==0:
                        ent_r = (-rq * (np.log2(rq)))
                    elif rq==0:
                        ent_r = (-rp * (np.log2(rp)))
                    else:
                        ent_r = (-rp * (np.log2(rp))) + (-rq * (np.log2(rq)))
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (ent_l * lw) + (ent_r * rw)
                    bf = bf - ent_start
                if cri == "misclassification rate":
                    p = len(train[train["Survived"]==1])/len(train[c])
                    q = 1-p
                    ent_start = 1 - max(p,q)
                    lp = len(ls[ls["Survived"]==1])/len(ls[c])
                    lq = 1 - lp
                    ent_l = 1 - max(lp,lq)
                    rp = len(rs[rs["Survived"]==1])/len(rs[c])
                    rq = 1 - rp
                    ent_r = 1 - max(rp,rq)
                    lw = len(ls[c])/len(train[c])
                    rw = len(rs[c])/len(train[c])
                    bf = (ent_l * lw) + (ent_r * rw)
                    bf = bf - ent_start
                if bf<minimum:
                    minimum = bf
                    b = c
                    t = j
                    ld = ls
                    rd = rs
        if minimum==999999:
            n.o = np.around(train["Survived"].mean())
            return
        ld = ld.drop(b,axis=1)
        rd = rd.drop(b,axis=1)
        if l==0:
            node = Node(t,b[:-1])
            self.find(m,[b[:-1],t],tl)
            self.dt.append(doublyLinkedList())
            self.dt[-1].InsertToEmptyList(node)
            self.tree(ld,l+1,(m*2)+1,node,"left",tl)
            self.tree(rd,l+1,(m*2)+2,node,"right",tl)
        else:
            node = Node(t,b[:-1])
            
            if d=="left":
                n.left = node
            else:
                n.right = node
            if l!=self.max_depth:
                self.find(m,[b[:-1],t],tl)
            if(l==self.max_depth):
                node.o = np.around(train["Survived"].mean())
                self.find(m,[len(train),node.o],tl)
                return 
            self.tree(ld,l+1,(m*2)+1,node,"left",tl)
            self.tree(rd,l+1,(m*2)+2,node,"right",tl)
    

In [15]:
model2 = RandomForest1("Gini impurity",3,10,6,"with",5,5)
model2.fit(train)
model2.printtree()
model2.predict(test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

[['Title', 1]]
[['Fare', 1], ['FamilySize', 4]]
[['Age', 1], ['Age', 0], ['Fare', 2], ['Age', 0]]
[[223, 0.0], [101, 0.0], [14, 0.0], [179, 0.0], [219, 1.0], [104, 1.0], [27, 0.0], [24, 0.0]]


[['Title', 1]]
[['Has_Cabin', 0], ['FamilySize', 4]]
[['FamilySize', 1], ['FamilySize', 4], ['Title', 4], ['Has_Cabin', 0]]
[7, 8, 9, 10, [300, 1.0], [23, 0.0], [47, 0.0], 14]


[['Title', 1]]
[['Fare', 1], ['FamilySize', 4]]
[['Age', 1], ['Age', 0], ['Title', 4], ['Age', 0]]
[[223, 0.0], [101, 0.0], [14, 0.0], [179, 0.0], [300, 1.0], [23, 0.0], [27, 0.0], [24, 0.0]]


[['Pclass', 2]]
[['Fare', 1], ['FamilySize', 4]]
[['Fare', 0], ['FamilySize', 1], ['Parch', 0], ['Fare', 1]]
[7, [86, 0.0], [119, 1.0], [183, 1.0], [381, 0.0], [56, 1.0], 13, [53, 0.0]]


[['Title', 1]]
[['Has_Cabin', 0], ['Has_Cabin', 0]]
[['Age', 1], ['Age', 2], ['Sex', 0], ['Age', 3]]
[[287, 0.0], [137, 0.0], [72, 0.0], [21, 0.0], [217, 1.0], [46, 0.0], [110, 1.0], 14]


[['Title', 1]]
[['Pclass', 1], ['Pclass', 2]]
[['Fare', 0



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [16]:
pred = []
for j in (model2.p[0].index):
    ones = 0
    zero = 0
    for k in range(model2.num_trees):
        if model2.p[k].iloc[j]["output"] == 1:
            ones += 1
        else:
            zero += 1
    if ones > zero:
        pred.append(1)
    else:
        pred.append(0)
        
s = 0
for i in range(len(pred)):
    if target.iloc[i]==pred[i]:
        s +=1
print("Accuracy = ",(s/len(pred))*100)
accuracies.append((s/len(pred))*100)

Accuracy =  94.97607655502392


# AdaBoost

In [17]:
class AdaBoost:
    
    def __init__(self,num_learners):
        self.num_learners = num_learners
        self.learning_rate = []
        self.weak_learner = DecisionTree("Gini impurity",3,5,5)
        self.alphas = []
        self.G_M = []
        self.M = None
        self.training_errors = []
        self.prediction_errors = []

    def fit(self, X):
        
        
        self.alphas = [] 
        self.training_errors = []
        self.M = self.num_learners
        M = self.M
       
        for m in range(0, M):
            if m == 0:
                w_i = np.ones(len(train["Survived"])) * 1 / len(train["Survived"])
            else:
                
                w_i = w_i * np.exp(alpha_m * (np.not_equal(train["Survived"], y_pred)).astype(int))
            G_m = DecisionTree("Gini impurity",3,5,5)
            G_m.fit(X, sample_weight = w_i)
            if m == 0:
                X["output"] = 0
            G_m.p = X
            G_m.predict(G_m.dt.start_node,X)
            y_pred = G_m.p["output"]
            
            self.G_M.append(G_m)
            
            error_m = (sum(w_i * (np.not_equal(train["Survived"], y_pred)).astype(int)))/sum(w_i)
            self.training_errors.append(error_m)

            alpha_m = np.log((1 - error_m) / error_m)
            self.learning_rate.append(alpha_m)
            
    def predict(self, X):
        X["output"] = 0
        weak_preds = pd.DataFrame(index = range(len(X)), columns = range(self.M)) 
        
        for m in range(self.M):
            self.G_M[m].predict(self.G_M[m].dt.start_node,X)
            pred = self.G_M[m].p["output"]
            y_pred_m = pred * self.learning_rate[m]
            weak_preds.iloc[:,m] = y_pred_m

        y_pred = (1 * np.sign(weak_preds.T.sum())).astype(int)

        return y_pred



In [18]:
m = AdaBoost(5)
m.fit(train)
pred = m.predict(train)

In [19]:
s = 0
p = train["Survived"]
for i in pred.index:
    if p.iloc[i]==pred.iloc[i]:
        s +=1
print("Accuracy = ",(s/len(pred))*100)
accuracies.append((s/len(pred))*100)

Accuracy =  82.37934904601572


In [20]:
print("|=================================================|====================================|")
print("|Classifier                                       |                   Accuracies       |")
print("|=================================================|====================================|")
print("|Decision Tree                                    |                 ",accuracies[0],"            |")
print("|Random Forest - Selecting Features               |                 ",accuracies[1],"            |")
print("|Random Forest - Sampling with Replacement        |                 ",accuracies[2],"|")
print("|AdaBoost                                         |                 ",accuracies[3],"|")
print("|=================================================|====================================|")

|Classifier                                       |                   Accuracies       |
|Decision Tree                                    |                  100.0             |
|Random Forest - Selecting Features               |                  100.0             |
|Random Forest - Sampling with Replacement        |                  94.97607655502392 |
|AdaBoost                                         |                  82.37934904601572 |
