In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("Train.csv")

In [3]:
#data.head(n=10)# Survived is the final value output values

In [4]:
# The features which are not useful
columns_to_drop=["name","boat","ticket","body","cabin","embarked","home.dest"]

In [5]:
data_clean = data.drop(columns_to_drop,axis=1)

In [6]:
#data_clean.info()

In [7]:
submission_format=pd.read_csv("sample_submission.csv")

In [8]:
print(submission_format.shape)

(300, 2)


In [9]:
#print(submission_format)

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
le = LabelEncoder()
data_clean["sex"] = le.fit_transform(data_clean["sex"])

In [12]:
# Data Imputation
# Fill each column's missing entries with that column's own mean.
# Passing the single scalar data_clean["age"].mean() to fillna would write the
# mean *age* into every NaN of every column (e.g. a missing fare), not just age.
data_clean = data_clean.fillna(data_clean.mean(numeric_only=True))

In [13]:
data_clean.head(n=5)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,29.838978,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,29.838978,2.0,0.0,23.25


In [14]:
# Split the cleaned frame into the feature columns (model inputs) and the
# target column (the label the tree learns to predict).
input_cols = ['pclass', "sex", "age", "sibsp", "parch", "fare"]
output_cols = ["survived"]

X, Y = data_clean[input_cols], data_clean[output_cols]

# Sanity check: both selections are DataFrames with the same number of rows
print(X.shape, Y.shape)
print(type(X))

(1009, 6) (1009, 1)
<class 'pandas.core.frame.DataFrame'>


In [15]:
def entropy(col):
    """Return the base-2 Shannon entropy of the values in ``col``.

    ``col`` is any 1-D array-like exposing ``.shape`` (numpy array or pandas
    Series). Each distinct value contributes ``-p * log2(p)``, where ``p`` is
    the fraction of entries equal to that value.
    """
    # np.unique returns (distinct values, occurrences of each); only the
    # occurrence counts matter for the entropy.
    _, frequencies = np.unique(col, return_counts=True)
    total = float(col.shape[0])

    weighted_log_sum = 0.0
    for freq in frequencies:
        prob = freq / total
        weighted_log_sum += prob * np.log2(prob)

    # Every term above is <= 0, so flip the sign to get a non-negative entropy
    return -1.0 * weighted_log_sum

In [16]:
# fkey is the name of the column to split on and fval is the threshold value (callers pass the mean of that column).
# Rows whose value in that column is greater than fval go to the right node; all other rows go to the left node.
def divide_data(x_data,fkey,fval):
    """Split ``x_data`` into two frames by thresholding one column.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to split.
    fkey : column label
        Column whose values are compared against the threshold.
    fval : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``x_data[fkey] > fval``; ``x_left``
        holds every other row (including rows where the value is NaN, since
        NaN never compares greater). Both keep the columns and the original
        index labels of ``x_data``.
    """
    # One vectorised comparison replaces the per-row DataFrame.append loop:
    # append was quadratic, was removed in pandas 2.0, and the .loc[ix] lookup
    # over range(n) only worked when the index was exactly 0..n-1.
    goes_right = x_data[fkey] > fval

    x_right = x_data[goes_right]
    # Negate the mask (rather than testing <=) so NaN rows still land on the left
    x_left = x_data[~goes_right]

    return x_left,x_right


In [17]:
def information_gain(x_data,fkey,fval):
    """Information gain on ``survived`` from splitting ``x_data`` at ``fval`` on ``fkey``.

    The rows are partitioned with ``divide_data``; the gain is the entropy of
    the parent's ``survived`` column minus the size-weighted entropies of the
    two partitions. If either partition is empty the split is useless and the
    sentinel ``-1000000`` is returned so it can never win an argmax.
    """
    total_rows = x_data.shape[0]
    left_part, right_part = divide_data(x_data, fkey, fval)
    n_left, n_right = left_part.shape[0], right_part.shape[0]

    # Fraction of the parent's rows that fell on each side
    weight_left = float(n_left) / total_rows
    weight_right = float(n_right) / total_rows

    # Every row ended up on one side: nothing was separated
    if not (n_left and n_right):
        return -1000000 #Min Information Gain

    # Entropy is always measured on the survived (label) column
    parent_entropy = entropy(x_data.survived)
    children_entropy = (
        weight_left * entropy(left_part.survived)
        + weight_right * entropy(right_part.survived)
    )
    return parent_entropy - children_entropy

In [18]:
# Test our function
for fx in X.columns:
    print(fx)
    print(information_gain(data_clean,fx,data_clean[fx].mean()))

pclass
0.055456910002982474
sex
0.19274737190850932
age
0.0010525742338489685
sibsp
0.006492394392888956
parch
0.019756080122948272
fare
0.04242793401428169


In [40]:
# max_depth caps how deep the tree is allowed to grow (default 4). If the tree were allowed to grow
# to its fullest it could overfit the training data, leading to poor generalisation.
class DecisionTree:
    """One node of a binary decision tree that predicts "survive" or "dead".

    Each node splits on a single feature at that feature's mean value, chosen
    by maximum information gain. Training recurses into child nodes until a
    split leaves one side empty or ``max_depth`` is reached.
    """

    #Constructor
    def __init__(self,depth=0,max_depth=4):
        # Child nodes; both stay None on a leaf
        self.left = None
        self.right = None
        # Feature (column name) this node splits on
        self.fkey = None
        # Threshold applied to that feature
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        # Label this node predicts when it has no child to descend into
        self.target = None

    def _majority_label(self, rows):
        """Return "survive" if at least half of ``rows`` survived, else "dead"."""
        return "survive" if rows.survived.mean() >= 0.5 else "dead"

    def train(self,X_train):
        """Fit this node (and, recursively, its children) on ``X_train``.

        ``X_train`` must contain the six feature columns listed below plus a
        ``survived`` column. Prints the feature chosen at every node.
        """
        features = ['pclass','sex','age','sibsp', 'parch', 'fare']

        # Score every candidate feature, always splitting at its mean
        info_gains = [
            information_gain(X_train, name, X_train[name].mean())
            for name in features
        ]

        # Keep the feature with the highest gain and its mean as the threshold
        self.fkey = features[np.argmax(info_gains)]
        self.fval = X_train[self.fkey].mean()
        print("Making Tree Features is",self.fkey)

        #Split Data
        data_left,data_right = divide_data(X_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)

        # A node is a leaf when the split separated nothing or the depth cap is hit
        one_side_empty = data_left.shape[0] == 0 or data_right.shape[0] == 0
        is_leaf = one_side_empty or self.depth >= self.max_depth

        if not is_leaf:
            #Recursive Case
            self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
            self.left.train(data_left)

            self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
            self.right.train(data_right)

        # Every node, leaf or internal, records the majority label of its rows
        self.target = self._majority_label(X_train)
        return

    def predict(self,test):
        """Return the predicted label for one sample.

        ``test`` is indexed by feature name (e.g. a DataFrame row). Values
        above the node's threshold descend right, all others left; when the
        chosen child is missing this node's own target is returned.
        """
        branch = self.right if test[self.fkey] > self.fval else self.left
        if branch is None:
            return self.target
        return branch.predict(test)
        

In [41]:
dt = DecisionTree()

In [42]:
dt.train(data_clean)

Making Tree Features is sex
Making Tree Features is pclass
Making Tree Features is pclass
Making Tree Features is fare
Making Tree Features is sibsp
Making Tree Features is sibsp
Making Tree Features is parch
Making Tree Features is fare
Making Tree Features is age
Making Tree Features is parch
Making Tree Features is sibsp
Making Tree Features is fare
Making Tree Features is age
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is parch
Making Tree Features is age
Making Tree Features is fare
Making Tree Features is sibsp
Making Tree Features is age
Making Tree Features is pclass
Making Tree Features is parch
Making Tree Features is pclass
Making Tree Features is age
Making Tree Features is parch
Making Tree Features is sibsp
Making Tree Features is sibsp
Making Tree Features is sibsp
Making Tree Features is age


In [43]:
test_data=pd.read_csv("Test.csv")

In [44]:
# Reuse the encoder already fitted on the training data (transform, not fit_transform)
# so test rows get exactly the same category -> integer mapping the tree was trained on.
test_data["sex"] = le.transform(test_data["sex"])

In [45]:
columns_to_drop=["name","boat","ticket","body","cabin","embarked","home.dest"]

In [46]:
test_data_clean=test_data.drop(columns_to_drop,axis=1)

In [47]:
print(test_data_clean.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
pclass    300 non-null float64
sex       300 non-null int64
age       234 non-null float64
sibsp     300 non-null float64
parch     300 non-null float64
fare      300 non-null float64
dtypes: float64(5), int64(1)
memory usage: 14.1 KB
None


In [48]:
print(test_data_clean.head(n=5))

   pclass  sex   age  sibsp  parch     fare
0     1.0    1  36.0    0.0    0.0  26.3875
1     3.0    0   NaN    8.0    2.0  69.5500
2     1.0    1   NaN    0.0    0.0  50.0000
3     2.0    1  34.0    0.0    0.0  13.0000
4     2.0    1  28.0    0.0    0.0  13.0000


In [49]:
print(test_data_clean.shape)

(300, 6)


In [50]:
test_data_clean.loc[0]

pclass     1.0000
sex        1.0000
age       36.0000
sibsp      0.0000
parch      0.0000
fare      26.3875
Name: 0, dtype: float64

In [51]:
print(dt.fkey)

sex


In [52]:
# Impute missing test values with the training-column means so test rows are
# treated like the mean-imputed training rows. Without this, a NaN (e.g. a
# missing age) compares False against every threshold and is silently routed
# to the left branch.
test_features = test_data_clean.fillna(data_clean[test_data_clean.columns].mean())

# iterrows yields each row as a Series, which DecisionTree.predict indexes by
# column name; unlike .loc[ix] over range(n) it does not assume a 0..n-1 index.
y_pred = [dt.predict(row) for _, row in test_features.iterrows()]

In [53]:
#print(y_pred)

In [54]:
# Encode the predicted labels with the same 0/1 convention as the survived column.
# An explicit mapping keeps "survive" -> 1 even when only one class was predicted
# (a freshly fitted LabelEncoder would number whatever labels it sees from 0),
# and it leaves the `le` encoder fitted on the training data untouched.
label_to_int = {"dead": 0, "survive": 1}
y_pred = np.array([label_to_int[label] for label in y_pred])

In [55]:
#print(y_pred)

In [56]:
print(y_pred.shape)

(300,)


In [57]:
df=pd.DataFrame(data=y_pred,columns=['survived'])

In [58]:
df.to_csv("Answer.csv",index=True,index_label='Id')

In [59]:
ans=pd.read_csv("Answer.csv")

In [60]:
print(ans.shape)

(300, 2)
