In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read train.csv into a DataFrame and preview its first five rows.
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# PREPROCESSING

Dropping columns that are not used as features (`Name`, `Ticket`, `Cabin`, `Embarked`)

In [8]:
# Columns removed before modelling.
dropped_labels = ['Name', 'Ticket', 'Cabin', 'Embarked']
# Reassign instead of mutating in place, and tolerate already-missing columns,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=dropped_labels, errors='ignore')


In [9]:
# Preview the first five rows; Name, Ticket, Cabin and Embarked should be gone.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05


In [11]:
# Summarise dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
dtypes: float64(2), int64(5), object(1)
memory usage: 55.8+ KB


Encoding the `Sex` column as a numeric label (female → 0, male → 1)

In [13]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct value of the column to an integer code;
# in the preview that follows, female is encoded as 0 and male as 1.
le= LabelEncoder()
# Overwrite the string column with its integer codes.
df['Sex']=le.fit_transform(df['Sex'])

In [14]:
# Preview the frame to confirm that Sex now holds integer codes.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22.0,1,0,7.25
1,2,1,1,0,38.0,1,0,71.2833
2,3,1,3,0,26.0,0,0,7.925
3,4,1,1,0,35.0,1,0,53.1
4,5,0,3,1,35.0,0,0,8.05


In [15]:
# Re-check dtypes: Sex should now be int64 instead of object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


Imputing missing (NaN) values in the `Age` column with the column mean

In [20]:
# Fill missing ages with the mean of the observed ages. The result is assigned
# back to the column: calling fillna(..., inplace=True) on the `df['Age']`
# selection is chained assignment, which recent pandas versions warn about and
# which leaves `df` unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [21]:
# Re-check non-null counts: Age should no longer have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


In [25]:
# Drop PassengerId: it is not among the model inputs selected later, and the
# summary printed by this cell lists only seven columns without it. No other
# cell in the notebook removes the column, so without this step a fresh
# Restart & Run All would not reproduce that output.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['PassengerId'], errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


Separating the input features (X) from the output label (Y)

In [28]:
# Names of the feature columns fed to the model.
input_labels = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
# Name of the column to predict, kept as a list so selection yields a DataFrame.
output_labels = ['Survived']

In [29]:
# Split the frame column-wise into the feature matrix X and the target frame Y.
X, Y = df[input_labels], df[output_labels]

# HELPER FUNCTIONS

In [108]:
def entropy(col):
    """Return the Shannon entropy (in bits) of the values in `col`.

    Parameters
    ----------
    col : array-like
        Class labels, e.g. a pandas Series, a NumPy array or a plain list.
        The number of samples is taken from the first dimension.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. An empty input and an input with a single distinct
        value both give ``0.0``.
    """
    values = np.asarray(col)  # lets plain lists work, not only objects with .shape
    n_samples = values.shape[0]
    if n_samples == 0:
        # No samples means no uncertainty; this also avoids dividing by zero.
        return 0.0

    _, class_counts = np.unique(values, return_counts=True)
    probabilities = class_counts / n_samples
    # Every count is >= 1, so each probability is > 0 and log2 is finite.
    bits = -np.sum(probabilities * np.log2(probabilities))
    # Adding 0.0 turns the IEEE negative zero of a pure column into +0.0.
    return float(bits) + 0.0
        
def divide_data(df, feature, threshold):
    """Split `df` into two frames by comparing one column with a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to partition. Any index is accepted, not only a 0..n-1 range.
    feature : str
        Name of the column to compare.
    threshold : scalar
        Split point.

    Returns
    -------
    (left, right) : tuple of pandas.DataFrame
        `left` holds the rows where ``df[feature] <= threshold`` (and rows
        where the value is NaN, since NaN is never greater than anything);
        `right` holds the rows where ``df[feature] > threshold``. Both keep
        the original columns, dtypes, row order and index labels.
    """
    # A boolean mask replaces the row-by-row DataFrame.append loop: append was
    # removed in pandas 2.0, copied the whole frame on every row, and the
    # positional `.loc[ix]` lookup failed on frames without a RangeIndex.
    goes_right = df[feature] > threshold
    left = df.loc[~goes_right]
    right = df.loc[goes_right]
    return left, right
    
    
def info_gain(df, feature, threshold, target="Survived"):
    """Information gain obtained by splitting `df` on `feature` at `threshold`.

    The gain is the entropy of the target column before the split minus the
    size-weighted entropies of the two partitions returned by `divide_data`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both `feature` and `target` columns.
    feature : str
        Column to split on.
    threshold : scalar
        Split point passed through to `divide_data`.
    target : str, default "Survived"
        Column whose entropy is measured. The default keeps existing
        three-argument calls working unchanged.

    Returns
    -------
    float or int
        The information gain, or the sentinel ``-100000`` when the split is
        degenerate: `df` has no rows, or one side of the split is empty.
    """
    total_rows = df.shape[0]
    if total_rows == 0:
        # Nothing to split; returning the sentinel avoids dividing by zero.
        return -100000

    l_data, r_data = divide_data(df, feature, threshold)
    left_rows, right_rows = l_data.shape[0], r_data.shape[0]
    if left_rows == 0 or right_rows == 0:
        # All rows fell on one side, so this threshold separates nothing.
        return -100000

    # Fraction of the rows that landed in each partition.
    left_weight = left_rows / float(total_rows)
    right_weight = right_rows / float(total_rows)

    split_entropy = left_weight * entropy(l_data[target]) + right_weight * entropy(r_data[target])
    return entropy(df[target]) - split_entropy
        
        
        
        

In [109]:
# Report, for every input feature, the information gain of a single split
# placed at that feature's mean value.
for fx in X.columns:
    print(fx, "-")
    split_at_mean = df[fx].mean()
    print("info gain-", info_gain(df, fx, split_at_mean))
    
    

Pclass -
info gain- 0.07579362743608165
Sex -
info gain- 0.2176601066606142
Age -
info gain- 0.001158644038169343
SibSp -
info gain- 0.009584541813400071
Parch -
info gain- 0.015380754493137694
Fare -
info gain- 0.042140692838995464


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.000000,1,0,7.2500
1,1,1,0,38.000000,1,0,71.2833
2,1,3,0,26.000000,0,0,7.9250
3,1,1,0,35.000000,1,0,53.1000
4,0,3,1,35.000000,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000
887,1,1,0,19.000000,0,0,30.0000
888,0,3,0,29.699118,1,2,23.4500
889,1,1,1,26.000000,0,0,30.0000


# MAKING THE DECISION TREE