In [179]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

In [88]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [89]:
penguins=pd.read_csv('/content/drive/MyDrive/Data_PRML/penguins.csv')

In [90]:
penguins=penguins.dropna()
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [91]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
 7   year               333 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 23.4+ KB


In [101]:
class Node:
  def __init__(self,dataset,left_branch,right_branch,Best_feature):
    self.predictions=dataset['species'].value_counts().nlargest(1)
    self.left=left_branch
    self.right=right_branch
    self.feature=Best_feature

In [93]:
def gini_index(dataset,threshold,feature):
  main_values=dataset['species'].unique()
  left_df=dataset[dataset[feature]<=threshold]
  right_df=dataset[dataset[feature]>threshold]
  a=0
  b=0
  for val in main_values:
    a+=(left_df[left_df['species']==val].shape[0]/left_df.shape[0])**2
  a=1-a
  for val in main_values:
    b+=(right_df[right_df['species']==val].shape[0]/right_df.shape[0])**2
  b=1-b
  g=(((left_df.shape[0])/dataset.shape[0])*a)+(((right_df.shape[0])/dataset.shape[0])*b) 
  return g 

In [94]:
def cont_to_cat(dataset,feature):
  Min=dataset[feature].min()
  Max=dataset[feature].max()
  List=list(np.arange(Min,Max,1))
  G=1
  threshold=None
  for val in List:
    g=gini_index(dataset,val,feature)
    if(G>g):
      G=g
      threshold=val
  dataset.loc[dataset[feature]<=threshold,feature]=0
  dataset.loc[dataset[feature]>threshold,feature]=1    
  print(threshold)
  return G   

In [95]:
features=['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']
for feature in features:
  g=cont_to_cat(penguins,feature)

43.1
16.1
206.0
4500.0


In [96]:
penguins.head(20)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,0.0,1.0,0.0,0.0,male,2007
1,Adelie,Torgersen,0.0,1.0,0.0,0.0,female,2007
2,Adelie,Torgersen,0.0,1.0,0.0,0.0,female,2007
4,Adelie,Torgersen,0.0,1.0,0.0,0.0,female,2007
5,Adelie,Torgersen,0.0,1.0,0.0,0.0,male,2007
6,Adelie,Torgersen,0.0,1.0,0.0,0.0,female,2007
7,Adelie,Torgersen,0.0,1.0,0.0,1.0,male,2007
12,Adelie,Torgersen,0.0,1.0,0.0,0.0,female,2007
13,Adelie,Torgersen,0.0,1.0,0.0,0.0,male,2007
14,Adelie,Torgersen,0.0,1.0,0.0,0.0,male,2007


In [112]:
def get_feature(dataset):
  best=None
  gini=1
  features=['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']
  for feature in features:
    b=cont_to_cat(dataset,feature)
    if(gini>b):
      gini=b
      best=feature
  return gini,best 

In [154]:
user_prompt = 0.3
user_enable = True
Y=penguins['species']
X=penguins[['species','bill_length_mm','bill_depth_mm','flipper_length_mm'	,'body_mass_g']]
x_train,x_test,y_train,y_test = tts(X,Y,test_size=user_prompt,shuffle=user_enable)

In [113]:
def build_tree(dataset,max_depth):
  if(max_depth==0):
    return Node(dataset,None,None,None)
  elif(len(dataset)==0):
    return None
  else:
    gini,best_feature=get_feature(dataset)
    print(best_feature)
    if(best_feature!=None):
      left_df=dataset[dataset[best_feature]==0]
      right_df=dataset[dataset[best_feature]==1]
      left_branch=build_tree(left_df,max_depth-1)
      right_branch=build_tree(right_df,max_depth-1)
      return Node(dataset,left_branch,right_branch,best_feature)
    else:
      return Node(dataset,None,None,None)  

In [157]:
my_tree=build_tree(x_train,3)

0.0
0.0
0.0
0.0
flipper_length_mm
0.0
0.0
None
0.0
bill_length_mm
None
0.0
None
0.0
body_mass_g
None
0.0
None
0.0
bill_depth_mm
0.0
0.0
None
0.0
bill_depth_mm
0.0
None
None
0.0
bill_length_mm
0.0
None
None
0.0
body_mass_g


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

In [156]:
def print_tree(node, spacing=""):
    if isinstance(node, Node):
        print (spacing + "Predict", node.predictions)
    if(node!=None):    
      print (spacing + str(node.feature))
      print (spacing + '--> Left:')
      print_tree(node.left, spacing + "  ")
      print (spacing + '--> Right:')
      print_tree(node.right, spacing + "  ")

In [158]:
print_tree(my_tree)

Predict Adelie    99
Name: species, dtype: int64
flipper_length_mm
--> Left:
  Predict Adelie    97
Name: species, dtype: int64
  bill_length_mm
  --> Left:
    Predict Adelie    92
Name: species, dtype: int64
    body_mass_g
    --> Left:
      Predict Adelie    88
Name: species, dtype: int64
      None
      --> Left:
      --> Right:
    --> Right:
      Predict Adelie    4
Name: species, dtype: int64
      None
      --> Left:
      --> Right:
  --> Right:
    Predict Chinstrap    39
Name: species, dtype: int64
    bill_depth_mm
    --> Left:
      Predict Gentoo    1
Name: species, dtype: int64
      None
      --> Left:
      --> Right:
    --> Right:
      Predict Chinstrap    39
Name: species, dtype: int64
      None
      --> Left:
      --> Right:
--> Right:
  Predict Gentoo    87
Name: species, dtype: int64
  bill_depth_mm
  --> Left:
    Predict Gentoo    77
Name: species, dtype: int64
    bill_length_mm
    --> Left:
      Predict Gentoo    7
Name: species, dtype: int64
  

In [165]:
def classify(row, node):
    if(node.left==None or node.right==None):
      return node.predictions 
    elif(row[node.feature]==0):
      return classify(row,node.left)
    elif(row[node.feature]==1):
      return classify(row,node.right)
         

In [177]:
pred=[]
for row in x_test.iterrows():
  pred.append(str(classify(row[1],my_tree)).split()[0])

In [180]:
accuracy_score(y_test,pred)

0.97


# Q2