## Computing Gini impurity for breast cancer classification

In [1]:
import numpy as np

def compute_gini(class_frequencies):
    """Return the Gini impurity of a node given its per-class sample counts.

    The impurity is ``1 - sum(p_i ** 2)``, where ``p_i`` is the fraction of
    samples that belong to class ``i``.

    Parameters
    ----------
    class_frequencies : array-like of numbers
        Number of samples observed for each class.

    Returns
    -------
    float
        The Gini impurity. A node with no samples at all (empty input or all
        counts equal to zero) is reported as ``0.0``.
    """
    counts = np.asarray(class_frequencies, dtype=float)
    total = counts.sum()
    if total == 0:
        # Without this guard the division below yields NaN (0 / 0), which
        # would propagate into any weighted impurity computed from it.
        return 0.0
    probabilities = counts / total
    return 1 - np.sum(probabilities ** 2)

# Each row is one example node: [count of class 0, count of class 1].
class_dist_matrix = np.array([[150, 0], [10, 90], [60, 40], [50, 50]])

# Walk the rows directly and report each node's impurity to two decimals.
for class_dist in class_dist_matrix:
    gini_impurity = compute_gini(class_dist)
    print("{:.2f}".format(gini_impurity))

0.00
0.18
0.48
0.50


## Decision tree implementation with Python from scratch

In [None]:
# Let's import the necessary libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn import datasets

# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Build one DataFrame holding the feature columns followed by a trailing
# 'target' column. Stacking the labels next to the float features stores
# them as floats too (e.g. 0.0).
data = pd.DataFrame(
    np.column_stack((iris.data, iris.target)),
    columns=[*iris.feature_names, 'target'],
)
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
# Function to split dataset into train and test set
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training and a test DataFrame.

    Rows are selected by position rather than by index label, so the result
    is a true partition of the rows even when ``df.index`` contains
    duplicate labels.

    Randomness comes from the global ``random`` module; call
    ``random.seed(...)`` beforehand for a reproducible split.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    test_size : int or float
        An int is taken as the number of test rows. A float is taken as a
        fraction of ``len(df)`` and rounded to the nearest integer.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``. ``test_df`` holds the sampled rows in the
        order they were drawn; ``train_df`` holds every remaining row in its
        original order.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when the resolved test size is
        negative or larger than ``len(df)``.
    """
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    n_rows = len(df)
    # Sample row positions (not labels) so duplicate index labels cannot
    # pull extra rows into the test set or out of the training set.
    test_positions = random.sample(population=range(n_rows), k=test_size)

    is_train = np.ones(n_rows, dtype=bool)
    is_train[test_positions] = False

    test_df = df.iloc[test_positions]
    train_df = df.iloc[is_train]

    return train_df, test_df

In [7]:
# Try the split function on the iris DataFrame built above, holding out 20 rows for testing.
random.seed(0)  # Fix the seed so the random split is reproducible across runs
train_df, test_df = train_test_split(data, test_size=20)
train_df.shape, test_df.shape

((130, 5), (20, 5))

## Helper Functions

In [None]:
# Function to calculate Gini impurity