## Decision Tree Classifier

- Using the dataset below, build a decision tree with `Buys` as the target variable to predict whether a customer will buy lipstick.
- Find the root node of the decision tree.
- Find the decision for the test data: [Age < 21, Income = Low, Gender = Female, Marital Status = Married]

### Imports

In [44]:
import numpy as np
import pandas as pd
from typing import Hashable, Any, Literal
# Type of the `criteria` argument taken by the information-gain and tree-building functions.
IG_CRITERION = Literal['gini', 'entropy']

### Decision Tree Functions

\begin{align}
Gini = 1 - \sum_{i=1}^{n} p^{2}\left(c_{i}\right)
\end{align}

In [45]:
def calculate_gini(df_label):
	"""Return the Gini impurity of a collection of class labels.

	The impurity is ``1 - sum(p_i ** 2)``, where ``p_i`` is the relative
	frequency of the i-th distinct label in ``df_label``.

	Args:
		df_label: Array-like of class labels (e.g. a DataFrame column).

	Returns:
		The Gini impurity as a float; ``0`` when every label is the same.
	"""
	# Only the per-class frequencies matter, not the class values themselves.
	_, class_counts = np.unique(df_label, return_counts=True)
	class_probabilities = class_counts / class_counts.sum()
	return 1 - np.sum(class_probabilities ** 2)

\begin{align}
E(S) = \sum_{i=1}^{c}-p_i\log_2p_i
\end{align}

In [46]:
def calculate_entropy(df_label):
	"""Return the Shannon entropy (in bits) of a collection of class labels.

	The entropy is ``sum(-p_i * log2(p_i))``, where ``p_i`` is the relative
	frequency of the i-th distinct label in ``df_label``.

	Args:
		df_label: Array-like of class labels (e.g. a DataFrame column).

	Returns:
		The entropy as a float; zero when every label is the same.
	"""
	# np.unique only reports classes that occur, so every probability is > 0
	# and log2 never sees a zero.
	_, class_counts = np.unique(df_label, return_counts=True)
	class_probabilities = class_counts / np.sum(class_counts)
	return np.sum(-class_probabilities * np.log2(class_probabilities))

\begin{align}
Gain(T,X) = Entropy(T) - Entropy(T,X), \qquad Entropy(T,X) = \sum_{v \in Values(X)} \frac{|T_v|}{|T|}\,Entropy(T_v)
\end{align}

In [47]:
def calculate_information_gain(dataset: pd.DataFrame, feature: Any, label: str, criteria: IG_CRITERION = "entropy"):
	"""Return the impurity reduction obtained by splitting ``dataset`` on ``feature``.

	The gain is ``impurity(parent) - sum_v (|T_v| / |T|) * impurity(T_v)``,
	where ``T_v`` is the subset of rows whose ``feature`` equals ``v``.
	The same impurity measure (chosen by ``criteria``) is used for the parent
	and for the partitions, so the result is a true entropy gain or a true
	Gini gain, never a mix of the two.

	Args:
		dataset: Rows to evaluate; must contain the ``feature`` and ``label`` columns.
		feature: Name of the column to split on.
		label: Name of the target column.
		criteria: Impurity measure, ``"entropy"`` or ``"gini"``.

	Returns:
		The gain as a float.

	Raises:
		KeyError: If ``criteria`` is not a supported impurity measure.
	"""
	# Explicit dispatch instead of a globals() lookup by constructed name.
	impurity_functions = {"gini": calculate_gini, "entropy": calculate_entropy}
	try:
		impurity = impurity_functions[criteria]
	except KeyError:
		raise KeyError(f"unknown criteria {criteria!r}; expected 'gini' or 'entropy'") from None

	labels = dataset[label]
	feature_column = dataset[feature]
	parent_impurity = impurity(labels)

	values, counts = np.unique(feature_column, return_counts=True)
	total = np.sum(counts)

	# A boolean mask selects exactly the rows of each partition; unlike
	# where(...).dropna() it does not discard rows that merely have a
	# missing value in some unrelated column.
	weighted_child_impurity = np.sum([
		(count / total) * impurity(labels[feature_column == value])
		for value, count in zip(values, counts)
	])

	return parent_impurity - weighted_child_impurity

In [48]:
def _majority_class(labels):
	"""Return the most frequent value in ``labels`` (ties go to the first in sorted order)."""
	classes, counts = np.unique(labels, return_counts=True)
	return classes[np.argmax(counts)]


def create_decision_tree(sub_df: pd.DataFrame, df: pd.DataFrame, features: pd.Index, label: str, parent: Any, criteria: IG_CRITERION = "entropy"):
	"""Recursively build an ID3-style decision tree.

	Args:
		sub_df: Rows that reached the current node.
		df: The complete training set; used only to pick a fallback class
			when ``sub_df`` is empty.
		features: Names of the columns still available for splitting (a
			``pd.Index`` or any sequence; recursive calls pass a list).
		label: Name of the target column.
		parent: Majority class of the parent node, or ``None`` at the root.
		criteria: Impurity measure passed to ``calculate_information_gain``.

	Returns:
		Either a leaf (a single class label) or a nested dict of the form
		``{feature: {feature_value: subtree_or_leaf, ...}}``.
	"""
	# An empty partition must be handled before anything indexes into the
	# (empty) array of classes; fall back to the overall majority class.
	if len(sub_df) == 0:
		return _majority_class(df[label])

	node_classes = np.unique(sub_df[label])

	# Pure node: every remaining row has the same label.
	if len(node_classes) == 1:
		return node_classes[0]

	# Majority class of *this* node, computed from this node's own counts.
	node_majority = _majority_class(sub_df[label])

	# Nothing left to split on: use the parent's majority class, or this
	# node's own majority when there is no parent (root call).
	if len(features) == 0:
		return parent if parent is not None else node_majority

	gain_values = [calculate_information_gain(sub_df, feature, label, criteria) for feature in features]
	optimum_feature = features[int(np.argmax(gain_values))]
	sub_features = [feature for feature in features if feature != optimum_feature]

	branches = {}
	for value in np.unique(sub_df[optimum_feature]):
		# Boolean indexing keeps exactly the matching rows; where().dropna()
		# would also drop rows with a missing value in any other column.
		sub_data = sub_df[sub_df[optimum_feature] == value]
		branches[value] = create_decision_tree(sub_data, df, sub_features, label, node_majority, criteria)

	return {optimum_feature: branches}

In [49]:
def predict(decision_tree: dict[Hashable, dict], test_data: pd.Series):
	"""Classify one sample by walking the decision tree from the root to a leaf.

	Args:
		decision_tree: A tree as returned by ``create_decision_tree``: either
			a nested dict ``{feature: {feature_value: subtree_or_leaf}}`` or
			a bare leaf label (the tree for a dataset with a single class).
		test_data: Feature values of the sample, indexed by feature name.

	Returns:
		The class label stored at the leaf reached by the sample. An empty
		dict node yields ``0``, as in the previous implementation.

	Raises:
		KeyError: If the sample lacks a feature the tree splits on, or has a
			feature value that the tree has no branch for.
	"""
	node = decision_tree

	# Each internal node is a single-key dict naming the feature to test;
	# anything that is not a dict is a leaf label.
	while isinstance(node, dict):
		if not node:
			# Kept for backward compatibility: an empty node has no decision.
			return 0
		feature = next(iter(node))
		node = node[feature][test_data[feature]]

	return node


### Testing

In [50]:
# Each row is [Age, Income, Gender, Marital Status, Buys]; see `columns` below.
raw_train_data = [
    ['<21', 'High', 'Male', 'Single', 'No'],
    ['<21', 'High', 'Male', 'Married', 'No'],
    ['21-35', 'High', 'Male', 'Single', 'Yes'],
    ['>35', 'Medium', 'Male', 'Single', 'Yes'],
    ['>35', 'Low', 'Female', 'Single', 'Yes'],
    ['>35', 'Low', 'Female', 'Married', 'No'],
    ['21-35', 'Low', 'Female', 'Married', 'Yes'],
    ['<21', 'Medium', 'Male', 'Single', 'No'],
    ['<21', 'Low', 'Female', 'Married', 'Yes'],
    ['>35', 'Medium', 'Female', 'Single', 'Yes'],
    ['<21', 'Medium', 'Female', 'Married', 'Yes'],
    ['21-35', 'Medium', 'Male', 'Married', 'Yes'],
    ['21-35', 'High', 'Female', 'Single', 'Yes'],
    ['>35', 'Medium', 'Male', 'Married', 'No']
]
columns = ['Age', 'Income', 'Gender', 'Marital Status', 'Buys']
# Hold out row 8 (['<21', 'Low', 'Female', 'Married', 'Yes']) as the test sample.
# pop() also removes it from raw_train_data, so 13 rows remain for training.
raw_test_data = raw_train_data.pop(8)

In [51]:
train_data = pd.DataFrame(raw_train_data, columns=columns) # or pd.read_csv, whichever is available
train_data.head()

Unnamed: 0,Age,Income,Gender,Marital Status,Buys
0,<21,High,Male,Single,No
1,<21,High,Male,Married,No
2,21-35,High,Male,Single,Yes
3,>35,Medium,Male,Single,Yes
4,>35,Low,Female,Single,Yes


In [52]:
# Strip the trailing 'Buys' entry from both the held-out row and the column
# names so the sample carries feature values only.
test_data = pd.Series(
    data=raw_test_data[:-1],
    index=columns[:-1],
)
test_data

Age                   <21
Income                Low
Gender             Female
Marital Status    Married
dtype: object

In [53]:
# Every column except the last one ('Buys') is a candidate split feature.
features = train_data.columns[:-1]

# At the root the node's rows are the full training set and there is no parent.
decision_tree = create_decision_tree(
    sub_df=train_data,
    df=train_data,
    features=features,
    label='Buys',
    parent=None,
    criteria="entropy",
)
decision_tree

{'Age': {'21-35': 'Yes',
  '<21': {'Gender': {'Female': 'Yes', 'Male': 'No'}},
  '>35': {'Marital Status': {'Married': 'No', 'Single': 'Yes'}}}}

In [54]:

# Classify the held-out sample with the tree built above.
prediction = predict(decision_tree=decision_tree, test_data=test_data)
prediction

'Yes'