In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import math
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
def xlogx(x):
    return x * np.log2(x)

def cal_entropy(train_data, train_labels):
    entropy = 0
    values = train_labels.unique()
    for value in values:
        fraction = train_labels.value_counts()[value]/len(train_data)
        entropy -= xlogx(fraction)
    return entropy

def cal_entropy_feature(train_data, train_labels, feature):
    target_variables = train_labels.unique()
    variables = train_data[feature].unique()
    feature_entropy = 0
    epsilon = np.finfo(float).eps
    for variable in variables:
        entropy = 0
        feat = train_data[feature]
        feat_var = feat[feat == variable]
        den = len(feat_var)
        for target_variable in target_variables:
            num = len(feat_var[train_labels == target_variable])
            fraction =  num/(den + epsilon)
            entropy -= fraction*np.log(fraction + epsilon)
        fraction = den/len(train_data)
        feature_entropy -= fraction * entropy
    return abs(feature_entropy)

def get_best_feature(train_data, train_labels):
    IG = []
    for key in train_data.keys():
        IG.append(cal_entropy(train_data, train_labels) -
         cal_entropy_feature(train_data, train_labels, key))
    return train_data.keys()[np.argmax(IG)]

def data_cleanup(df):
    """
    Handling missing data using an imputing strategy and converting numerical columns to categorical.
    For now:
    Missing categorical data is filled with mode.
    Missing numerical data is filled with median.
    """

    # Fill categorical data with mode
    df['Ever_Married'].fillna(df['Ever_Married'].mode()[0], inplace=True)
    df['Graduated'].fillna(df['Graduated'].mode()[0], inplace=True)
    df['Profession'].fillna(df['Profession'].mode()[0], inplace=True)
    df['Var_1'].fillna(df['Var_1'].mode()[0], inplace=True)

    # Fill numerical data with median
    df['Work_Experience'].fillna(df['Work_Experience'].median(), inplace=True)
    df['Family_Size'].fillna(df['Family_Size'].median(), inplace=True)

    # Convert age to categorical using label encoding based on certain range
    df['Age'] = df['Age'].mask(df['Age'] <= 12, 1)
    df['Age'] = df['Age'].mask((df['Age'] > 12) & (df['Age'] <= 19), 2)
    df['Age'] = df['Age'].mask((df['Age'] > 19) & (df['Age'] <= 26), 3)
    df['Age'] = df['Age'].mask((df['Age'] > 26) & (df['Age'] <= 45), 4)
    df['Age'] = df['Age'].mask((df['Age'] > 45) & (df['Age'] <= 60), 5)
    df['Age'] = df['Age'].mask(df['Age'] > 60, 6)

    # Convert Work_Experience to categorical
    df['Work_Experience'] = df['Work_Experience'].mask(df['Work_Experience'] <= 2, 1)
    df['Work_Experience'] = df['Work_Experience'].mask((df['Work_Experience'] > 2) & (df['Work_Experience'] <= 5), 2)
    df['Work_Experience'] = df['Work_Experience'].mask((df['Work_Experience'] > 5) & (df['Work_Experience'] <= 8), 3)
    df['Work_Experience'] = df['Work_Experience'].mask(df['Work_Experience'] > 8, 4)

    # Convert Family_Size to categorical
    df['Family_Size'] = df['Family_Size'].mask(df['Family_Size'] <= 2, 1)
    df['Family_Size'] = df['Family_Size'].mask((df['Family_Size'] > 2) & (df['Family_Size'] <= 4), 2)
    df['Family_Size'] = df['Family_Size'].mask(df['Family_Size'] > 4, 3)

    # Drop features with 3 or more columns as NaN
    idxs = []
    for i in range(len(df)):
        cnt = df.iloc[i].isnull().sum()
        if(cnt >= 3):
            idxs.append(i)
    df.drop(idxs, axis = 0, inplace=True)
    # df = df.astype(str)
    return df

In [None]:
# Implement the Node class for Decision Tree
class Node:
    """
    The Node class for our decision tree.
    It contains the attributes required for each node and functions for various tasks.
    """

    def __init__(self, attr, prob_label):
        """
        Initializes a node with proper values.
        Args:
            attr (str): The decision attribute selected for the node
                    on the basis of which we split the tree further.
            prob_label (int): This is the most probable outcome if we were to convert this 
                    node to a leaf. It is calculated by determining which outcome 
                    occurs the most in the data points we have at this node.
        """
        self.attr = attr
        self.prob_label = prob_label
        self.children = {}         # dictionary of children nodes
    
    def is_leaf(self):
        """
        Checks if the given node is a leaf.
        Returns:
            bool: True, if the node is a leaf, otherwise False.
        """
        return not self.children       # if the node has no children, it is a leaf
    
    def node_count(self):
        """
        Finds the number of nodes in the subtree rooted at the given node.
        Returns:
            int: Number of nodes in the subtree.
        """
        if self.is_leaf():
            return 1
        count = 1
        for child in self.children.values():
            count += child.node_count()
        return count
    
class DecisionTree:
    """
    The main Decision Tree class having metadata for the decision tree, and functions
    for various operations of the decision tree.
    """
    def __init__(self, max_depth=10, min_samples=1):
        """
        Initializes a decision tree with proper metadata.
        Args:
            max_depth (int, optional): Maxmimum depth of the decision tree. Defaults to 15.
            min_samples (int, optional): Minimum number of samples that must be present to
                    branch the tree further. Defaults to 1.
        """
        self.root = None
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.tree_depth = 0
    
    def train(self, train):
        """
        Trains the decision tree model.
        Args:
            train (pd.DataFrame): The training dataset.
        """
        train_data = train.drop(['ID', 'Segmentation'], axis=1)
        train_labels = train['Segmentation']
        self.root = self.build_tree(train_data, train_labels)
    
    def build_tree(self, train_data, train_labels, depth=0):
        """
        Builds the entire decision tree recursively, by selecting the feature to be split on,
        and then splitting the data into all unique values of that feature.
        Args:
            train_data (pd.DataFrame): The training dataset without the output labels.
            train_labels (pd.Series): The output labels for each row in the training dataset.
            depth (int, optional): Depth of the current node. Defaults to 0.
        Returns:
            Node: Root node of the tree
        """

        # if maximum depth is reached, or if we do not have enough samples, or all the samples have the same outcome label
        # then make this node a leaf
        if (depth == self.max_depth) or (len(train_data) <= self.min_samples) or (len(train_labels.unique()) == 1):
            return self.create_leaf(train_labels)

        attr = get_best_feature(train_data, train_labels)
        node = Node(attr, train_labels.value_counts().sort_index().idxmax())

        # # handle missing data in the best attribute
        # if train_data[attr].isnull().any():
        #     print(attr)
        #     print(train_data[attr].mode())
        #     train_data[attr].fillna(train_data[attr].mode()[0], inplace = True)

        # split the data into all unique values of the best attribute
        for val in train_data[attr].unique():
            data_subset = train_data[train_data[attr] == val]
            data_subset.drop(attr, axis=1, inplace=True)
            labels_subset = train_labels[data_subset.index]

            if data_subset.empty:
                return self.create_leaf(train_labels)

            node.children[val] = self.build_tree(data_subset, labels_subset, depth + 1)

        self.tree_depth = max(self.tree_depth, depth)
        return node

    def create_leaf(self, labels):
        """
        Creates and returns a leaf node for the decision tree.
        Args:
            labels (pd.Series): The output labels of the data points at this node.
        Returns:
            Node: The leaf node created.
        """
        prob_label = labels.value_counts().sort_index().idxmax()
        return Node('Segmentation', prob_label)


    def predict_one(self, test_instance, root):
        """
        Predicts the outcome on one row of data i.e. one test instance.
        Args:
            test_instance (dict): The test instance for which prediction is to be made.
            root (Node): The root node of the decision tree.
        Returns:
            string: Returns the predicted value from the set of labels.
        """
        if root.is_leaf():
            return root.prob_label
        edge = test_instance[root.attr]

        if edge not in root.children:
            return root.prob_label
        return self.predict_one(test_instance, root.children[edge])

    def predict(self, test_data):
        """
        Predicts the outcome on a set of test data.
        Args:
            test_data (pd.DataFrame): The test dataset for which predictions are to be made.
        Returns:
            pd.Series: Predicted outcomes (series of 0, 1 values) for the test dataset.
        """
        predictions = pd.Series([self.predict_one(row, self.root) for row in test_data.to_dict(orient='records')])
        return predictions

In [None]:
data = pd.read_csv('Dataset_A.csv')
data = data_cleanup(data)

for i in range(1, 11):
    train_data, test_data = train_test_split(data, test_size = 0.2, random_state=42+i)
    dt = DecisionTree(max_depth = len(train_data.columns)-2)
    dt.train(train_data)
    print(dt.tree_depth)
    pred = dt.predict(test_data)
    acc = accuracy_score(test_data.iloc[:, -1], pred)
    print(f"Seed value = {(42+i)}, accuracy = {(acc*100.0)}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 43, accuracy = 45.10532837670384


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 44, accuracy = 45.16728624535316


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 45, accuracy = 46.84014869888476


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 46, accuracy = 47.70755885997522


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 47, accuracy = 44.85749690210657


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 48, accuracy = 45.72490706319702


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 49, accuracy = 45.66294919454771


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 50, accuracy = 45.66294919454771


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


7
Seed value = 51, accuracy = 46.77819083023544


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
