In [95]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [130]:
def get_and_split_vegis_set(dataset, data_dir="G:/Meine Ablage/KI_Projekt/Daten"):
    """Load a vegetable feature CSV and return it as a shuffled NumPy array.

    Despite the name, this function does not split the data; it only loads,
    encodes and shuffles it.

    Parameters
    ----------
    dataset : str
        File name of the CSV inside ``data_dir``.
    data_dir : str, optional
        Directory that contains the CSV. The default is the original
        hard-coded location, so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        All columns that remain after dropping "Unnamed: 0", "green pixels"
        and "Image", in CSV order, with the rows shuffled (random_state=42).
        The "Label" column holds the integer class ids listed below.

    Raises
    ------
    ValueError
        If "Label" contains a value that is neither one of the known class
        names nor convertible to int.
    """
    # Class name -> integer id.
    label_ids = {
        'Karotte': 0,
        'Kartoffel': 1,
        'Zwiebel': 2,
        'Karotte_Trieb': 3,
        'Kartoffel_Trieb': 4,
        'Zwiebel_Trieb': 5,
    }

    vegis_df = pd.read_csv(f"{data_dir}/{dataset}")
    # "Unnamed: 0" is presumably a saved index column -- TODO confirm.
    vegis_df = vegis_df.drop(columns=["Unnamed: 0", "green pixels", "Image"])

    # Encode all labels in one pass. Values that are not a known class name
    # are passed through unchanged, so astype('int') still rejects them.
    # Building a new Series avoids writing ints into a string column.
    vegis_df['Label'] = (
        vegis_df['Label']
        .map(lambda label: label_ids.get(label, label))
        .astype('int')
    )
    print(vegis_df['Label'].value_counts())

    vegis_df = shuffle(vegis_df, random_state=42)

    return np.array(vegis_df)

In [93]:
class DecissionTree:
    """Binary decision tree classifier that splits on the Gini impurity.

    Training data is a sequence of rows (nested lists or a 2-D NumPy array).
    The last entry of every row is the class label, all other entries are
    numeric features.

    The fitted tree (``self.tree``) is a nested dict. Every split node has the
    keys 'feature_idx', 'thresh', 'left' and 'right'; a child is either
    another split node (dict) or a class label (leaf).
    """

    def __init__(self, max_depth, min_size):
        """
        max_depth: maximum number of split levels; the root is level 1
        min_size: a bucket with at most this many rows becomes a leaf
        """
        self.max_depth = max_depth
        self.min_size = min_size
        self.depth = 0  # deepest split level reached by the last fit()
        self.classes_dataset = None  # distinct labels seen by the last fit()
        self.tree = None  # root node dict, set by fit()

    def fit(self, dataset):
        """
        Build the tree from the training rows.

        dataset: rows of [feature_0, ..., feature_k, label]

        Raises ValueError if there is no row or no feature column.
        """
        if len(dataset) == 0 or len(dataset[0]) < 2:
            raise ValueError(
                "dataset must contain at least one row with one feature and a label"
            )

        self.classes_dataset = list(set(row[-1] for row in dataset))
        # Reset so that fitting the same instance twice gives the same tree.
        self.depth = 0

        # Every split produces two smaller datasets that are split recursively.
        root = self._get_split(dataset)
        self._split(root, 1)
        self.tree = root
        return

    def _split(self, node, depth=1):
        """
        Replace the 'buckets' of node by its 'left' and 'right' children.

        node: dict as returned by _get_split
        depth: level of node in the tree (root = 1). It is passed down per
               branch so that max_depth limits the depth of each branch and
               not the total number of splits.
        """
        self.depth = max(self.depth, depth)
        left, right = node['buckets']
        del node['buckets']

        # No real split happened: one side is empty, so both children become
        # the same leaf.
        if not left or not right:
            node['left'] = node['right'] = self._to_terminal(left + right)
            return
        if depth >= self.max_depth:
            node['left'], node['right'] = self._to_terminal(left), self._to_terminal(right)
            return

        node['left'] = self._grow_child(left, depth)
        node['right'] = self._grow_child(right, depth)

    def _grow_child(self, bucket, depth):
        """Return a leaf for a small bucket, otherwise a recursively split node."""
        if len(bucket) <= self.min_size:
            return self._to_terminal(bucket)
        child = self._get_split(bucket)
        self._split(child, depth + 1)
        return child

    def _get_split(self, dataset):
        """
        Find the split with the lowest weighted Gini loss.

        Every distinct value of every feature column is tried as threshold.
        Returns a dict with 'feature_idx', 'thresh' and 'buckets', where
        'buckets' is the (left, right) pair produced by the best split.
        """
        best_feat, best_thresh, best_buckets = None, None, None
        best_gini_loss = float('inf')
        unique_cls_values = list(set(row[-1] for row in dataset))

        for idx in range(len(dataset[0]) - 1):  # the last column is the label
            tried = set()
            for row in dataset:
                thresh = row[idx]
                # A repeated value yields the same buckets, so skip it.
                if thresh in tried:
                    continue
                tried.add(thresh)

                # The candidate must be scored inside this loop; scoring
                # after it would only ever evaluate the last row's value.
                buckets = self._create_split(idx, thresh, dataset)
                gini_loss = self._gini_loss(buckets, unique_cls_values)

                # Strict '<' keeps the first candidate when losses tie.
                if gini_loss < best_gini_loss:
                    best_gini_loss = gini_loss
                    best_feat = idx
                    best_thresh = thresh
                    best_buckets = buckets

        return {'feature_idx': best_feat, 'thresh': best_thresh, 'buckets': best_buckets}

    def _create_split(self, idx, thresh, dataset):
        """
        Partition dataset on one feature.

        idx: index of the feature column
        thresh: rows with row[idx] <= thresh go left, all others go right

        Returns (left_bucket, right_bucket), two lists of complete rows.
        """
        left_bucket, right_bucket = list(), list()
        for row in dataset:
            if row[idx] <= thresh:
                left_bucket.append(row)
            else:
                right_bucket.append(row)

        return left_bucket, right_bucket

    def _gini_loss(self, buckets, cls_values):
        """
        Size-weighted Gini impurity of a split.

        buckets: iterable of row lists (normally the left and right bucket)
        cls_values: class labels to consider

        Returns 0.0 for a perfectly pure split; empty buckets are ignored.
        """
        n_instances = float(sum(len(bucket) for bucket in buckets))

        gini = 0.0
        for bucket in buckets:
            size = float(len(bucket))
            if size == 0:
                continue
            # Extract the labels once instead of once per class.
            labels = [row[-1] for row in bucket]
            score = 0.0
            for cls_val in cls_values:
                ratio = labels.count(cls_val) / size
                score += ratio**2

            gini += (size / n_instances) * (1.0 - score)
        return gini

    def _to_leaf(self, bucket):
        """
        Count the rows of bucket per class.

        Returns {label: count}. The labels are those seen by fit(); before
        fit() only the labels present in bucket are used.
        """
        outcomes = [row[-1] for row in bucket]
        classes = self.classes_dataset if self.classes_dataset is not None else set(outcomes)
        return {cls: outcomes.count(cls) for cls in classes}

    def _to_terminal(self, bucket):
        """Return the most frequent label among the rows of bucket."""
        outcomes = [row[-1] for row in bucket]
        return max(set(outcomes), key=outcomes.count)

    def predict(self, X_test):
        """
        Predict one label per row.

        X_test: rows whose feature columns are in the same order as in fit()

        Returns a list of labels. Raises RuntimeError if fit() was not called.
        """
        if self.tree is None:
            raise RuntimeError("fit() must be called before predict()")
        return [self._make_predictions(self.tree, row) for row in X_test]

    def _make_predictions(self, node, row):
        """Walk from node down to a leaf for row and return the leaf label."""
        while isinstance(node, dict):
            # '<=' must mirror _create_split, otherwise rows that are equal
            # to a threshold end up in the opposite child than in training.
            if row[node['feature_idx']] <= node['thresh']:
                node = node['left']
            else:
                node = node['right']
        return node  # final prediction
        

In [148]:
# Instantiate the hand-written tree and fit it on vegis_train.
clf_tree = DecissionTree(max_depth=25, min_size=1)
clf_tree.fit(dataset=vegis_train)

In [113]:
# scikit-learn decision tree (presumably the library baseline for the
# hand-written DecissionTree -- confirm against the evaluation cells).
# random_state is pinned because scikit-learn permutes the features randomly
# at every split, so an unseeded tree can differ from run to run.
clf_tree_lib = DecisionTreeClassifier(criterion='gini', random_state=42)
clf_tree_lib.fit(vegis_train_lib_X, vegis_train_lib_y)
# NOTE(review): despite its name, y_train_pred holds the predictions for
# vegis_test_lib_X. The name is kept because other cells may read it.
y_train_pred = clf_tree_lib.predict(vegis_test_lib_X)