In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training CSV and drop three of its columns in one chained step.
vegis_df = (
    pd.read_csv("G:/Meine Ablage/KI_Projekt/Daten/vegis_traindata.csv")
    .drop(columns=["Unnamed: 0", "green pixels", "Image"])
)
# Show the first rows followed by the (rows, columns) shape.
print(vegis_df.head(), vegis_df.shape, sep="\n")

   Hight  Width    R    G   B  green onion pixels  ratio area  circularity  \
0    174    111  121   98  76                   0    0.747411     0.740759   
1     97     65  109   84  50                   0    0.785250     0.812063   
2    139    124  144  118  84                   0    0.826178     0.791622   
3     90     61  108   80  44                   0    0.782149     0.816098   
4     73     50  110   83  46                   0    0.764247     0.810826   

   keypoints  mean keypoints      Label  
0         39       12.664070  Kartoffel  
1         18        7.130874  Kartoffel  
2         36        5.622304  Kartoffel  
3         14       10.524947  Kartoffel  
4         11        6.394969  Kartoffel  
(2137, 11)


In [3]:
class DecissionTree():
    """Gini-impurity decision tree classifier (work in progress).

    Only the search for the single best split of the full data set is
    implemented: ``fit`` finds that (feature, threshold) pair and prints it.

    Parameters
    ----------
    max_depth : int
        Maximum tree depth. Stored on the instance; the code in this class
        does not use it yet.
    """

    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.features = None  # column labels of X, filled in by fit()

    def fit(self, X, y):
        """Find the best first split of ``X`` with respect to ``y`` and print it.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature table, one column per feature. Features must be numeric
            and free of NaN (NaN values are not handled by the split search).
        y : pandas.Series or sequence
            Class label of every row of ``X``.
        """
        self.features = X.columns

        best_feat, best_thresh = self._find_best_split(X, y)
        print(f"best feat: {best_feat} And best_thres: {best_thresh}")

    def _loss_gini_score(self, left_cup, right_cup, gini_left, gini_right):
        """Return the Gini impurity of a split, weighted by bucket size.

        ``left_cup`` / ``right_cup`` are per-class sample counts of the two
        buckets, ``gini_left`` / ``gini_right`` their impurities. Raises
        ZeroDivisionError if both buckets are empty.
        """
        n_left, n_right = sum(left_cup), sum(right_cup)
        return (n_left * gini_left + n_right * gini_right) / (n_left + n_right)

    def _gini(self, cls, unique_cls_in_subset):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of one bucket.

        Parameters
        ----------
        cls : list of int
            Number of samples per class, indexed by class code.
        unique_cls_in_subset : iterable of int
            Class codes to include in the sum.

        An empty bucket is treated as pure and yields 0.0 instead of dividing
        by zero.
        """
        n_samples = sum(cls)  # hoisted: identical for every class
        if n_samples == 0:
            return 0.0
        return 1.0 - sum((cls[k] / n_samples) ** 2 for k in unique_cls_in_subset)

    def _get_num_classes(self, y):
        """Encode the labels as integer codes and count the samples per code.

        Returns
        -------
        n_classes : list of int
            ``n_classes[k]`` is the number of samples whose code is ``k``.
        y_encoded : list of int
            Category code of every label, in the order of ``y``.

        Raises
        ------
        ValueError
            If ``y`` contains missing labels (they have no category code).
        """
        categorical = pd.Series(y, dtype="category").cat
        codes = categorical.codes.to_numpy()
        if (codes < 0).any():
            raise ValueError("y contains missing labels; cannot encode them as classes")

        # Count per *code* so that n_classes[k] lines up with the codes used
        # to index the buckets. Iterating over set(y) gives no such guarantee.
        n_classes = np.bincount(codes, minlength=len(categorical.categories)).tolist()
        return n_classes, codes.tolist()

    def _find_best_split(self, X, y):
        """Scan every feature for the threshold with the lowest weighted Gini.

        For each feature the samples are sorted by value and moved one by one
        from the right bucket to the left bucket; a candidate threshold is the
        midpoint between two neighbouring, distinct feature values.

        Returns
        -------
        (best_feat, best_thresh)
            Positional index of the best feature and its threshold, or
            ``(None, None)`` when no split improves on the unsplit impurity.
        """
        best_feat, best_thresh = None, None
        n_classes, y_encoded = self._get_num_classes(y)
        class_ids = range(len(n_classes))  # loop invariant, built once
        n_samples = len(y_encoded)
        best_gini = self._gini(n_classes, class_ids)
        best_gini_left, best_gini_right = None, None

        for idx in range(X.shape[1]):
            # thres: sorted feature values, cls: class codes in the same order
            thres, cls = self._get_thresholds_and_classes(X.iloc[:, idx], y_encoded)

            left_cup = [0] * len(n_classes)  # nothing on the left yet
            right_cup = n_classes.copy()     # everything starts on the right

            for i in range(1, n_samples):
                # Samples 0..i-1 are now left, samples i.. are right.
                c = cls[i - 1]
                left_cup[c] += 1
                right_cup[c] -= 1

                # Equal neighbours cannot be separated by a threshold.
                if thres[i] == thres[i - 1]:
                    continue

                gini_left = self._gini(left_cup, class_ids)
                gini_right = self._gini(right_cup, class_ids)
                new_gini_loss = self._loss_gini_score(left_cup, right_cup, gini_left, gini_right)

                if new_gini_loss < best_gini:
                    best_gini = new_gini_loss
                    best_gini_left, best_gini_right = gini_left, gini_right
                    best_feat = idx
                    best_thresh = (thres[i] + thres[i - 1]) / 2

        # Report the impurities of the chosen split (None if none was found).
        print("Best gini score: ", best_gini)
        print("Gini bucket left: ,", best_gini_left)
        print("Gini bucket right: ,", best_gini_right)
        return best_feat, best_thresh

    def _get_thresholds_and_classes(self, feat_col, y_encoded):
        """Return the feature values sorted ascending with their class codes.

        Parameters
        ----------
        feat_col : pandas.Series or array-like
            Values of one feature.
        y_encoded : sequence of int
            Class code of every sample, aligned positionally with ``feat_col``.

        Returns
        -------
        (values, codes) : tuple of list
            Both lists are ordered by ascending feature value.

        Raises
        ------
        ValueError
            If the two inputs differ in length.
        """
        values = np.asarray(feat_col)
        codes = np.asarray(y_encoded)
        if len(values) != len(codes):
            raise ValueError("feat_col and y_encoded must have the same length")

        # Sorting by position avoids a temporary DataFrame and any clash
        # between the feature name and a helper column name.
        order = np.argsort(values, kind="stable")
        return values[order].tolist(), codes[order].tolist()

clf = DecissionTree(max_depth=10)

In [4]:
# Positional split: columns 0-9 become the features, column 10 the target.
X, y = vegis_df.iloc[:, :10], vegis_df.iloc[:, 10]

In [5]:
# Run the best-split search on the full feature table and its labels.
clf.fit(X, y)

KeyboardInterrupt: 

In [162]:
# Prints 1 through 4: the upper bound passed to range() is exclusive.
for c in range(1,5):
    print(c)

1
2
3
4


In [56]:
# Number of samples per label.
y.value_counts()

Label
Kartoffel          496
Zwiebel            445
Karotte            401
Karotte_Trieb      357
Kartoffel_Trieb    245
Zwiebel_Trieb      193
Name: count, dtype: int64

In [57]:
# Peek at the first three labels.
y[:3]

0    Kartoffel
1    Kartoffel
2    Kartoffel
Name: Label, dtype: object

In [98]:
# Encode the labels as integer category codes and look up the code at position 760.
y_en = list(pd.Series(y, dtype="category").cat.codes)
y_en[760]


0

In [100]:
# Raw label at index 760, for comparison with its category code.
y[760]

'Karotte'

In [99]:
# Distinct labels occurring in y.
f = set(y)
f

{'Karotte',
 'Karotte_Trieb',
 'Kartoffel',
 'Kartoffel_Trieb',
 'Zwiebel',
 'Zwiebel_Trieb'}

In [68]:
# Integer category code of every label, as a plain list.
# NOTE(review): the bare `o` below displays the whole list; a slice such as
# o[:10] would keep the cell output short.
o = list(pd.Series(y, dtype="category").cat.codes)
o

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [None]:
# Scratch cell (never executed): first steps of a best-split search.
# NOTE(review): `data` is not defined in any cell visible here; it is indexed
# as data[:, idx], so it is presumably a 2-D NumPy array - confirm.
# Number of features = number of columns of `data` (two, Humidity and Wind,
# in the example this note was written for).
n_features = len(data.T)

# Samples per class (the original note gives [4,6] for its example).
# NOTE(review): the counts follow set(y) iteration order, while the buckets
# below are sized from category codes; the two orders need not agree - verify.
num_row_byclass = [np.sum(y==c) for c in list(set(y))]
print('num_row_byclass ',num_row_byclass)

# Placeholders for the best (feature, threshold) pair.
best_attribute, best_thr = None, None

# Gini impurity of the root node, i.e. before any split.
best_gini = 1.0 - sum((n / len(y)) ** 2 for n in num_row_byclass)
print('best_gini ',best_gini)

for idx in range(n_features):

  # Rebinds the global `y` to its integer category codes; later passes
  # re-encode the already encoded list.
  y = list(pd.Series(y, dtype="category").cat.codes.values)
  # Sort the (feature value, class code) pairs by feature value.
  thresholds, classes = zip(*sorted(zip(data[:, idx], y)))
  print('thresholds ',thresholds)

  # Left bucket starts empty, right bucket starts with every sample.
  left_bucket = [0] * len(list(set(y)))
  right_bucket = num_row_byclass.copy()

  print('left_bucket ',left_bucket)
  print('right_bucket ',right_bucket)
  print('m ',sum(num_row_byclass))

In [51]:
# Distinct category codes occurring in o.
s1 = set(o)
s1

{0, 1, 2, 3, 4, 5}

In [118]:
# Small check that sort_values reorders whole rows and renumbers the index.
val = dict(a=[3, 1, 5], b=np.array([8, 7, 99]))
df = pd.DataFrame(val)
sort_df = df.sort_values(by='a', ignore_index=True)
# Unsorted frame first, sorted frame second.
print(df.head(), sort_df.head(), sep="\n")

   a   b
0  3   8
1  1   7
2  5  99
   a   b
0  1   7
1  3   8
2  5  99


In [126]:
# Rebuild a one-column frame from the first column of sort_df, keyed by that
# column's own label, then read the values back as a plain Python list.
df2 = pd.DataFrame({sort_df.columns[0]: sort_df.iloc[:, 0]})
a = df2[df2.columns[0]].to_list()
a

[1, 3, 5]

In [6]:
def do():
    """Return two fixed lists; the caller receives them packed in one tuple."""
    return [2, 4], [3, 4, 5, 5]

# A single name bound to a multi-value return holds the whole tuple.
u = do()
print(u)

([2, 4], [3, 4, 5, 5])
