# __WSI - ćwiczenie 4.__

### __Regresja i klasyfikacja__

#### __Treść ćwiczenia__

- Celem ćwiczenia jest implementacja drzew decyzyjnych tworzonych algorytmem _ID3_ z ograniczeniem maksymalnej głębokości drzewa.
- Następnie należy wykorzystać stworzony algorytm do stworzenia i zbadaniajakości klasyfikatorów dla zbioru danych breast cancer (https://archive.ics.uci.edu/ml/datasets/breast+cancer). Klasą jest poleirradiat.
- Należy pamiętać o podziale danych na zbiory trenujący, walidacyjny i testo-wy. Można użyć w tym celu gotowych funkcji.

In [481]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve, auc, RocCurveDisplay, PrecisionRecallDisplay, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import plotly.express as px
from math import log

RNG = np.random.default_rng()

#### __Ładowanie danych__

In [482]:

columns = ['class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 
           'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']
data = pd.read_csv('./data/breast-cancer.data', header=None, names=columns)

In [483]:
data.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [484]:
data.value_counts()

class                 age    menopause  tumor-size  inv-nodes  node-caps  deg-malig  breast  breast-quad  irradiat
no-recurrence-events  60-69  ge40       15-19       0-2        no         2          right   left_up      no          2
                      40-49  premeno    10-14       0-2        no         1          right   left_up      no          2
                      60-69  ge40       20-24       0-2        no         1          left    left_low     no          2
                      50-59  premeno    25-29       0-2        no         2          left    left_low     no          2
                      40-49  premeno    20-24       0-2        no         2          right   left_up      no          2
                                                                                                                     ..
                      50-59  ge40       20-24       3-5        yes        2          right   left_up      no          1
                                        25-29

In [485]:
data['node-caps'].value_counts()

no     222
yes     56
?        8
Name: node-caps, dtype: int64

#### __Usuwanie nieznanych wartości__

In [486]:
data.drop(data.loc[data['node-caps'] == '?'].index, inplace=True)

#### __Podział na zbiór trenujący i testowy__

In [487]:
RANDOM_STATE=65
PRED_LABEL = 'irradiat'
x, y = data.drop(PRED_LABEL, axis=1), data[PRED_LABEL]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE)
train = pd.concat([x_train, y_train], axis=1)
test = pd.concat([x_test, y_test], axis=1)

#### __Prosty model testowy__

In [488]:
class PrimitiveModel():
    def __init__(self) -> None:
        pass

    def fit(self, x, y):
        pass

    def predict(self, x):
        return ['no' for _ in range(len(x))]

In [489]:
model = PrimitiveModel()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(acc)

0.8214285714285714


#### __Implementacja klasyfikatora _ID3___

In [490]:
class Node():
    def __init__(self, label = None, branches: dict = None, value=None) -> None:
        self.label = label
        self.value = value
        self.branches = branches

    def __repr__(self, level=0, path=''):
        output = "\t" * level
        if path:
            output += str(path) + ' -> '
        if self.value is not None:
            output += str(self.value)
        else:
            output += str(self.label)
        output += "\n"
        if self.branches:
            for path, node in self.branches.items():
                output += node.__repr__(level+1, path)
        return output

class ID3():
    def __init__(self) -> None:
        self._root = None

    def __repr__(self):
        return self._root.__repr__()

    def fit(self, data, y_column_name):

        def split(data, target_feature):
            values = data[target_feature].unique().tolist()
            return {feature_value:data[data[target_feature] == feature_value] for feature_value in values}

        def entropy(data, y_column_name):
                return -sum([c/data.shape[0]*log(c/data.shape[0]) for c in data[y_column_name].value_counts().to_list()])
        
        def weighted_entropy(data, y_column_name, target_feature):
                return sum([subset.shape[0]/data.shape[0]*entropy(subset, y_column_name) for subset in split(data, target_feature).values()])

        def inf_gain(data, y_column_name, target_feature):
            return entropy(data, y_column_name) - weighted_entropy(data, y_column_name, target_feature)

        def recurr(data: pd.DataFrame, y_column_name: str, features: set):
            """
            overview

            Args:
                data:
                y_column_name:
                features:

            Returns:
                root

            Raises:
                None
            """

            # check end conditions

            # if all instances represent the same class,
            # return leaf with the class label
            if (data[y_column_name] == data[y_column_name].iat[0]).all():
                return Node(value=data[y_column_name].iat[0])
            # if no features are left to split on,
            # return leaf with most frequently occuring class
            if not features:
                return Node(value=data[y_column_name].value_counts().idxmax())

            # calculate information gain and choose best feature
            target_feature = max(features, key=lambda f: inf_gain(data, y_column_name, f))
            # split the branch
            data_split = split(data, target_feature).items()
            features.remove(target_feature)
            return Node(target_feature, branches={feature_value:recurr(branch, y_column_name, features) for feature_value, branch in data_split})

        # initiate feature set
        features = set(data.columns.tolist())
        features.remove(y_column_name)
        self._root = recurr(data, y_column_name, features)

    def predict(self):
        pass

In [491]:
tmp = pd.DataFrame(data=[['A', 1, 0],
['B', 1, 1],
['B', 2, 1],
['B', 2, 0],
['B', 3, 1]], columns=['x1','x2', 'y'])

model = ID3()
model.fit(tmp, 'y')
print(repr(model))


x1
	A -> 0
	B -> x2
		1 -> 1
		2 -> 1
		3 -> 1



In [492]:
model = ID3()
model.fit(test, 'irradiat')
print(repr(model))

tumor-size
	20-24 -> inv-nodes
		3-5 -> age
			30-39 -> yes
			50-59 -> no
			40-49 -> yes
			60-69 -> yes
		0-2 -> no
	15-19 -> deg-malig
		2 -> node-caps
			no -> no
			yes -> yes
		1 -> no
		3 -> menopause
			premeno -> breast
				right -> no
				left -> yes
			ge40 -> yes
	10-14 -> no
	25-29 -> breast-quad
		left_up -> class
			no-recurrence-events -> no
			recurrence-events -> no
		left_low -> no
		right_up -> no
	5-9 -> no
	30-34 -> no
	40-44 -> no
	45-49 -> yes
	0-4 -> no
	35-39 -> no

