## Wczytanie i przetworzenie zbiorów danych

In [64]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas_profiling
import numpy as np

In [2]:
nazwy_kolumn = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
dane = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data', names = nazwy_kolumn)

In [3]:
pandas_profiling.ProfileReport(dane);

In [4]:
for i in nazwy_kolumn:
    dane[i]=dane[i].replace('?', -1)

In [5]:
X = dane.drop('A16', axis = 1)
y = dane['A16']

y = y.factorize()[0]
X['A1'] = X['A1'].factorize()[0]
X['A4'] = X['A4'].factorize()[0]
X['A5'] = X['A5'].factorize()[0]
X['A6'] = X['A6'].factorize()[0]
X['A7'] = X['A7'].factorize()[0]
X['A9'] = X['A9'].factorize()[0]
X['A10'] = X['A10'].factorize()[0]
X['A12'] = X['A12'].factorize()[0]
X['A13'] = X['A13'].factorize()[0]
X['A14'] = X['A14'].factorize()[0]

In [43]:
X.head();

## Trenowanie drzewa na pełnym zbiorze danych

In [45]:
drzewo = DecisionTreeClassifier()
drzewo.fit(X=X, y=y);

In [8]:
drzewo.score(X=X, y=y)

1.0

## Trenowanie drzewa na zbiorach uczącym i testowym

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

In [39]:
drzewo = DecisionTreeClassifier()
drzewo.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [40]:
drzewo.score(X=X_test, y=y_test)

0.7971014492753623

## Wybór najlepszych hiperparametrów drzewa

In [41]:
wyniki = pd.DataFrame()
najlepszy = 0

for max_depth in range (2,15):
    print(max_depth)
    for min_samples_split in range(2,15):
        for min_samples_leaf in range(1,15):
            for criterion in ['entropy', 'gini']:
                drzewo = DecisionTreeClassifier(max_depth = max_depth, min_samples_split=min_samples_split,
                                               min_samples_leaf=min_samples_leaf, criterion = criterion)
                drzewo.fit(X=X_train, y=y_train)
                wynik = drzewo.score(X=X_test, y=y_test)
                wyniki = wyniki.append({'wynik': wynik, 
                                        'max depth': max_depth, 
                                        'min_samples_split': min_samples_split,
                                       'min_samples_leaf': min_samples_leaf, 
                                        'criterion': criterion}, ignore_index=True)
                if najlepszy < wynik:
                    najlepszy = wynik
                    hiperparametry = {'wynik': wynik, 
                                        'max depth': max_depth, 
                                        'min_samples_split': min_samples_split,
                                       'min_samples_leaf': min_samples_leaf, 
                                        'criterion': criterion}
print(najlepszy)
print(hiperparametry)


2
3
4
5
6
7
8
9
10
11
12
13
14
0.8623188405797102
{'wynik': 0.8623188405797102, 'max depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 5, 'criterion': 'gini'}


## Ocena stabilności wyników drzewa i porównanie z DummyClassifier

In [49]:
wyniki = pd.DataFrame()

for random_state in range(0,20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=random_state)
    drzewo = DecisionTreeClassifier(max_depth=6, min_samples_split=3, min_samples_leaf=5, criterion='gini', random_state=1)
    drzewo.fit(X=X_train, y=y_train)
    wynik_drzewo = drzewo.score(X=X_test, y=y_test)
    
    dummy = DummyClassifier(strategy='stratified', random_state=1)
    dummy.fit(X=X_train, y=y_train)
    wynik_dummy = dummy.score(X=X_test, y=y_test)
    
    wyniki = wyniki.append({'random_state': random_state, 'wynik_drzewo': wynik_drzewo, 'wynik_dummy': wynik_dummy}, 
                           ignore_index=True)
    

print(wyniki)

    random_state  wynik_drzewo  wynik_dummy
0            0.0      0.840580     0.485507
1            1.0      0.847826     0.442029
2            2.0      0.855072     0.500000
3            3.0      0.818841     0.485507
4            4.0      0.789855     0.543478
5            5.0      0.905797     0.500000
6            6.0      0.869565     0.528986
7            7.0      0.847826     0.543478
8            8.0      0.818841     0.514493
9            9.0      0.833333     0.586957
10          10.0      0.818841     0.500000
11          11.0      0.826087     0.500000
12          12.0      0.847826     0.471014
13          13.0      0.869565     0.500000
14          14.0      0.862319     0.485507
15          15.0      0.913043     0.572464
16          16.0      0.818841     0.500000
17          17.0      0.862319     0.557971
18          18.0      0.891304     0.572464
19          19.0      0.833333     0.500000


In [62]:
pandas_profiling.ProfileReport(wyniki);

## Trenowanie lasu losowego

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

a = int(np.round(np.round((np.sqrt(len(X.columns))))))

las = RandomForestClassifier(max_depth=6, min_samples_split=3, min_samples_leaf=5, criterion='gini', random_state=1,
                            bootstrap=True, max_features=a, n_estimators=100)

las.fit(X=X_train, y=y_train)
wynik = las.score(X=X_test, y=y_test)

wynik

0.8840579710144928