# Lab6 Metody zespolowe

In [1]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

### 1. Podziel zbiór data_breast_cancer na uczący i testujący

In [2]:
import pandas as pd

X_cancer = data_breast_cancer.data[["mean texture", "mean symmetry"]].copy()
y_cancer = data_breast_cancer.target

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, 
                                                    test_size=0.20, random_state=42)

### 2. Zbuduj ensemble używając klasyfikatorów binarnych

* drzewa decyzyjne

In [4]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier()

* regresja logistyczna

In [5]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()

* KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()

* zespol glosujacy typu *hard* oraz *soft*

In [7]:
from sklearn.ensemble import VotingClassifier

In [8]:
hardvoting_clf = VotingClassifier(
    estimators=[('tree', tree_clf),
                ('log', log_clf),
                ('knn', knn_clf)],
    voting='hard')

In [9]:
softvoting_clf = VotingClassifier(
    estimators=[('tree', tree_clf),
                ('log', log_clf),
                ('knn', knn_clf)],
    voting='soft')

### 3. Porównaj dokładność ww. klasyfikatorów

In [10]:
from sklearn.metrics import accuracy_score

acc_vote = []

for clf in (tree_clf, log_clf, knn_clf, hardvoting_clf, softvoting_clf):
    clf.fit(X_train, y_train)
    
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    
    acc_vote.append((accuracy_score(y_train, y_pred_train), 
                    accuracy_score(y_test, y_pred_test)))
    
    print(clf.__class__.__name__, "-",
          "train:", accuracy_score(y_train, y_pred_train),
          "test:", accuracy_score(y_test, y_pred_test))

DecisionTreeClassifier - train: 1.0 test: 0.631578947368421
LogisticRegression - train: 0.7230769230769231 test: 0.7017543859649122
KNeighborsClassifier - train: 0.7714285714285715 test: 0.6403508771929824
VotingClassifier - train: 0.8351648351648352 test: 0.6929824561403509
VotingClassifier - train: 0.9648351648351648 test: 0.6666666666666666


### 4. Zapisywanie do pliku rezultatow

In [11]:
import pickle
with open('acc_vote.pkl','wb') as f: pickle.dump(acc_vote, f)

#sprawdzenie zawartosci pliku
with open('acc_vote.pkl','rb') as f: print(pickle.load(f))

[(1.0, 0.631578947368421), (0.7230769230769231, 0.7017543859649122), (0.7714285714285715, 0.6403508771929824), (0.8351648351648352, 0.6929824561403509), (0.9648351648351648, 0.6666666666666666)]


In [12]:
vote = [tree_clf, log_clf, knn_clf, hardvoting_clf, softvoting_clf]
with open('vote.pkl','wb') as f: pickle.dump(vote, f)

#sprawdzenie zawartosci pliku
with open('vote.pkl','rb') as f: print(pickle.load(f))

[DecisionTreeClassifier(), LogisticRegression(), KNeighborsClassifier(), VotingClassifier(estimators=[('tree', DecisionTreeClassifier()),
                             ('log', LogisticRegression()),
                             ('knn', KNeighborsClassifier())]), VotingClassifier(estimators=[('tree', DecisionTreeClassifier()),
                             ('log', LogisticRegression()),
                             ('knn', KNeighborsClassifier())],
                 voting='soft')]


### 5. Wykonaj: Bagging, Bagging 50%, Pasting, Pasting 50%, random Forest, AdaBoost, Gradient Boosting
-> wykorzystujac 30 drzew decyzyjnych

* Bagging

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators=30, max_samples=1.0, bootstrap=True)

* Bagging 50% 

In [14]:
bag_h_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators=30, max_samples=0.5, bootstrap=True)

* Pasting

In [15]:
pas_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators=30, max_samples=1.0, bootstrap=False)

* Pasting 50%

In [16]:
pas_h_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators=30, max_samples=0.5, bootstrap=False)

* Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=30)

* AdaBoost

In [18]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(n_estimators=30)

* Gradient Boosting

In [19]:
from sklearn.ensemble import GradientBoostingClassifier
grb_clf = GradientBoostingClassifier(n_estimators=30)

### 6. Oblicz dokladnosci

In [20]:
from sklearn.metrics import accuracy_score

bag = [bag_clf, bag_h_clf, pas_clf, pas_h_clf, rnd_clf, ada_clf, grb_clf]
acc_bag = []

for clf in bag:
    clf.fit(X_train, y_train)
    
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    
    acc_bag.append((accuracy_score(y_train, y_pred_train), 
                    accuracy_score(y_test, y_pred_test)))
    
    print(clf.__class__.__name__, "-",
          "train:", accuracy_score(y_train, y_pred_train),
          "test:", accuracy_score(y_test, y_pred_test))

BaggingClassifier - train: 0.9978021978021978 test: 0.6491228070175439
BaggingClassifier - train: 0.9208791208791208 test: 0.7017543859649122
BaggingClassifier - train: 1.0 test: 0.6228070175438597
BaggingClassifier - train: 0.9648351648351648 test: 0.6842105263157895
RandomForestClassifier - train: 0.9956043956043956 test: 0.6666666666666666
AdaBoostClassifier - train: 0.8 test: 0.7368421052631579
GradientBoostingClassifier - train: 0.8373626373626374 test: 0.7105263157894737


* zapisz do pliku

In [21]:
with open('acc_bag.pkl','wb') as f: pickle.dump(acc_bag, f)

#sprawdzenie zawartosci pliku
with open('acc_bag.pkl','rb') as f: print(pickle.load(f))

[(0.9978021978021978, 0.6491228070175439), (0.9208791208791208, 0.7017543859649122), (1.0, 0.6228070175438597), (0.9648351648351648, 0.6842105263157895), (0.9956043956043956, 0.6666666666666666), (0.8, 0.7368421052631579), (0.8373626373626374, 0.7105263157894737)]


In [22]:
with open('bag.pkl','wb') as f: pickle.dump(bag, f)

#sprawdzenie zawartosci pliku
with open('bag.pkl','rb') as f: print(pickle.load(f))

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


### 7. Sampling 2 cech bez powtórzeń (30 drzew) -> połowę instancji dla każdego z drzew z powtórzeniami.

In [23]:
fea = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators = 30,
    bootstrap = True, bootstrap_features = False,
    max_samples = 0.5, max_features = 2)

In [24]:
X_cancer_all = data_breast_cancer.data.copy()
y_cancer_all = data_breast_cancer.target

X_train, X_test, y_train, y_test = train_test_split(X_cancer_all, y_cancer_all, 
                                                    test_size=0.20, random_state=42)

In [25]:
fea.fit(X_train, y_train)
    
y_pred_train = fea.predict(X_train)
y_pred_test = fea.predict(X_test)
    
acc_fea = [accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)]
    
print(fea.__class__.__name__, "-",
      "train:", accuracy_score(y_train, y_pred_train),
      "test:", accuracy_score(y_test, y_pred_test))

BaggingClassifier - train: 0.9912087912087912 test: 0.9649122807017544


* zapisz do pliku

In [26]:
with open('acc_fea.pkl','wb') as f: pickle.dump(acc_fea, f)

#sprawdzenie zawartosci pliku
with open('acc_fea.pkl','rb') as f: print(pickle.load(f))

[0.9912087912087912, 0.9649122807017544]


In [27]:
with open('fea.pkl','wb') as f: pickle.dump(fea, f)

#sprawdzenie zawartosci pliku
with open('fea.pkl','rb') as f: print(pickle.load(f))

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=2,
                  max_samples=0.5, n_estimators=30)


### 9. które cechy dają najwięszą dokładność

In [28]:
train_acc = []
test_acc = []
features = []

for estimator, est_features in zip(fea.estimators_, fea.estimators_features_):
    
    X_cancer = data_breast_cancer.data.iloc[:, est_features].copy()
    y_cancer = data_breast_cancer.target
    
    X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, 
                                                    test_size=0.20, random_state=42)
    
    y_pred_train = estimator.predict(X_train)
    y_pred_test = estimator.predict(X_test)
    
    train_acc.append(accuracy_score(y_train, y_pred_train))
    test_acc.append(accuracy_score(y_test, y_pred_test))
    features.append([data_breast_cancer.data.columns[est_features[0]],data_breast_cancer.data.columns[est_features[1]]])

In [29]:
data = {'train_acc': train_acc, 'test_acc': test_acc, 'features': features}  
acc_fea_rank = pd.DataFrame(data)  
print(acc_fea_rank)  

    train_acc  test_acc                                     features
0    0.828571  0.675439     [concave points error, mean compactness]
1    0.898901  0.859649    [worst fractal dimension, mean concavity]
2    0.791209  0.745614    [concave points error, compactness error]
3    0.892308  0.842105                     [mean area, mean radius]
4    0.791209  0.605263       [mean texture, mean fractal dimension]
5    0.929670  0.921053                  [mean radius, worst radius]
6    0.907692  0.921053                  [mean texture, mean radius]
7    0.841758  0.771930               [symmetry error, radius error]
8    0.901099  0.894737                 [radius error, worst radius]
9    0.914286  0.894737               [mean texture, mean perimeter]
10   0.780220  0.631579   [mean smoothness, worst fractal dimension]
11   0.824176  0.657895                [radius error, texture error]
12   0.918681  0.921053             [worst radius, smoothness error]
13   0.800000  0.736842        [co

In [30]:
#posortuj
acc_fea_rank = acc_fea_rank.sort_values(by=['train_acc', 'test_acc'], ascending=False)
print(acc_fea_rank)

    train_acc  test_acc                                     features
25   0.940659  0.938596                  [worst area, mean symmetry]
5    0.929670  0.921053                  [mean radius, worst radius]
29   0.927473  0.929825             [mean area, mean concave points]
23   0.925275  0.938596                  [worst area, texture error]
27   0.920879  0.912281                 [symmetry error, worst area]
12   0.918681  0.921053             [worst radius, smoothness error]
9    0.914286  0.894737               [mean texture, mean perimeter]
21   0.912088  0.877193              [mean perimeter, mean symmetry]
6    0.907692  0.921053                  [mean texture, mean radius]
22   0.907692  0.824561              [worst perimeter, worst radius]
8    0.901099  0.894737                 [radius error, worst radius]
18   0.901099  0.789474    [mean perimeter, fractal dimension error]
19   0.898901  0.877193          [mean perimeter, compactness error]
1    0.898901  0.859649    [worst 

* zapisz plik

In [31]:
with open('acc_fea_rank.pkl','wb') as f: pickle.dump(acc_fea_rank, f)

#sprawdzenie zawartosci pliku
with open('acc_fea_rank.pkl','rb') as f: print(pickle.load(f))

    train_acc  test_acc                                     features
25   0.940659  0.938596                  [worst area, mean symmetry]
5    0.929670  0.921053                  [mean radius, worst radius]
29   0.927473  0.929825             [mean area, mean concave points]
23   0.925275  0.938596                  [worst area, texture error]
27   0.920879  0.912281                 [symmetry error, worst area]
12   0.918681  0.921053             [worst radius, smoothness error]
9    0.914286  0.894737               [mean texture, mean perimeter]
21   0.912088  0.877193              [mean perimeter, mean symmetry]
6    0.907692  0.921053                  [mean texture, mean radius]
22   0.907692  0.824561              [worst perimeter, worst radius]
8    0.901099  0.894737                 [radius error, worst radius]
18   0.901099  0.789474    [mean perimeter, fractal dimension error]
19   0.898901  0.877193          [mean perimeter, compactness error]
1    0.898901  0.859649    [worst 