In [1]:
from sklearn import datasets
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Load the breast-cancer dataset bundle once; later cells read its
# `.data`, `.target` and `.feature_names` attributes.
data_breast_cancer = datasets.load_breast_cancer()

In [2]:
from sklearn.model_selection import train_test_split
# Restrict the problem to two columns of the feature matrix.
# NOTE(review): indices 1 and 8 are presumably "mean texture" and
# "mean symmetry" -- confirm against data_breast_cancer.feature_names.
feature_columns = [1, 8]
X = data_breast_cancer.data[:, feature_columns]
y = data_breast_cancer.target

# Hold out 20% of the rows for testing; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Three base learners, all left at their library defaults.
# NOTE(review): the decision tree has no random_state, so the scores printed
# below may differ slightly between runs -- confirm whether that is intended.
tree_clf, log_clf, knn_clf = DecisionTreeClassifier(), LogisticRegression(), KNeighborsClassifier()

# Both ensembles combine the same named estimators; only the voting rule differs.
named_estimators = [('treecl', tree_clf), ('logcl', log_clf), ('knncl', knn_clf)]
voting_clf_hard = VotingClassifier(estimators=list(named_estimators), voting='hard')
voting_clf_soft = VotingClassifier(estimators=list(named_estimators), voting='soft')

# Hard voting: fit, predict on the held-out split, report accuracy.
voting_clf_hard.fit(X_train, y_train)
hard_predictions = voting_clf_hard.predict(X_test)
accuracy_vclf_hard = accuracy_score(y_test, hard_predictions)
print(accuracy_vclf_hard)

# Soft voting: same procedure.
voting_clf_soft.fit(X_train, y_train)
soft_predictions = voting_clf_soft.predict(X_test)
accuracy_vclf_soft = accuracy_score(y_test, soft_predictions)
print(accuracy_vclf_soft)

0.6929824561403509
0.6666666666666666


In [4]:
import pickle

# Fit the three base learners directly so they can be scored on their own,
# alongside the two voting ensembles fitted in the previous cell.
for base_clf in (tree_clf, log_clf, knn_clf):
    base_clf.fit(X_train, y_train)

# One (train accuracy, test accuracy) pair per model, in the order:
# tree, logistic regression, k-NN, hard voting, soft voting.
first_pkl = [
    (clf.score(X_train, y_train), clf.score(X_test, y_test))
    for clf in (tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft)
]

print(first_pkl)

# Persist the accuracy pairs.
with open("acc_vote.pkl", "wb") as acc_file:
    pickle.dump(first_pkl, acc_file)

[(1.0, 0.631578947368421), (0.7230769230769231, 0.7017543859649122), (0.7714285714285715, 0.6403508771929824), (0.8351648351648352, 0.6929824561403509), (0.9648351648351648, 0.6666666666666666)]


In [5]:
# Persist the fitted models themselves: the three base learners followed by
# the hard- and soft-voting ensembles.
clfs = [tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft]
with open("vote.pkl", "wb") as model_file:
    pickle.dump(clfs, model_file)

In [6]:
from sklearn.ensemble import BaggingClassifier
def _fit_and_score(model):
    """Fit ``model`` on the current training split and return its accuracies.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as returned by ``model.score`` on
        ``X_train``/``y_train`` and ``X_test``/``y_test`` respectively.

    Notes
    -----
    The splits are read from the notebook's global namespace at call time, so
    the result depends on whichever ``X_train``/``X_test`` are bound when the
    helper is invoked.
    """
    model.fit(X_train, y_train)
    return model.score(X_train, y_train), model.score(X_test, y_test)


# Bagging: 30 decision trees, sampling with replacement, default sample size.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True, random_state=42)
train_accuracy, test_accuracy = _fit_and_score(bag_clf)
print(train_accuracy, test_accuracy)

# Bagging with max_samples=0.5.
bag_clf_half = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True, max_samples=0.5, random_state=42)
train_accuracy_h, test_accuracy_h = _fit_and_score(bag_clf_half)

# Pasting: same ensemble but sampling without replacement (bootstrap=False).
pas_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False, random_state=42)
train_accuracy_pas, test_accuracy_pas = _fit_and_score(pas_clf)

# Pasting with max_samples=0.5.
pas_clf_half = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False, max_samples=0.5, random_state=42)
train_accuracy_pas_h, test_accuracy_pas_h = _fit_and_score(pas_clf_half)

# Random forest with 30 trees.
rand_forest_clf = RandomForestClassifier(n_estimators=30, random_state=42)
forest_accuracy_train, forest_accuracy_test = _fit_and_score(rand_forest_clf)

# AdaBoost with 30 estimators.
ada_clf = AdaBoostClassifier(n_estimators=30, random_state=42)
ada_acc_train, ada_acc_test = _fit_and_score(ada_clf)

# Gradient boosting with 30 estimators.
grad_clf = GradientBoostingClassifier(n_estimators=30, random_state=42)
grad_acc_train, grad_acc_test = _fit_and_score(grad_clf)

# (train, test) accuracy pairs, in the same order as the models pickled below.
bagging_pickel = [
    (train_accuracy, test_accuracy),
    (train_accuracy_h, test_accuracy_h),
    (train_accuracy_pas, test_accuracy_pas),
    (train_accuracy_pas_h, test_accuracy_pas_h),
    (forest_accuracy_train, forest_accuracy_test),
    (ada_acc_train, ada_acc_test),
    (grad_acc_train, grad_acc_test),
]

with open("acc_bag.pkl", "wb") as file:
    pickle.dump(bagging_pickel, file)

with open("bag.pkl", "wb") as file:
    pickle.dump([bag_clf, bag_clf_half, pas_clf, pas_clf_half, rand_forest_clf, ada_clf, grad_clf], file)


0.9956043956043956 0.6754385964912281




In [7]:
# Switch from the two-column subset to the full feature matrix.
X_all = data_breast_cancer.data
y_all = data_breast_cancer.target

# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# two-feature split used by the earlier cells; every later cell sees the
# all-feature split.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Bagging ensemble of 30 default base estimators with max_features=2 and
# max_samples=0.5. random_state is fixed, like every other ensemble in this
# notebook, so the pickled accuracies and model are reproducible across runs.
bag_clf_random = BaggingClassifier(n_estimators=30, max_features=2, max_samples=0.5, random_state=42)

bag_clf_random.fit(X_train, y_train)
train_accuracy_rand = bag_clf_random.score(X_train, y_train)
test_accuracy_rand = bag_clf_random.score(X_test, y_test)

# Persist [train accuracy, test accuracy] for the ensemble.
with open("acc_fea.pkl", "wb") as file:
    pickle.dump([train_accuracy_rand, test_accuracy_rand], file)

# Persist the fitted ensemble, wrapped in a one-element list.
with open("fea.pkl", "wb") as file:
    pickle.dump([bag_clf_random], file)

In [8]:
import numpy as np
import pandas as pd

# Rank the individual sub-estimators of `bag_clf_random` by accuracy.
#
# Each sub-estimator was trained on its own subset of columns, recorded in
# `estimators_features_` at the matching position. It must therefore be scored
# on exactly those columns; scoring every estimator on one arbitrary random
# pair of columns feeds it features it never saw and yields meaningless,
# non-reproducible accuracies.
results = []
for estimator, feature_idx in zip(bag_clf_random.estimators_, bag_clf_random.estimators_features_):
    # Slice both splits down to the columns this estimator was trained on.
    train_accuracy = estimator.score(X_train[:, feature_idx], y_train)
    test_accuracy = estimator.score(X_test[:, feature_idx], y_test)

    # Human-readable names of those same columns.
    selected_features = [data_breast_cancer.feature_names[f] for f in feature_idx]

    results.append((train_accuracy, test_accuracy, selected_features))

df_results = pd.DataFrame(results,
                          columns=['Dokładność dla zbioru uczącego', 'Dokładność dla zbioru testującego', 'Wybrane cechy'])

# Best estimators first: sort by test accuracy, then by train accuracy.
df_results_sorted = df_results.sort_values(by=['Dokładność dla zbioru testującego', 'Dokładność dla zbioru uczącego'],
                                           ascending=False)

with open('acc_fea_rank.pkl', 'wb') as f:
    pickle.dump(df_results_sorted, f)