In [1]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [74]:
from sklearn.model_selection import train_test_split

X = data_breast_cancer.data[['mean texture', 'mean symmetry']]
y = data_breast_cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

clf_dt = DecisionTreeClassifier()
clf_lr = LogisticRegression()
clf_knn = KNeighborsClassifier()

from sklearn.ensemble import VotingClassifier

ensamble_hard = VotingClassifier(estimators=[
    ('dt', clf_dt),
    ('lr', clf_lr),
    ('knn', clf_knn)
], voting='hard')

ensaable_soft = VotingClassifier(estimators=[
    ('dt', clf_dt),
    ('lr', clf_lr),
    ('knn', clf_knn)
], voting='soft')


In [76]:
def train_and_evaluate(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    return train_acc, test_acc

In [77]:
results = []
vote = []
for model in [clf_dt, clf_lr, clf_knn, ensamble_hard, ensaable_soft]:
    train_acc, test_acc = train_and_evaluate(model, X_train, y_train, X_test, y_test)
    results.append((train_acc, test_acc))
    vote.append(model)

print(results)

[(1.0, 0.6929824561403509), (0.6901098901098901, 0.7543859649122807), (0.7582417582417582, 0.7192982456140351), (0.843956043956044, 0.7368421052631579), (0.9692307692307692, 0.7280701754385965)]


In [78]:
# zapis wynikow w picklu
import pickle
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(results, f)

with open('vote.pkl', 'wb') as f:
    pickle.dump(vote, f)



In [79]:
# 30 decision trees
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier
)

base_tree = DecisionTreeClassifier()

models = [
    BaggingClassifier(estimator=base_tree, n_estimators=30, bootstrap=True), 
    BaggingClassifier(estimator=base_tree, n_estimators=30, bootstrap=True, max_samples=0.5),
    BaggingClassifier(estimator=base_tree, n_estimators=30, bootstrap=False),
    BaggingClassifier(estimator=base_tree, n_estimators=30, bootstrap=False, max_samples=0.5), 
    RandomForestClassifier(n_estimators=30),
    AdaBoostClassifier(estimator=base_tree, n_estimators=30), 
    GradientBoostingClassifier(n_estimators=30) 
]


In [None]:
# train and evaluate models
bag_results = []
bag_classifiers = []

for model in models:
    train_acc, test_acc = train_and_evaluate(model, X_train, y_train, X_test, y_test)
    bag_results.append((train_acc, test_acc))
    bag_classifiers.append(model)

print(bag_results)

[(1.0, 0.7105263157894737), (0.9164835164835164, 0.7017543859649122), (1.0, 0.7017543859649122), (0.9604395604395605, 0.7105263157894737), (0.9978021978021978, 0.6842105263157895), (1.0, 0.7017543859649122), (0.7956043956043956, 0.7631578947368421)]




In [55]:
# save results
with open('acc_bag.pkl', 'wb') as f:
    pickle.dump(bag_results, f)

with open('bag.pkl', 'wb') as f:
    pickle.dump(bag_classifiers, f)

In [72]:
# 7 sampling with replacement
X = data_breast_cancer.data
y = data_breast_cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


fea_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=30,
    bootstrap=True,
    max_samples=0.5,
    bootstrap_features=False,
    max_features=2
)

train_acc, test_acc = train_and_evaluate(fea_model, X_train, y_train, X_test, y_test)

with open('acc_fea.pkl', 'wb') as f:
    pickle.dump([(train_acc, test_acc)], f)

with open('fea.pkl', 'wb') as f:
    pickle.dump([fea_model], f)


In [71]:
# 9 feature ranking
import pandas as pd
from sklearn.metrics import accuracy_score

estimators = fea_model.estimators_
features_per_estimator = fea_model.estimators_features_
feature_names = X_train.columns


# Compute accuracies for each estimator
results = []
for i, (estimator, feat_indices) in enumerate(zip(estimators, features_per_estimator)):
    # Get the selected feature names
    selected_features = list(feature_names[feat_indices])
    
    # Predict on train and test sets (using only selected features)
    X_train_subset = X_train.iloc[:, feat_indices].values
    X_test_subset = X_test.iloc[:, feat_indices].values
    
    y_train_pred = estimator.predict(X_train_subset)
    y_test_pred = estimator.predict(X_test_subset)
    
    # Compute accuracies
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    
    results.append({
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'selected_features': selected_features
    })



# Convert to DataFrame and sort
df_ranking = pd.DataFrame(results)
print(df_ranking)

df_ranking = df_ranking.sort_values(
    by=['test_accuracy', 'train_accuracy'],
    ascending=False
)


    train_accuracy  test_accuracy  \
0         0.802198       0.587719   
1         0.758242       0.631579   
2         0.839560       0.719298   
3         0.760440       0.675439   
4         0.749451       0.605263   
5         0.927473       0.947368   
6         0.898901       0.833333   
7         0.810989       0.666667   
8         0.815385       0.736842   
9         0.813187       0.657895   
10        0.835165       0.789474   
11        0.819780       0.789474   
12        0.945055       0.894737   
13        0.931868       0.912281   
14        0.749451       0.614035   
15        0.745055       0.614035   
16        0.736264       0.508772   
17        0.907692       0.903509   
18        0.907692       0.956140   
19        0.857143       0.763158   
20        0.756044       0.605263   
21        0.885714       0.859649   
22        0.918681       0.885965   
23        0.912088       0.885965   
24        0.887912       0.868421   
25        0.914286       0.868421   
2