In [1]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [2]:
X = data_breast_cancer["data"]
y = data_breast_cancer["target"]

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier()

In [5]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()

In [6]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()

In [7]:
from sklearn.ensemble import VotingClassifier
voting_clf_hard = VotingClassifier(estimators=[('tr', tree_clf), ('lr', log_clf), ('knn', knn_clf)], voting='hard')
voting_clf_soft = VotingClassifier(estimators=[('tr', tree_clf), ('lr', log_clf), ('knn', knn_clf)], voting='soft')

In [8]:
X_train_bc = X_train[["mean texture", "mean symmetry"]]
X_test_bc = X_test[["mean texture", "mean symmetry"]]

In [9]:
acc_vote = []

In [10]:
from sklearn.metrics import accuracy_score
for clf in (tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft):
    clf.fit(X_train_bc, y_train)
    y_pred_train = clf.predict(X_train_bc)
    y_pred_test = clf.predict(X_test_bc)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_vote.append((acc_train, acc_test))

In [11]:
clfs_vote = [tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft]
clfs_vote

[DecisionTreeClassifier(),
 LogisticRegression(),
 KNeighborsClassifier(),
 VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                              ('lr', LogisticRegression()),
                              ('knn', KNeighborsClassifier())]),
 VotingClassifier(estimators=[('tr', DecisionTreeClassifier()),
                              ('lr', LogisticRegression()),
                              ('knn', KNeighborsClassifier())],
                  voting='soft')]

In [12]:
acc_vote

[(1.0, 0.6491228070175439),
 (0.7252747252747253, 0.7105263157894737),
 (0.7692307692307693, 0.6666666666666666),
 (0.8483516483516483, 0.6403508771929824),
 (0.9736263736263736, 0.6228070175438597)]

In [13]:
import pickle
with open('acc_vote.pkl', 'wb') as fp:
    pickle.dump(acc_vote, fp)

In [14]:
with open('vote.pkl', 'wb') as fp:
    pickle.dump(clfs_vote, fp)

In [15]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True)
bag_clf_05 = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, n_estimators=30, bootstrap=True)
past_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False)
past_clf_05 = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, n_estimators=30, bootstrap=False)

In [16]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=30)

In [17]:
from sklearn.ensemble import AdaBoostClassifier
ab_clf = AdaBoostClassifier(n_estimators=30)

In [18]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=30)

In [19]:
acc_bag = []

In [20]:
for clf in (bag_clf, bag_clf_05, past_clf, past_clf_05, rf_clf, ab_clf, gb_clf):
    clf.fit(X_train_bc, y_train)
    y_pred_train = clf.predict(X_train_bc)
    y_pred_test = clf.predict(X_test_bc)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_bag.append((acc_train, acc_test))

In [21]:
clfs_bag = [bag_clf, bag_clf_05, past_clf, past_clf_05, rf_clf, ab_clf, gb_clf]

In [22]:
acc_bag

[(0.9978021978021978, 0.5877192982456141),
 (0.9296703296703297, 0.6228070175438597),
 (1.0, 0.6228070175438597),
 (0.9692307692307692, 0.6140350877192983),
 (0.9956043956043956, 0.631578947368421),
 (0.8131868131868132, 0.6754385964912281),
 (0.8307692307692308, 0.6666666666666666)]

In [23]:
clfs_bag

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                   n_estimators=30),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                   n_estimators=30),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                   max_samples=0.5, n_estimators=30),
 RandomForestClassifier(n_estimators=30),
 AdaBoostClassifier(n_estimators=30),
 GradientBoostingClassifier(n_estimators=30)]

In [24]:
with open('acc_bag.pkl', 'wb') as fp:
    pickle.dump(acc_bag, fp)

In [25]:
with open('bag.pkl', 'wb') as fp:
    pickle.dump(clfs_bag, fp)

In [26]:
fea_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, max_features=2, bootstrap_features=False, bootstrap=True)

In [27]:
fea_clf.fit(X_train, y_train)
y_pred_train = fea_clf.predict(X_train)
y_pred_test = fea_clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

In [28]:
acc_fea = [acc_train, acc_test]
acc_fea

[0.9934065934065934, 0.9385964912280702]

In [29]:
fea = [fea_clf]

In [30]:
with open('acc_fea.pkl', 'wb') as fp:
    pickle.dump(acc_fea, fp)

In [31]:
with open('fea.pkl', 'wb') as fp:
    pickle.dump(fea, fp)

In [32]:
rank = []

In [33]:
for estimator, features in zip(fea_clf.estimators_, fea_clf.estimators_features_):
    y_pred_train = estimator.predict(X_train.iloc[:, features])
    y_pred_test = estimator.predict(X_test.iloc[:, features])
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    rank.append([acc_train, acc_test, list(X.columns[features])])



In [34]:
import pandas as pd
acc_fea_rank = pd.DataFrame(rank, columns=["accuracy train", "accuracy test", "features"])

In [35]:
acc_fea_rank

Unnamed: 0,accuracy train,accuracy test,features
0,0.896703,0.77193,"[mean concavity, worst smoothness]"
1,0.940659,0.850877,"[mean concavity, mean radius]"
2,0.8,0.657895,"[smoothness error, worst fractal dimension]"
3,0.784615,0.701754,"[texture error, worst texture]"
4,0.76044,0.614035,"[mean smoothness, smoothness error]"
5,0.923077,0.850877,"[radius error, mean concave points]"
6,0.793407,0.640351,"[mean symmetry, worst smoothness]"
7,0.923077,0.850877,"[worst compactness, mean concave points]"
8,0.931868,0.868421,"[texture error, worst concave points]"
9,0.775824,0.649123,"[mean symmetry, worst symmetry]"


In [36]:
acc_fea_rank.sort_values(by=["accuracy test", "accuracy train"], ascending=False, inplace=True)

In [37]:
acc_fea_rank

Unnamed: 0,accuracy train,accuracy test,features
15,0.925275,0.903509,"[mean area, worst fractal dimension]"
25,0.942857,0.894737,"[worst radius, radius error]"
22,0.945055,0.877193,"[worst symmetry, worst area]"
11,0.942857,0.877193,"[mean area, mean concave points]"
23,0.938462,0.868421,"[mean compactness, worst radius]"
8,0.931868,0.868421,"[texture error, worst concave points]"
16,0.949451,0.859649,"[worst perimeter, mean symmetry]"
1,0.940659,0.850877,"[mean concavity, mean radius]"
5,0.923077,0.850877,"[radius error, mean concave points]"
7,0.923077,0.850877,"[worst compactness, mean concave points]"


In [38]:
with open('acc_fea_rank.pkl', 'wb') as fp:
    pickle.dump(acc_fea_rank, fp)