In [112]:
# we load our dataset

import pandas as pd
import numpy as np
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

# The first experiments use only two feature columns, picked by position.
# NOTE(review): positions 1 and 8 are presumably "mean texture" and
# "mean symmetry" - confirm against the frame's column order.
selected_positions = [1, 8]
all_features = data_breast_cancer["data"]

X = all_features.iloc[:, selected_positions]
y = data_breast_cancer["target"]

In [113]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts



In [114]:

# we define classifiers

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# One seed shared by the estimators that take random_state
# (KNeighborsClassifier is constructed without one).
seed = 42

dtc = DecisionTreeClassifier(random_state=seed)
lrc = LogisticRegression(random_state=seed)
nbrs = KNeighborsClassifier()



In [115]:

# we define voting classifier

from sklearn.ensemble import VotingClassifier

# Both ensembles wrap the same three named base models; only the `voting`
# mode differs. Each ensemble gets its own copy of the (name, model) list.
base_estimators = [('dt', dtc), ('lr', lrc), ('knn', nbrs)]

voting_clf_hard = VotingClassifier(estimators=list(base_estimators), voting='hard')
voting_clf_soft = VotingClassifier(estimators=list(base_estimators), voting='soft')


# we train our voting classifiers

voting_clf_hard.fit(X_train, y_train)
# Kept as the cell's last expression so the fitted soft-voting estimator is displayed.
voting_clf_soft.fit(X_train, y_train)

VotingClassifier(estimators=[('dt', DecisionTreeClassifier(random_state=42)),
                             ('lr', LogisticRegression(random_state=42)),
                             ('knn', KNeighborsClassifier())],
                 voting='soft')

In [116]:

# we check the accuracy

from sklearn.metrics import accuracy_score

# Models are scored in a fixed order: first the individual estimators fitted
# inside the hard-voting ensemble, then the hard- and soft-voting ensembles.
listOfClassifiers = voting_clf_hard.estimators_ + [voting_clf_hard, voting_clf_soft]

# accuracyList[k] is the (train accuracy, test accuracy) pair of listOfClassifiers[k].
accuracyList = []

for model in listOfClassifiers:
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    accuracyList.append((train_accuracy, test_accuracy))
    



In [117]:

# we pickle the results

import pickle

# (file name, object) pairs: the accuracy tuples first, then the classifiers.
for pkl_name, payload in (('acc_vote.pkl', accuracyList), ('vote.pkl', listOfClassifiers)):
    with open(pkl_name, 'wb') as fh:
        pickle.dump(payload, fh)

# check unpickling

In [118]:
from sklearn.metrics import accuracy_score

# Two independent, initially empty containers: the accuracy pairs and the
# fitted ensemble classifiers they belong to.
accuracyList2, listOfClassifiers2 = [], []

In [119]:

#1 bagging

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


# Bagging: 30 decision trees, bootstrap=True (sampling with repetition).
# fit() returns the estimator, so construction and training are chained.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    bootstrap=True,
    random_state=42,
).fit(X_train, y_train)

listOfClassifiers2.append(bag_clf)


In [120]:

#2 bagging with 50 % instances

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


# Bagging where max_samples=0.5 limits each tree to 50% of the instances,
# sampled with repetition (bootstrap=True).
bag_clf50 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=0.5,
    bootstrap=True,
    random_state=42,
).fit(X_train, y_train)

listOfClassifiers2.append(bag_clf50)

In [121]:

#3 pasting   ( without repetition of samples )

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


# Pasting: bootstrap=False, i.e. samples are drawn without repetition.
# NOTE(review): max_samples is left at its default here; if that default is
# the full training set, every tree sees exactly the same rows - confirm
# this is the intended setup.
pas_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    bootstrap=False,
    random_state=42,
).fit(X_train, y_train)

listOfClassifiers2.append(pas_clf)

In [122]:

#4 pasting with 50 % instances

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


# Pasting where max_samples=0.5 limits each tree to 50% of the instances,
# drawn without repetition (bootstrap=False).
pas_clf50 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=0.5,
    bootstrap=False,
    random_state=42,
).fit(X_train, y_train)

listOfClassifiers2.append(pas_clf50)


In [123]:
#5 random forest

from sklearn.ensemble import RandomForestClassifier

# Random forest with 30 trees and a fixed seed; build and train in one step.
rfc = RandomForestClassifier(
    n_estimators=30,
    random_state=42,
).fit(X_train, y_train)

listOfClassifiers2.append(rfc)

In [124]:
#6 ada boosting

from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 30 boosting rounds and a fixed seed; build and train in one step.
abc = AdaBoostClassifier(
    n_estimators=30,
    random_state=42,
).fit(X_train, y_train)

listOfClassifiers2.append(abc)

In [125]:
#7 gradient boosting

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 30 stages and a fixed seed; build and train in one step.
gbrt = GradientBoostingClassifier(
    n_estimators=30,
    random_state=42,
).fit(X_train, y_train)

listOfClassifiers2.append(gbrt)

In [126]:
print(listOfClassifiers2)

# we calculate accuracy
#
# accuracyList2 is rebuilt from scratch on every run instead of being appended
# to: the list is created in a different cell, so re-executing this cell used
# to stack a second copy of every result onto it, and that list is what gets
# pickled afterwards. Built this way, accuracyList2[k] is always the
# (train accuracy, test accuracy) pair of listOfClassifiers2[k].
accuracyList2 = [
    (
        accuracy_score(y_train, clf2.predict(X_train)),
        accuracy_score(y_test, clf2.predict(X_test)),
    )
    for clf2 in listOfClassifiers2
]

print(accuracyList2)

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30,
                  random_state=42), BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30, random_state=42), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30, random_state=42), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30, random_state=42), RandomForestClassifier(n_estimators=30, random_state=42), AdaBoostClassifier(n_estimators=30, random_state=42), GradientBoostingClassifier(n_estimators=30, random_state=42)]
[(0.9956043956043956, 0.6754385964912281), (0.9296703296703297, 0.6842105263157895), (1.0, 0.6228070175438597), (0.9736263736263736, 0.6491228070175439), (0.9956043956043956, 0.6754385964912281), (0.8, 0.7368421052631579), (0.8373626373626374, 0.7105263157894737)]


In [127]:
# we pickle data

import pickle

# (file name, object) pairs: the accuracy tuples first, then the classifiers.
for pkl_name, payload in (('acc_bag.pkl', accuracyList2), ('bag.pkl', listOfClassifiers2)):
    with open(pkl_name, 'wb') as fh:
        pickle.dump(payload, fh)

# check unpickling

In [128]:
# we generate second dataset with all features

# Unlike X, X2 keeps every column of the data frame; the target is unchanged.
X2, y2 = data_breast_cancer["data"], data_breast_cancer["target"]


# we split into train and test sets

from sklearn.model_selection import train_test_split

# Same test fraction and seed as the two-feature split.
split_parts2 = train_test_split(X2, y2, test_size=0.2, random_state=42)
X_train2, X_test2, y_train2, y_test2 = split_parts2



In [129]:
# bagging with 2 features from all
 

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


# 30 decision trees, each limited to 2 features (max_features=2) and to half
# of the training instances (max_samples=0.5), drawn without repetition
# (bootstrap=False).
fea_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=0.5,
    max_features=2,
    bootstrap=False,
    random_state=42,
)

# Fit exactly once: the original cell called fit() twice in a row, and the
# second call only repeated the training. fit() stays the last expression so
# the notebook still displays the fitted estimator.
fea_clf.fit(X_train2, y_train2)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=2, max_samples=0.5, n_estimators=30,
                  random_state=42)

In [130]:
# accuracies

# we predict train set
# [train accuracy, test accuracy] of the whole feature-subsampled ensemble.
listAccuracies3 = [
    accuracy_score(y_train2, fea_clf.predict(X_train2)),
    accuracy_score(y_test2, fea_clf.predict(X_test2)),
]

# Wrapped in a list so it can be pickled in the same shape as the other results.
classificator = [fea_clf]

print(listAccuracies3)
print(classificator)


[1.0, 0.9736842105263158]
[BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=2, max_samples=0.5, n_estimators=30,
                  random_state=42)]


In [131]:
# we pickle data

import pickle

# (file name, object) pairs: the accuracy list first, then the classifier list.
for pkl_name, payload in (('acc_fea.pkl', listAccuracies3), ('fea.pkl', classificator)):
    with open(pkl_name, 'wb') as fh:
        pickle.dump(payload, fh)

# check unpickling

In [132]:
# Rank the individual estimators of fea_clf by accuracy, together with the
# names of the features each one was given.

import pandas as pd

feature_names = data_breast_cancer["data"].columns

listAccTrain = []
listAccTest = []
listFeatures = []

# estimators_ and estimators_features_ are walked in lockstep: the k-th index
# array holds the column positions used by the k-th estimator. Indexing with
# the whole array (instead of elements [0] and [1]) keeps this working if
# max_features is changed from 2.
for estimator, feature_idx in zip(fea_clf.estimators_, fea_clf.estimators_features_):

    # Each estimator only understands its own column subset.
    # NOTE(review): the sub-estimators appear to be fitted on plain arrays, so
    # NumPy arrays are passed here rather than DataFrame slices, which avoids
    # scikit-learn's feature-name mismatch warning - confirm on your version.
    X_train_sub = X_train2.iloc[:, feature_idx].to_numpy()
    X_test_sub = X_test2.iloc[:, feature_idx].to_numpy()

    listAccTrain.append(accuracy_score(y_train2, estimator.predict(X_train_sub)))
    listAccTest.append(accuracy_score(y_test2, estimator.predict(X_test_sub)))
    listFeatures.append(list(feature_names[feature_idx]))

data = {"Acc_train" : listAccTrain , "Acc_test" : listAccTest, "Features" : listFeatures}

# creating dataframe from dict
df = pd.DataFrame(data)

# Best test accuracy first; ties are broken by train accuracy, also descending.
df_sorted = df.sort_values(by=['Acc_test','Acc_train'], ascending=False)
print(df_sorted)



    Acc_train  Acc_test                                      Features
15   0.938462  0.938596          [worst fractal dimension, mean area]
11   0.934066  0.938596                    [worst radius, area error]
26   0.934066  0.929825               [mean radius, mean compactness]
9    0.931868  0.921053                     [mean area, worst radius]
6    0.938462  0.912281            [worst perimeter, mean smoothness]
19   0.953846  0.894737  [worst concave points, concave points error]
1    0.945055  0.894737                 [mean radius, mean concavity]
4    0.927473  0.894737          [worst area, mean fractal dimension]
7    0.912088  0.885965            [concave points error, area error]
29   0.901099  0.885965                 [concavity error, area error]
5    0.912088  0.850877                [mean radius, perimeter error]
13   0.909890  0.850877               [perimeter error, worst radius]
20   0.931868  0.842105             [worst perimeter, symmetry error]
25   0.909890  0.833

In [133]:
# we pickle our rank

import pickle

# Persist the sorted ranking frame as-is.
rank_path = 'acc_fea_rank.pkl'

with open(rank_path, 'wb') as rank_file:
    pickle.dump(df_sorted, rank_file)

# unpickling
