# Listing and using scikit-learn machine learning algorithms

#### This file lists the machine learning algorithms included in scikit-learn and classifies the Wisconsin breast cancer dataset using these algorithms. 5 of these algorithms was optimized using the RandomizedSearchCV method.  The optimized algorithms are:

* RandomForestClassifier
* ExtraTreeClassifier
* SVC
* GradientBoostingClassifier
* DecisionTreeClassifier

### importing of required libraries for listing Sk-learn estimators

In [8]:
from sklearn.utils import all_estimators
import warnings
warnings.filterwarnings("ignore")



In [5]:
estimators = all_estimators(type_filter='classifier')

estimator_list = []
for number, estimator in estimators:
    print(number, estimator)
    try:
        clf = estimator()
        estimator_list.append(clf)
    except Exception as e:
        print('\nUnable to import   ------------------>', estimator,"\n")
        print(e)


AdaBoostClassifier <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
BaggingClassifier <class 'sklearn.ensemble._bagging.BaggingClassifier'>
BernoulliNB <class 'sklearn.naive_bayes.BernoulliNB'>
CalibratedClassifierCV <class 'sklearn.calibration.CalibratedClassifierCV'>
CategoricalNB <class 'sklearn.naive_bayes.CategoricalNB'>
ClassifierChain <class 'sklearn.multioutput.ClassifierChain'>

Unable to import   ------------------> <class 'sklearn.multioutput.ClassifierChain'> 

__init__() missing 1 required positional argument: 'base_estimator'
ComplementNB <class 'sklearn.naive_bayes.ComplementNB'>
DecisionTreeClassifier <class 'sklearn.tree._classes.DecisionTreeClassifier'>
DummyClassifier <class 'sklearn.dummy.DummyClassifier'>
ExtraTreeClassifier <class 'sklearn.tree._classes.ExtraTreeClassifier'>
ExtraTreesClassifier <class 'sklearn.ensemble._forest.ExtraTreesClassifier'>
GaussianNB <class 'sklearn.naive_bayes.GaussianNB'>
GaussianProcessClassifier <class 'sklearn.gaussia

### list of obtained algorithms

In [6]:
estimator_list

[AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                    n_estimators=50, random_state=None),
 BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                   max_features=1.0, max_samples=1.0, n_estimators=10,
                   n_jobs=None, oob_score=False, random_state=None, verbose=0,
                   warm_start=False),
 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
 CalibratedClassifierCV(base_estimator=None, cv=None, method='sigmoid'),
 CategoricalNB(alpha=1.0, class_prior=None, fit_prior=True),
 ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                

### importing required libraries for using this MLs

In [9]:
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm.classes import OneClassSVM
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neural_network.multilayer_perceptron import MLPClassifier
from sklearn.neighbors.classification import RadiusNeighborsClassifier
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.linear_model.ridge import RidgeClassifierCV
from sklearn.linear_model.ridge import RidgeClassifier
from sklearn.linear_model.passive_aggressive import PassiveAggressiveClassifier    
from sklearn.gaussian_process.gpc import GaussianProcessClassifier
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import  ComplementNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB  
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier


from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

from sklearn import datasets
import sklearn
import time


from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from scipy.stats import randint as sp_randint
from scipy.stats import randint as sp_randFloat

### Load the breast cancer wisconsin dataset

In [10]:
cancer=datasets.load_breast_cancer()


X_cancer = cancer.data
y_cancer= cancer.target

###  List of MLs in dictinary

In [11]:
ml_list={"ExtraTreeClassifier":ExtraTreeClassifier(),
"DecisionTreeClassifier":DecisionTreeClassifier(),
"OneClassSVM":OneClassSVM(),
"MLPClassifier":MLPClassifier(),
"ComplementNB":ComplementNB(),
"DummyClassifier":DummyClassifier(),         
"RadiusNeighborsClassifier":RadiusNeighborsClassifier(),
"KNeighborsClassifier":KNeighborsClassifier(),
"ClassifierChain":ClassifierChain(base_estimator=DecisionTreeClassifier()),
"MultiOutputClassifier":MultiOutputClassifier(estimator=DecisionTreeClassifier()),
"OutputCodeClassifier":OutputCodeClassifier(estimator=DecisionTreeClassifier()),
"OneVsOneClassifier":OneVsOneClassifier(estimator=DecisionTreeClassifier()),
"OneVsRestClassifier":OneVsRestClassifier(estimator=DecisionTreeClassifier()),
"SGDClassifier":SGDClassifier(),
"RidgeClassifierCV":RidgeClassifierCV(),
"RidgeClassifier":RidgeClassifier(),
"PassiveAggressiveClassifier    ":PassiveAggressiveClassifier    (),
"GaussianProcessClassifier":GaussianProcessClassifier(),
"AdaBoostClassifier":AdaBoostClassifier(),
"GradientBoostingClassifier":GradientBoostingClassifier(),
"BaggingClassifier":BaggingClassifier(),
"ExtraTreesClassifier":ExtraTreesClassifier(),
"RandomForestClassifier":RandomForestClassifier(),
"BernoulliNB":BernoulliNB(),
"CalibratedClassifierCV":CalibratedClassifierCV(),
"GaussianNB":GaussianNB(),
"LabelPropagation":LabelPropagation(),
"LabelSpreading":LabelSpreading(),
"LinearDiscriminantAnalysis":LinearDiscriminantAnalysis(),
"LinearSVC":LinearSVC(),
"LogisticRegression":LogisticRegression(),
"LogisticRegressionCV":LogisticRegressionCV(),
"MultinomialNB  ":MultinomialNB  (),
"NearestCentroid":NearestCentroid(),
"NuSVC":NuSVC(),
"Perceptron":Perceptron(),
"QuadraticDiscriminantAnalysis":QuadraticDiscriminantAnalysis(),
"SVC":SVC(),
"HistGradientBoostingClassifier":HistGradientBoostingClassifier(),
"CategoricalNB" : CategoricalNB()}


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, test_size = 0.25, random_state = 0)

print ('%-40s %-20s %-20s %-20s %-20s' % ("Model".center(22) ,"F1 Score".center(20),"Accuracy".center(15) ,"Training Time".center(15),"Testing Time".center(15) ))
print ('%-40s %-20s %-20s %-20s %-20s' % ("|____________________|","____________________","____________________" ,"____________________","____________________" ))
#X_train, X_test, y_train, y_test = train_test_split(X_wine, y_wine, test_size = 0.25, random_state = 0)

for i in ml_list:
    try:
        clf=ml_list[i]
        second=time.time()
        clf.fit(X_train, y_train)
        train=round(time.time()-second,5)
        second=time.time()
        predict =clf.predict(X_test)
        test=round(time.time()-second,5)
        f1=round(sklearn.metrics.f1_score(y_test, predict, average='macro'),5)
        acc=round(sklearn.metrics.accuracy_score(y_test, predict),5)
        print ('%-40s %-20s %-20s %-20s %-20s' % (i,f1,acc,train,test ))
    except:        print ('%-40s %-20s %-20s' % (i,"Error","Error" ))


        Model                                  F1 Score           Accuracy          Training Time         Testing Time      
|____________________|                   ____________________ ____________________ ____________________ ____________________
ExtraTreeClassifier                      0.92614              0.93007              0.001                0.0                 
DecisionTreeClassifier                   0.89092              0.8951               0.00698              0.0                 
OneClassSVM                              0.20513              0.33566              0.012                0.00196             
MLPClassifier                            0.89586              0.9021               0.46438              0.001               
ComplementNB                             0.89024              0.9021               0.001                0.00101             
DummyClassifier                          0.55368              0.58042              0.001                0.0                 


# optimizing some MLs using the RandomizedSearchCV method

In [13]:
opt = {"RandomForestClassifier":{"max_depth":np.linspace(1, 32, 32, endpoint=True),
"n_estimators" : sp_randint(1, 200),
"max_features": sp_randint(1, 11),
"min_samples_split":sp_randint(2, 11),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]},
          
"ExtraTreeClassifier":{"max_depth":np.linspace(1, 32, 32, endpoint=True),
"max_features": sp_randint(1, 11),
"min_samples_split":sp_randint(2, 11),
#"ccp_alpha":sp_randint(2, 11),
#"class_weight":["balanced", "balanced_subsample"],"max_leaf_nodes"
"criterion": ["gini", "entropy"]},

"SVC": {"C": np.linspace(1, 1000, 10000, endpoint=True),
"gamma": np.linspace(0.1, 1000, 10000, endpoint=True)},
          
"GradientBoostingClassifier":{"learning_rate":np.linspace(0.0, 100, 10000, endpoint=True), #sp_randFloat(0.2,1.0),
"subsample"    :np.linspace(0.0, 1, 100, endpoint=True), #sp_randFloat(0.2,1.0),
"n_estimators" : sp_randInt(1, 1000),
"max_depth"    : sp_randInt(1, 1000)},       

          
          
"DecisionTreeClassifier" :  { 'criterion':['gini','entropy'],
"max_depth":np.linspace(1, 100, 100, endpoint=True),
"min_samples_split": sp_randint(2,100),#uniform(0.1,1),
 #"min_samples_leafs" : np.linspace(0.1, 0.5, 5, endpoint=True),
"max_features" : sp_randint(1,X_train.shape[1])}    
         }

In [14]:
X = cancer.data
y= cancer.target
models=[RandomForestClassifier(),
ExtraTreeClassifier(),
SVC(),
GradientBoostingClassifier(),
DecisionTreeClassifier()]
for i in models:
    
    clf = i#(n_estimators=20)
    second=time.time()
    # use a full grid over all parameters
    temp=str(i)[:-2]
    print(temp)
    for ii in [0,1]:
        if ii:
            param_dist =   opt[temp]
            print("OPTIMIZED")
        else:
            param_dist={}
            print("NOT OPTIMIZED")
        n_iter_search = 10
        random_search = RandomizedSearchCV(clf, param_distributions=param_dist)
        random_search.fit(X, y)
        print(random_search.best_params_)
        print (random_search.best_score_)
        print (random_search.best_params_)
        print (random_search.best_estimator_)
        print("time= ", (time.time()-second))
        print("-------------------------------------------------------------")

RandomForestClassifier
NOT OPTIMIZED
{}
0.9578481602235678
{}
RandomForestClassifier()
time=  1.186774730682373
-------------------------------------------------------------
OPTIMIZED
{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10.0, 'max_features': 6, 'min_samples_split': 5, 'n_estimators': 155}
0.9701443875174661
{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10.0, 'max_features': 6, 'min_samples_split': 5, 'n_estimators': 155}
RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10.0,
                       max_features=6, min_samples_split=5, n_estimators=155)
time=  11.857276439666748
-------------------------------------------------------------
ExtraTreeClassifier
NOT OPTIMIZED
{}
0.924468250271697
{}
ExtraTreeClassifier()
time=  0.012934684753417969
-------------------------------------------------------------
OPTIMIZED
{'criterion': 'gini', 'max_depth': 10.0, 'max_features': 8, 'min_samples_split': 4}
0.947259742276044
{'criterion': 'g