In [27]:
#
import pandas as pd
import numpy as np

data = pd.read_csv('D:\\MBA\\D A T A     S C I E N C E\\Data\\diabetes.csv')
data.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
# here we do some correction in the dataset  - - 

data['Glucose'] = np.where(data['Glucose'] == 0, data['Glucose'].median(), data['Glucose'])
data['Insulin'] = np.where(data['Insulin'] == 0, data['Insulin'].median(), data['Insulin'])
data['SkinThickness'] = np.where(data['SkinThickness'] == 0, data['SkinThickness'].median(), data['SkinThickness'])

data.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [29]:
# divide  - - 
X = data.iloc[:,:-1]
Y = data['Outcome']


In [30]:
Y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [31]:

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.20,random_state=0)


In [32]:
# without any hyperparameter tuning  ---

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10)
classifier.fit(X_train, y_train)

predict = classifier.predict(X_test)


In [33]:

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,predict))
print(accuracy_score(y_test,predict))
print(classification_report(y_test,predict))


[[91 16]
 [19 28]]
0.7727272727272727
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       107
           1       0.64      0.60      0.62        47

    accuracy                           0.77       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.77      0.77      0.77       154



#### Random Forest Classifier    - -

> The main parameters used by a Random Forest Classifier are:

1) criterion = the function used to evaluate the quality of a split.
2) max_depth = maximum number of levels allowed in each tree.
3) max_features = maximum number of features considered when splitting a node.
4) min_samples_leaf = minimum number of samples which can be stored in a tree leaf.
5) min_samples_split = minimum number of samples necessary in a node to cause node splitting.
6) n_estimators = number of trees in the ensamble

In [8]:


### Manual Hyperparameter Tuning
model = RandomForestClassifier(n_estimators=300,
                             criterion='entropy',
                             max_features='sqrt',
                             min_samples_leaf=10,
                             random_state=100)

model.fit(X_train, y_train)

predict = model.predict(X_test)

print(confusion_matrix(y_test,predict))
print(accuracy_score(y_test,predict))
print(classification_report(y_test,predict))


[[97 10]
 [17 30]]
0.8246753246753247
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       107
           1       0.75      0.64      0.69        47

    accuracy                           0.82       154
   macro avg       0.80      0.77      0.78       154
weighted avg       0.82      0.82      0.82       154



### with manual Hyperparameter Tuning we get good accuracy
### Now we apply some Tuning Technique   - - --

In [9]:
 # new - -

## 1) Randomized Search CV    --

####  it randomly makes pair from the given parameters and tell us which one is best  --

In [10]:
#
from sklearn.model_selection import RandomizedSearchCV

# number of trees in Random Forest --
n_estimators = [int(x) for x in np.linspace(200,2000,10)]

# number of features to consider at every split - -
max_features = ['auto', 'sqrt', 'log2']

# maximum number of levels in tree  --
max_depth = [int(x)  for x in np.linspace(10,1000,10)]

# minimum number ofample required to split the node  --
min_samples_split = [1,2,3,5,6,4,7]

# minimum number of sample required at each leaf
min_samples_leaf = [2,4,3,5,6]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 2, 3, 5, 6, 4, 7], 'min_samples_leaf': [2, 4, 3, 5, 6], 'criterion': ['entropy', 'gini']}


In [11]:
#
random = RandomForestClassifier()

# here in randomsearchcv first we fit our algorithm in estimator
## then, those params which is randomly selected by model
### then all  -- 

random_cv = RandomizedSearchCV(estimator = random,
                              param_distributions = random_grid,
                              n_iter = 100,
                              cv = 3,
                              verbose = 2,
                              random_state = 100,
                              n_jobs = -1)

random_cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [12]:
#
## here we assign all the output values to best_params  and from these we find our accuracy and build model 

best_params = random_cv.best_estimator_
random_cv.best_params_

{'n_estimators': 400,
 'min_samples_split': 6,
 'min_samples_leaf': 6,
 'max_features': 'log2',
 'max_depth': 120,
 'criterion': 'gini'}

In [13]:

y_pred = best_params.predict(X_test)

from sklearn.metrics import accuracy_score
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))


[[97 10]
 [15 32]]
Accuracy Score 0.8376623376623377
Classification report:               precision    recall  f1-score   support

           0       0.87      0.91      0.89       107
           1       0.76      0.68      0.72        47

    accuracy                           0.84       154
   macro avg       0.81      0.79      0.80       154
weighted avg       0.83      0.84      0.83       154



#### as we see here our accuracy increase in some amount  --^

In [14]:
#  new  - -

## 2) Grid Search CV  --

In [16]:

# 

from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion' : [random_cv.best_params_['criterion']],
    'max_depth' : [random_cv.best_params_['max_depth']],
    'max_features' : [random_cv.best_params_['max_features']],
    'min_samples_leaf' : [random_cv.best_params_['min_samples_leaf'],
                        random_cv.best_params_['min_samples_leaf'] + 2,
                        random_cv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split' : [random_cv.best_params_['min_samples_split']-2,
                         random_cv.best_params_['min_samples_split']-1,
                         random_cv.best_params_['min_samples_split'],
                         random_cv.best_params_['min_samples_split']+1,
                         random_cv.best_params_['min_samples_split']+2],
    'n_estimators' : [random_cv.best_params_['n_estimators']-200,
                     random_cv.best_params_['n_estimators']-100,
                     random_cv.best_params_['n_estimators'],
                     random_cv.best_params_['n_estimators']+100,
                     random_cv.best_params_['n_estimators']+200]
} 

print(param_grid)

{'criterion': ['gini'], 'max_depth': [120], 'max_features': ['log2'], 'min_samples_leaf': [6, 8, 10], 'min_samples_split': [4, 5, 6, 7, 8], 'n_estimators': [200, 300, 400, 500, 600]}


In [17]:
# - -
random = RandomForestClassifier()

grid_search = GridSearchCV(estimator = random,
                          param_grid = param_grid,
                          cv = 10,
                          n_jobs = -1,
                          verbose = 2)

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:  8.1min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rand

In [18]:
#
best_grid = grid_search.best_estimator_


In [19]:

y_pred = best_grid.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))


[[97 10]
 [16 31]]
Accuracy Score 0.8311688311688312
Classification report:               precision    recall  f1-score   support

           0       0.86      0.91      0.88       107
           1       0.76      0.66      0.70        47

    accuracy                           0.83       154
   macro avg       0.81      0.78      0.79       154
weighted avg       0.83      0.83      0.83       154



In [20]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.4-py2.py3-none-any.whl (964 kB)
Installing collected packages: hyperopt
Successfully installed hyperopt-0.2.4


In [21]:
# new - -  -

In [8]:
!pip install hyperopt

Collecting hyperopt

Error processing line 1 of C:\Users\hemant\.conda\envs\new base\lib\site-packages\matplotlib-3.3.0-py3.7-nspkg.pth:

  Traceback (most recent call last):
    File "C:\Users\hemant\.conda\envs\new base\lib\site.py", line 168, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
    File "<frozen importlib._bootstrap>", line 580, in module_from_spec
  AttributeError: 'NoneType' object has no attribute 'loader'

Remainder of file ignored
ERROR: Error checking for conflicts.
Traceback (most recent call last):
  File "C:\Users\hemant\.conda\envs\new base\lib\site-packages\pip\_vendor\pkg_resources\__init__.py", line 3021, in _dep_map
    return self.__dep_map
  File "C:\Users\hemant\.conda\envs\new base\lib\site-packages\pip\_vendor\pkg_resources\__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map



  Using cached hyperopt-0.2.4-py2.py3-none-any.whl (964 kB)
Collecting cloudpickle
  Downloading cloudpickle-1.5.0-py3-none-any.whl (22 kB)
Collecting networkx>=2.2
  Downloading networkx-2.4-py3-none-any.whl (1.6 MB)
Installing collected packages: cloudpickle, networkx, hyperopt
Successfully installed cloudpickle-1.5.0 hyperopt-0.2.4 networkx-2.4



During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\hemant\.conda\envs\new base\lib\site-packages\pip\_vendor\pkg_resources\__init__.py", line 3012, in _parsed_pkg_info
    return self._pkg_info
  File "C:\Users\hemant\.conda\envs\new base\lib\site-packages\pip\_vendor\pkg_resources\__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _pkg_info

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\hemant\.conda\envs\new base\lib\site-packages\pip\_internal\commands\install.py", line 512, in _warn_about_conflicts
    package_set, _dep_info = check_install_conflicts(to_install)
  File "C:\Users\hemant\.conda\envs\new base\lib\site-packages\pip\_internal\operations\check.py", line 114, in check_install_conflicts
    package_set, _ = create_package_set_from_installed()
  File "C:\Users\hemant\.conda\envs\new base\lib\sit

## 3) Bayesian Optimization
Bayesian optimization uses probability to find the minimum of a function. The final aim is to find the input value to a function which can gives us the lowest possible output value.It usually performs better than random,grid and manual search providing better performance in the testing phase and reduced optimization time. In Hyperopt, Bayesian Optimization can be implemented giving 3 three main parameters to the function fmin.

Objective Function = defines the loss function to minimize.
Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability distribution for each of the used Hyperparameters).
Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration.

In [9]:

from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [10]:

def objective(space):
    
    model = RandomForestClassifier(criterion = space['criterion'],
                                 max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'],
                                )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()
    # We aim to maximize accuracy, therefore we return it as a negative value   
    return {'loss' : -accuracy, 'status' : STATUS_OK}


In [11]:
#
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn = objective,
           space = space,
           algo= tpe.suggest,
           max_evals = 80,
           trials= trials)
best

100%|██████████| 80/80 [16:00<00:00, 12.00s/trial, best loss: -0.7703451952552313]


{'criterion': 1,
 'max_depth': 940.0,
 'max_features': 2,
 'min_samples_leaf': 0.09822663981947563,
 'min_samples_split': 0.17051290339068528,
 'n_estimators': 2}

In [12]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
log2
300


In [15]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[97 10]
 [26 21]]
0.7662337662337663
              precision    recall  f1-score   support

           0       0.79      0.91      0.84       107
           1       0.68      0.45      0.54        47

    accuracy                           0.77       154
   macro avg       0.73      0.68      0.69       154
weighted avg       0.75      0.77      0.75       154



In [16]:
# new  - -

## 4) Genetic Algorithm  - - 

Genetic Algorithms
Genetic Algorithms tries to apply natural selection mechanisms to Machine Learning contexts.

Let's immagine we create a population of N Machine Learning models with some predifined Hyperparameters. We can then calculate the accuracy of each model and decide to keep just half of the models (the ones that performs best). We can now generate some offsprings having similar Hyperparameters to the ones of the best models so that go get again a population of N models. At this point we can again caltulate the accuracy of each model and repeate the cycle for a defined number of generations. In this way, just the best models will survive at the end of the process.

In [34]:

import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [35]:
conda install -c conda-forge ipywidgets


Note: you may need to restart the kernel to use updated packages.


usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: ipywidgets


In [36]:
# #
# from tpot import TPOTClassifier

# tpot_classifier = TPOTClassifier(generations = 5,
#                                  population_size= 24, 
#                                  offspring_size= 12,
#                                  verbosity= 2, 
#                                  early_stop= 12,
#                                  config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
#                                  cv = 4, 
#                                  scoring = 'accuracy')

# tpot_classifier.fit(X_train,y_train)
from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [37]:

#  it creates some issue regarding tpot by the way it is one of the best technique to use  - - 

# in which it first respect to xtest find ypred and then it compares with ytest  and gives accuracy  - --

accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)


RuntimeError: A pipeline has not yet been optimized. Please call fit() first.