In [1]:
# Cross_validation: get a more reliable evaluation matrix such as an accuracy_score, k-fold cross validation
# Hyperparemeter tuning. 2 kinds of parameters: model parameter (derived from the dataset that we have) & 
# hyperparameter (determines how your model is being trained and they are changed to get the best fit of the model)
# 2 main important techniques to work on: GridSearchCV and RandomizedSearchCV
# Model selection
# Accuracy and confusion matrix
# Precision, Recall and F1 score - reliable (accuracy_score)
# Metrix for regression (Mean Absolute Error, Mean Squared Error)

### K-Fold Cross-Validation
  - we split the dataset into "K" number of folds (subsets). One chunk of data is used as test data for evaluation
  & the remaining part of the data is used for training the model. Each time, a different chunk/fold will be used as the test data
  - The number of folds is equal to the number of iterations of data. At time, a certain chunk(s) of data is used 
  for training while the other is used for testing(evaluation)
  - Advantage of using K-Fold to Train_Test_split is that you are testing your model with different chunks of data(iterations).
  We have different test data for every training session hence a good one to evaluate
  - A different instance of model is used in different iterations
  ADVANTAGES OF K-FOLD CROSS-VALIDATION :
  1. Better alternative for train_test split when the dataset is small. It takes a long time when dataset is large
  2. Better for multiclass classification problems
  3. More reliable
  4. useful for model selection

### 1. cross validation

In [32]:
# import necessary dependencies
import sklearn.datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler,MinMaxScaler

# below class is for cross validation
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# importing the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [18]:
# read data or csv to pandas dataframe
heart_data = pd.read_csv('../heart.csv')
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [19]:
# number of rows and columns
heart_data.shape

(303, 14)

In [20]:
# check for missing values
heart_data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [21]:
heart_data['target'].unique()

array([1, 0], dtype=int64)

In [22]:
# checking distribution of target variable
# 1 - defective heart
# 0 - healthy heart 
heart_data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [23]:
# splitting the features and target
X = heart_data.drop('target', axis=1)
y = heart_data['target']

X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [35]:
# check unique columns
for col in heart_data:
    print(f'{col} : {heart_data[col].unique()}')

age : [63 37 41 56 57 44 52 54 48 49 64 58 50 66 43 69 59 42 61 40 71 51 65 53
 46 45 39 47 62 34 35 29 55 60 67 68 74 76 70 38 77]
sex : [1 0]
cp : [3 2 1 0]
trestbps : [145 130 120 140 172 150 110 135 160 105 125 142 155 104 138 128 108 134
 122 115 118 100 124  94 112 102 152 101 132 148 178 129 180 136 126 106
 156 170 146 117 200 165 174 192 144 123 154 114 164]
chol : [233 250 204 236 354 192 294 263 199 168 239 275 266 211 283 219 340 226
 247 234 243 302 212 175 417 197 198 177 273 213 304 232 269 360 308 245
 208 264 321 325 235 257 216 256 231 141 252 201 222 260 182 303 265 309
 186 203 183 220 209 258 227 261 221 205 240 318 298 564 277 214 248 255
 207 223 288 160 394 315 246 244 270 195 196 254 126 313 262 215 193 271
 268 267 210 295 306 178 242 180 228 149 278 253 342 157 286 229 284 224
 206 167 230 335 276 353 225 330 290 172 305 188 282 185 326 274 164 307
 249 341 407 217 174 281 289 322 299 300 293 184 409 259 200 327 237 218
 319 166 311 169 187 176 241 131]
fbs :

In [38]:
#Scale the data above 
scaler = MinMaxScaler()
heart_data = pd.DataFrame(scaler.fit_transform(heart_data), columns=heart_data.columns)
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.708333,1.0,1.0,0.481132,0.244292,1.0,0.0,0.603053,0.0,0.370968,0.0,0.0,0.333333,1.0
1,0.166667,1.0,0.666667,0.339623,0.283105,0.0,0.5,0.885496,0.0,0.564516,0.0,0.0,0.666667,1.0
2,0.25,0.0,0.333333,0.339623,0.178082,0.0,0.0,0.770992,0.0,0.225806,1.0,0.0,0.666667,1.0
3,0.5625,1.0,0.333333,0.245283,0.251142,0.0,0.5,0.816794,0.0,0.129032,1.0,0.0,0.666667,1.0
4,0.583333,0.0,0.0,0.245283,0.520548,0.0,0.5,0.70229,1.0,0.096774,1.0,0.0,0.666667,1.0


In [41]:
# check unique columns
# for col in heart_data:
#     print(f'{col} : {heart_data[col].unique()}')

In [42]:
 print(y.head())

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64


### train_test_split

In [43]:
# split dataset into train and test data
# use train_test_split from scikit learn
# stratify makes the data have the same proportion i.e target data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0, stratify=y)

In [44]:
# X_train contains all the features for the training data
# y_train contains corresponding target

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(242, 13)
(242,)
(61, 13)
(61,)


In [45]:
# train all the models
# Purpose is to compare the performance of the models

In [46]:
# create list of models
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

In [47]:
# create a function to fit and predict the above list of models
def compare_models_train_test():
    
    for model in models:
#         train the models
        model.fit(X_train, y_train)
#         evaluate the models
        test_data_prediction = model.predict(X_test)
#         compare true labels (y_test) and the predicted lables test_data_prediction
        accuracy = accuracy_score(y_test, test_data_prediction)
    
        print(f'Accuracy score of the model {model} = {accuracy}')

In [48]:
# call the function 
compare_models_train_test()

Accuracy score of the model LogisticRegression(max_iter=1000) = 0.8688524590163934
Accuracy score of the model SVC(kernel='linear') = 0.8688524590163934
Accuracy score of the model KNeighborsClassifier() = 0.7049180327868853
Accuracy score of the model RandomForestClassifier() = 0.8524590163934426


#### The error above is because we have not standardized some columns. Standardized each columns individually 
apart from the ones with zeros and and 1's

In [49]:
# cross_val_score : single function used to automatically do the splitting of data and calculate the accuracy
# It is better than KFold since we dont have to train our data seperately

### cross_validation

In [50]:
# LogisticRegression
# cv = how many chunks/folds/iterations you want, when you use it, you automatically use the StratifiedKFold method
# with StratifiedKFold your class distribution will be even similar to entire dataset
# KFold data wount be stratified
cv_score_lr = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5)

In [51]:
print(cv_score_lr)

mean_accuracy_lr = sum(cv_score_lr)/len(cv_score_lr)

mean_accuracy_lr = mean_accuracy_lr * 100

mean_accuracy_lr = round(mean_accuracy_lr, 2)

mean_accuracy_lr

[0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]


82.83

In [52]:
# Support Vector Classifier
cv_score_svc = cross_val_score(SVC(kernel='linear'), X, y, cv=5)

print(cv_score_svc)

mean_accuracy_svc = sum(cv_score_svc)/len(cv_score_svc)

mean_accuracy_svc = mean_accuracy_svc * 100

mean_accuracy_svc = round(mean_accuracy_svc, 2)

mean_accuracy_svc

[0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]


82.83

In [53]:
# use one function to compare the models
def compare_models_cross_validation():
    
    for model in models:
        
        cv_score = cross_val_score(model, X, y, cv = 5)
        
        mean_accuracy = sum(cv_score)/len(cv_score)
        
        mean_accuracy = mean_accuracy * 100
        
        mean_accuracy = round(mean_accuracy, 2)
        
        print(f"Cross validation accuracies for {model} = {cv_score}")
        
        print(f"Accuracy % of the {model} {mean_accuracy}")
        
        print("-----------------------------------------------")
        

In [54]:
compare_models_cross_validation()

Cross validation accuracies for LogisticRegression(max_iter=1000) = [0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]
Accuracy % of the LogisticRegression(max_iter=1000) 82.83
-----------------------------------------------
Cross validation accuracies for SVC(kernel='linear') = [0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
Accuracy % of the SVC(kernel='linear') 82.83
-----------------------------------------------
Cross validation accuracies for KNeighborsClassifier() = [0.60655738 0.6557377  0.57377049 0.73333333 0.65      ]
Accuracy % of the KNeighborsClassifier() 64.39
-----------------------------------------------
Cross validation accuracies for RandomForestClassifier() = [0.83606557 0.86885246 0.7704918  0.78333333 0.73333333]
Accuracy % of the RandomForestClassifier() 79.84
-----------------------------------------------


### Addtional notes on cross validation techniques
1. HoldOut validation approach -Train_test_split. Changing the random_state changes the accuracy score and may lead to 
overfitting and underfitting
2. KFold cross validation : suppose 10 splits you decide to take, 10 splits of training and testing data will be taken
and average mean of the splits taken. Prior to train_test_split which changes on every alter on random_state, the latter
will take the mean of all the values received on every change of the random_state

3. Stratified cross validation: used mostly when the dataset is imbalanced. It makes sure that your dataset is taken 
in almost similar proportion i.e target(dependent) and features(independent)

In [57]:
# KFold cross validation
from sklearn.model_selection import KFold
model = RandomForestClassifier()
KFold_validation = KFold(10)

from sklearn.model_selection import cross_val_score
import numpy as np
results = cross_val_score(model, X, y, cv=KFold_validation)
print(results)
print(np.mean(results))

[0.83870968 0.83870968 0.80645161 0.76666667 0.86666667 0.83333333
 0.73333333 0.83333333 0.73333333 0.6       ]
0.7850537634408602


In [58]:
# StratifiedKFold cross validation
from sklearn.model_selection import StratifiedKFold
skfold = StratifiedKFold(n_splits = 5)
model = RandomForestClassifier()

scores = cross_val_score(model, X, y, cv=skfold)
print(scores)
print(np.mean(scores))

[0.81967213 0.85245902 0.80327869 0.8        0.75      ]
0.8050819672131148


### NUMBER 2: HYPERPAREMETER TUNING
      1. GridSearchCV
      2. RandomizedSearchCV
      
      Types of parameters
      - model parameters : parameters of the model that can be determined by training with training data. They can be
      considered as Internal Parameters. Examples are weights and bias. The model controls these parameters and finds the best
      out of it
       
       Y = m * X + b : m = slope, b = intercept.... m and b are the model parameters and the best values of it are determined
      - Hyperparameters: parameters whose values control the learning process. They are adjustable parameters used to obtain 
      an optimal model. External Parameters. We control the hyperparameters. Examples: learning rate, Number of Epochs/iterations, n_estimators (number of decision trees you want in your model) - Random Forest
      
      
      ---MACHINE LEARNING----
      - hYPERparameter tuning (best hyperparameters) :
            .Refers to the process of choosing the optimum set of hyperparameters for a Machine Learning Model.
            .The process is also called Hyperparameter Optimization
            
            Types : GridSearchCV and RandomizedSearchCV
                    .GridSearchCV : check a particular hyperparameter value and check which is giving us the highest accuracy
                                 -> It finds the accuracy for all the different combinations of hyperparameter values
                    .RandomizedSearchCV : does not calculate performance of the model for different combinations of           Hyperparameters. It only selects randomly few parameters and find which is the best value
                                 -> Takes some random hyperparameter values and see which combination of values we are getting
                                  high accuracy
      - Model training (best model parameters)

### Breast cancer dataset from sklearn

In [None]:
# All sklearn datasets are divided into data (all feature variables that help the model learn) and target (which is the actual label)
# feature_names are the names of the feature variable: names of the columns in data
# target_names is the name(s) of the: name of the target column



# ------EXAMPLE-------

# from sklearn import datasets

# the load below does not return tabular data. Rather it returns a bunch
# data = datasets.load_breast_cancer()

# import pandas as pd

# Read the DataFrame, first using the feature data
# df = pd.DataFrame(data.data, columns=data.feature_names)

# Add a target column, and fill it with the target data
# df['target'] = data.target

# df.head()


In [21]:
# load the data from sklearn
# 1 - Benign
# 0 - Malignant
breast_cancer_dataset = sklearn.datasets.load_breast_cancer()

In [22]:
# read data
print(breast_cancer_dataset)

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

In [23]:
# loading data to a dataframe
data_frame = pd.DataFrame(breast_cancer_dataset.data, columns = breast_cancer_dataset.feature_names)
data_frame.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [24]:
# adding the target column to the dataframe
data_frame['label'] = breast_cancer_dataset.target

In [25]:
# recheck the dataset again
data_frame.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [26]:
# check rows and columns
data_frame.shape

(569, 31)

In [27]:
# checking for missing values
data_frame.isna().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
label                      0
dtype: int64

In [28]:
# check distribution of target variable
print(data_frame.label.unique())
print(data_frame.label.value_counts())

[0 1]
1    357
0    212
Name: label, dtype: int64


In [29]:
# seperating the features and targets

# feature variables
X = data_frame.drop(columns='label', axis=1)

# label
y = data_frame['label']

In [30]:
# convert the above dataframes into numpy arrays
X = np.asarray(X)
y = np.asarray(y)

### GridSearchCV
   - used for determining the best parameters for our models
   - we'll use Support Vector Classifier Model for the tasks

In [31]:
# load the svc model
model = SVC()

In [32]:
# Hyperparameters
parameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C' : [1, 5, 10, 20]
}

In [33]:
# grid search - does the splitting of data to train and test in the background
classifier = GridSearchCV(model, parameters, cv=5)

In [34]:
# fitting the data to our model
classifier.fit(X, y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 5, 10, 20],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [35]:
classifier.cv_results_

{'mean_fit_time': array([1.28276463e+00, 3.20072174e-03, 4.99181747e-03, 1.21100903e-02,
        2.44899449e+00, 3.12418938e-03, 6.24942780e-03, 6.24961853e-03,
        3.50057569e+00, 0.00000000e+00, 6.24966621e-03, 1.56238079e-02,
        6.46940823e+00, 4.19416428e-03, 4.19721603e-03, 1.09906673e-02]),
 'std_fit_time': array([5.88804018e-01, 3.99171387e-04, 6.58898547e-06, 8.46580159e-03,
        4.70833830e-01, 6.24837875e-03, 7.65395464e-03, 7.65418825e-03,
        5.89943978e-01, 0.00000000e+00, 7.65424665e-03, 2.13248060e-07,
        1.93078216e+00, 7.44861594e-04, 3.99541926e-04, 6.23684272e-04]),
 'mean_score_time': array([0.0004056 , 0.00119667, 0.00220137, 0.00239844, 0.00020018,
        0.0031249 , 0.        , 0.00312495, 0.003125  , 0.00312524,
        0.        , 0.        , 0.00040021, 0.00059967, 0.00159945,
        0.00239844]),
 'std_score_time': array([0.00049676, 0.00039396, 0.0003989 , 0.00119922, 0.00040035,
        0.00624981, 0.        , 0.0062499 , 0.00625   , 

In [63]:
# Best parameters
best_parameters = classifier.best_params_
print(best_parameters)

{'C': 10, 'kernel': 'linear'}


In [64]:
# highest accuracy
highest_accuracy = classifier.best_score_
print(highest_accuracy)

0.9525694767893185


In [67]:
# loading the results to pandas dataframe
result = pd.DataFrame(classifier.cv_results_)
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.575193,0.70865,0.001205,0.000399,1,linear,"{'C': 1, 'kernel': 'linear'}",0.947368,0.929825,0.973684,0.921053,0.955752,0.945536,0.018689,4
1,0.006595,0.005745,0.001203,0.000399,1,poly,"{'C': 1, 'kernel': 'poly'}",0.842105,0.885965,0.929825,0.947368,0.938053,0.908663,0.039382,12
2,0.007196,0.002638,0.004598,0.002331,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.850877,0.894737,0.929825,0.947368,0.938053,0.912172,0.035444,11
3,0.023806,0.010497,0.003597,0.000489,1,sigmoid,"{'C': 1, 'kernel': 'sigmoid'}",0.54386,0.45614,0.464912,0.385965,0.451327,0.460441,0.050253,13
4,3.026274,0.664478,0.0004,0.000489,5,linear,"{'C': 5, 'kernel': 'linear'}",0.947368,0.938596,0.973684,0.929825,0.964602,0.950815,0.016216,2


In [68]:
grid_search_result = result[['param_C', 'param_kernel', 'mean_test_score']]
grid_search_result

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.945536
1,1,poly,0.908663
2,1,rbf,0.912172
3,1,sigmoid,0.460441
4,5,linear,0.950815
5,5,poly,0.922729
6,5,rbf,0.931501
7,5,sigmoid,0.411178
8,10,linear,0.952569
9,10,poly,0.920975


### RandomizedSearchCV

In [69]:
# loading SVC model
model = SVC()

In [70]:
# Hyperparameters
parameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C' : [1, 5, 10, 20]
}

In [71]:
# Randomized search
classifier = RandomizedSearchCV(model, parameters, cv=5)

In [72]:
# fit our data to our model
classifier.fit(X, y)

RandomizedSearchCV(cv=5, estimator=SVC(),
                   param_distributions={'C': [1, 5, 10, 20],
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']})

In [73]:
# print the classifier results/cross validation results
classifier.cv_results_

{'mean_fit_time': array([0.05476685, 0.00579596, 3.20612097, 0.01519241, 0.00399799,
        0.00479684, 0.0047956 , 0.00834093, 0.01228509, 1.63752623]),
 'std_fit_time': array([9.25655918e-02, 1.83115019e-03, 8.48961436e-01, 5.07117744e-03,
        1.19113884e-06, 1.32681367e-03, 1.16586264e-03, 4.75746093e-03,
        1.16418940e-03, 6.62028887e-01]),
 'mean_score_time': array([0.00699511, 0.00139928, 0.00080023, 0.00539613, 0.00240035,
        0.00220032, 0.00220113, 0.00219784, 0.00415587, 0.00100021]),
 'std_score_time': array([0.00460176, 0.0004892 , 0.00040011, 0.0023342 , 0.00280263,
        0.00040116, 0.00098409, 0.00040026, 0.0033435 , 0.00063226]),
 'param_kernel': masked_array(data=['rbf', 'poly', 'linear', 'sigmoid', 'poly', 'rbf',
                    'rbf', 'rbf', 'sigmoid', 'linear'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array

In [74]:
# print the best parameters that are determined by RandomizedSearchCV
best_paramaters = classifier.best_params_
print(best_parameters)

{'C': 10, 'kernel': 'linear'}


In [75]:
# print highest accuracy achieved
best_accuracy = classifier.best_score_
best_accuracy

0.9508150908244062

In [76]:
result = pd.DataFrame(classifier.cv_results_)
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.054767,0.092566,0.006995,0.004602,rbf,10,"{'kernel': 'rbf', 'C': 10}",0.877193,0.921053,0.912281,0.95614,0.946903,0.922714,0.027879,4
1,0.005796,0.001831,0.001399,0.000489,poly,10,"{'kernel': 'poly', 'C': 10}",0.885965,0.921053,0.903509,0.938596,0.955752,0.920975,0.024701,5
2,3.206121,0.848961,0.0008,0.0004,linear,5,"{'kernel': 'linear', 'C': 5}",0.947368,0.938596,0.973684,0.929825,0.964602,0.950815,0.016216,1
3,0.015192,0.005071,0.005396,0.002334,sigmoid,10,"{'kernel': 'sigmoid', 'C': 10}",0.482456,0.403509,0.421053,0.342105,0.362832,0.402391,0.048906,10
4,0.003998,1e-06,0.0024,0.002803,poly,20,"{'kernel': 'poly', 'C': 20}",0.877193,0.921053,0.903509,0.938596,0.955752,0.919221,0.0273,7


In [77]:
randomized_search_result = result[['param_C', 'param_kernel', 'mean_test_score']]
randomized_search_result

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,rbf,0.922714
1,10,poly,0.920975
2,5,linear,0.950815
3,10,sigmoid,0.402391
4,20,poly,0.919221
5,20,rbf,0.920944
6,5,rbf,0.931501
7,1,rbf,0.912172
8,5,sigmoid,0.411178
9,1,linear,0.945536


### Model Selection
------process of choosing the best suited model for a particular problem
------selecting a model depends on various factors such as dataset, task, nature of model etc

TWO FACTOS TO BE CONSIDERED:
    - Logical reason to select a model
    - Comparing the performance of the model
MODELS CAN BE SELECTED BASED ON:

    1. Type of data available
       - Images and videos : CNN (Convolutional neural network) which is a type of artificial neural network used in image
                             recognition and processing specifically designed to process pixel data
                             
       - Text data or Speech data : RNN (Recurrent Neural Network) which uses sequential data or time series data
       
                      -----------THE ABOVE TWO ARE FOR DEEP LEARNING ----------------------------
                      
       - Numerical data : SVM, Logistic Regression, Decision trees etc -- MACHINE LEARNING MODELS --
       
       
    2. Based on the task we need to carry out
       -  Classification tasks : SVM, Logistic Regression, Decision Trees, RandomForestClassifier etc 
            . Logistic Regression : (used for binary classification problems): Easy to implement, performs well on data 
                                  with linear relationships, less prone to overfitting for low dimensional datasets
                                  
                                  -High dimensional datasets causes overfitting, difficult to capture complex relationships
                                   in a dataset, sensitive to outliers, Needs a larger dataset
                                   
            . Decision tree : can be used for both classification and regression, easy to interpret, no need for 
                             normalization or scaling, not sensitive to outliers
                             
                             : overfitting issue, instability, training time is relatively high
            
            
            . SVM : used when you know several number of columns in your datasets
         
       - Regression tasks : Linear Regression, Random Forest, Polynomial Regression
                     . Linear Regression : performs well on data with linear relationships, It's sensitive to 
                                          to outliers, underfitting issue
       
       - Clustering tasks : K-Means Clustering, Hierarchical clustering  (UNSUPERVISED LEARNING TECHNIQUES)

In [78]:
# Comparing the models with default hyperparameter values using Cross validation

In [79]:
# list of models
# max_iter used because we not standardizing the data
model = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier(random_state=0)]

In [80]:
def compare_models_using_cross_validation():
    
    for model in models:
        
        cv_score = cross_val_score(model, X, y, cv=5)
        
        mean_accuracy = np.mean(cv_score)*100
        
        mean_accuracy = round(mean_accuracy, 2)
        
        print(f'Cross Validation for {model} = {cv_score}')
        print(f'Accuracy score of {model} : {mean_accuracy}')
        print('-----------------------------------------------')

In [82]:
compare_models_using_cross_validation()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross Validation for LogisticRegression(max_iter=1000) = [0.93859649 0.93859649 0.97368421 0.94736842 0.96460177]
Accuracy score of LogisticRegression(max_iter=1000) : 95.26
-----------------------------------------------
Cross Validation for SVC(kernel='linear') = [0.94736842 0.92982456 0.97368421 0.92105263 0.95575221]
Accuracy score of SVC(kernel='linear') : 94.55
-----------------------------------------------
Cross Validation for KNeighborsClassifier() = [0.88596491 0.93859649 0.93859649 0.94736842 0.92920354]
Accuracy score of KNeighborsClassifier() : 92.79
-----------------------------------------------
Cross Validation for RandomForestClassifier() = [0.93859649 0.94736842 0.98245614 0.97368421 0.97345133]
Accuracy score of RandomForestClassifier() : 96.31
-----------------------------------------------
