# importing the dependencies


In [1]:
import numpy as np
import pandas as pd
import sklearn.datasets as data
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Data collection and Data preprocessing

In [2]:
#loading the data from sklearn
breast_cancer_dataset=data.load_breast_cancer()

In [3]:
#first 5 rows
print(breast_cancer_dataset)

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

In [4]:
#loading the data to a dataframe
data_frame=pd.DataFrame(breast_cancer_dataset.data,columns=breast_cancer_dataset.feature_names)
data_frame.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
#adding target label to the data frame
data_frame['label']=breast_cancer_dataset.target
data_frame.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
#number of rows and columns in this dataset
data_frame.shape

(569, 31)

In [7]:
#checking the missing values
data_frame.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
label                      0
dtype: int64

In [8]:
#checking the distribution of target variable
data_frame['label'].value_counts()

1    357
0    212
Name: label, dtype: int64

 1--> Benign

 0--> Malignant

# Separating the features and target

In [9]:
x=data_frame.drop(columns='label',axis=1)
y=data_frame['label']
print(x)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [10]:
print(y)

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: label, Length: 569, dtype: int64


In [11]:
x=np.asarray(x)
y=np.asarray(y)

# GridSearchCV

 GridSearchCV is used for determining the best parameters for our model

In [12]:
#loading the SVC model
model=SVC()

In [13]:
#hyperparameters
parameters={
    'kernel':['linear','poly','rbf','sigmoid'],
    'C':[1,5,10,20]
}

In [14]:
# grid search
classifier=GridSearchCV(model,parameters,cv=5)

In [15]:
# fitting the data to our model
classifier.fit(x,y)

In [16]:
classifier.cv_results_

{'mean_fit_time': array([2.70809126e+00, 6.04496002e-03, 1.34174824e-02, 2.89460182e-02,
        4.03606048e+00, 3.84635925e-03, 4.12011147e-03, 1.41855240e-02,
        4.50502868e+00, 4.32138443e-03, 4.02255058e-03, 1.39637470e-02,
        7.17340803e+00, 4.20913696e-03, 4.23812866e-03, 1.38592243e-02]),
 'std_fit_time': array([1.09586157e+00, 4.60233720e-04, 5.39302222e-03, 6.38438951e-03,
        2.58492205e+00, 2.73959667e-04, 9.79304422e-05, 5.42552007e-04,
        8.60427186e-01, 2.60576364e-04, 1.12495315e-04, 6.07033918e-04,
        2.16937573e+00, 2.23984813e-04, 5.58332312e-04, 6.18917630e-04]),
 'mean_score_time': array([0.00156779, 0.00196342, 0.00603442, 0.00584817, 0.00106163,
        0.00107961, 0.00148067, 0.00393729, 0.0011189 , 0.00106664,
        0.00137959, 0.00344391, 0.00105281, 0.00098205, 0.00136719,
        0.00371709]),
 'std_score_time': array([1.63473826e-04, 9.72804367e-05, 3.84540403e-03, 4.17924906e-04,
        3.22104824e-05, 4.02963795e-05, 6.15477643e-

In [17]:
#best parameters
best_parameters=classifier.best_params_
print(best_parameters)

{'C': 10, 'kernel': 'linear'}


In [18]:
#highest Accuracy
high_acc=classifier.best_score_
print(high_acc)

0.9525694767893185


In [19]:
#loading the result to pandas Dataframe
result=pd.DataFrame(classifier.cv_results_)
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.708091,1.095862,0.001568,0.000163,1,linear,"{'C': 1, 'kernel': 'linear'}",0.947368,0.929825,0.973684,0.921053,0.955752,0.945536,0.018689,4
1,0.006045,0.00046,0.001963,9.7e-05,1,poly,"{'C': 1, 'kernel': 'poly'}",0.842105,0.885965,0.929825,0.947368,0.938053,0.908663,0.039382,12
2,0.013417,0.005393,0.006034,0.003845,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.850877,0.894737,0.929825,0.947368,0.938053,0.912172,0.035444,11
3,0.028946,0.006384,0.005848,0.000418,1,sigmoid,"{'C': 1, 'kernel': 'sigmoid'}",0.54386,0.45614,0.464912,0.385965,0.451327,0.460441,0.050253,13
4,4.03606,2.584922,0.001062,3.2e-05,5,linear,"{'C': 5, 'kernel': 'linear'}",0.947368,0.938596,0.973684,0.929825,0.964602,0.950815,0.016216,2


In [20]:
grid_search_result=result[['param_C','param_kernel','mean_test_score']]

In [21]:
grid_search_result

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.945536
1,1,poly,0.908663
2,1,rbf,0.912172
3,1,sigmoid,0.460441
4,5,linear,0.950815
5,5,poly,0.922729
6,5,rbf,0.931501
7,5,sigmoid,0.411178
8,10,linear,0.952569
9,10,poly,0.920975


GridSearchCV:

Highest Accuracy = 95.2%

Best parameters = {'C' :10 ,'kernel' :'linear' }

# RandomizedSearchCV

In [22]:
# Random search
classifier=RandomizedSearchCV(model,parameters,cv=5)

In [23]:
#fitting the data to our model
classifier.fit(x,y)

In [24]:
classifier.cv_results_

{'mean_fit_time': array([3.52611542e-03, 1.38774872e-02, 4.51421266e+00, 4.37793732e-03,
        3.76973152e-03, 2.89400439e+00, 4.02288437e-03, 4.01601791e-03,
        3.94101143e-03, 1.47422791e-02]),
 'std_fit_time': array([7.37429243e-05, 1.10442696e-03, 9.70034823e-01, 5.57336148e-04,
        1.82523187e-04, 7.55961622e-01, 1.51664383e-04, 1.81525625e-04,
        1.25601876e-04, 2.75989195e-04]),
 'mean_score_time': array([0.0010675 , 0.00329189, 0.00107508, 0.00097275, 0.00129619,
        0.00108294, 0.00144916, 0.00136013, 0.00103755, 0.00377083]),
 'std_score_time': array([1.91024322e-05, 4.09581747e-05, 2.76180525e-05, 5.41528323e-05,
        6.53598520e-05, 3.72862266e-05, 4.52587044e-05, 4.06544996e-05,
        8.93967813e-05, 5.82368310e-05]),
 'param_kernel': masked_array(data=['poly', 'sigmoid', 'linear', 'poly', 'rbf', 'linear',
                    'rbf', 'rbf', 'poly', 'sigmoid'],
              mask=[False, False, False, False, False, False, False, False,
              

In [25]:
#best parameters
best_para=classifier.best_params_
print(best_para)

{'kernel': 'linear', 'C': 10}


In [26]:
#highest accuracy
high_acc=classifier.best_score_
print(high_acc)

0.9525694767893185


In [27]:
#loading to dataframe
result=pd.DataFrame(classifier.cv_results_)

In [28]:
result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003526,7.4e-05,0.001067,1.9e-05,poly,1,"{'kernel': 'poly', 'C': 1}",0.842105,0.885965,0.929825,0.947368,0.938053,0.908663,0.039382,8
1,0.013877,0.001104,0.003292,4.1e-05,sigmoid,20,"{'kernel': 'sigmoid', 'C': 20}",0.473684,0.403509,0.421053,0.342105,0.353982,0.398867,0.04764,10
2,4.514213,0.970035,0.001075,2.8e-05,linear,10,"{'kernel': 'linear', 'C': 10}",0.938596,0.938596,0.973684,0.947368,0.964602,0.952569,0.0142,1
3,0.004378,0.000557,0.000973,5.4e-05,poly,20,"{'kernel': 'poly', 'C': 20}",0.877193,0.921053,0.903509,0.938596,0.955752,0.919221,0.0273,7
4,0.00377,0.000183,0.001296,6.5e-05,rbf,20,"{'kernel': 'rbf', 'C': 20}",0.877193,0.921053,0.921053,0.947368,0.938053,0.920944,0.024105,6


In [29]:
random_search_result=result[['param_C','param_kernel','mean_test_score']]

In [30]:
random_search_result

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,poly,0.908663
1,20,sigmoid,0.398867
2,10,linear,0.952569
3,20,poly,0.919221
4,20,rbf,0.920944
5,5,linear,0.950815
6,5,rbf,0.931501
7,10,rbf,0.922714
8,10,poly,0.920975
9,1,sigmoid,0.460441


RandomizedSearchCV:
Highest Accuracy = 95.2%
Best Parameters = { 'C' :10 ,'kernel' : 'linear'}

# splitting the data into training and testing data

In [31]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [32]:
print(x.shape,x_train.shape,x_test.shape)

(569, 30) (455, 30) (114, 30)


# Support Vector classifier

In [33]:
model=SVC(kernel='linear')

In [34]:
model.fit(x_train,y_train)

In [35]:
from sklearn.metrics import accuracy_score

# model evaluation

### evaluating on training data

In [36]:
training_data_pred=model.predict(x_train)
training_acc=accuracy_score(y_train,training_data_pred)
print('Accuracy score of training data =',training_acc)

Accuracy score of training data = 0.9626373626373627


### evaluating on test data

In [37]:
# testing the accuracy on test data
test_data_pred=model.predict(x_test)
test_acc=accuracy_score(y_test,test_data_pred)
print('Accuracy score of test data =',test_acc)

Accuracy score of test data = 0.9736842105263158
