In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer

In [2]:
cancer = load_breast_cancer()

In [4]:
print ('Dataset Keys:',cancer.keys())

Dataset Keys: dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [7]:
print ('Dataset Description: \n',cancer['DESCR'])

Dataset Description: 
 .. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, fi

In [8]:
print ('Building Dataframe from cancer dataset:')
data_set = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
print (data_set.head())

Building Dataframe from cancer dataset:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension           ...             

In [11]:
print ('Framing result df:')
result = cancer['target']
print(result)

Framing result df:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1

In [12]:
from sklearn.model_selection import train_test_split
print ('Splitting Train & Test dataset:')
X_train,X_test,y_train,y_test = train_test_split(data_set,result,test_size=0.33,random_state=42)

Splitting Train & Test dataset:


In [13]:
from sklearn.svm import SVC
print ('Creating and fitting SVM:')
model = SVC()
print (model.fit(X_train,y_train))

Creating and fitting SVM:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)




In [14]:
predictions = model.predict(X_test)

In [15]:
from sklearn.metrics import classification_report,confusion_matrix

print (confusion_matrix(y_test,predictions))
print (classification_report(y_test,predictions))

[[  0  67]
 [  0 121]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        67
           1       0.64      1.00      0.78       121

   micro avg       0.64      0.64      0.64       188
   macro avg       0.32      0.50      0.39       188
weighted avg       0.41      0.64      0.50       188



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [16]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.001,0.01,0.1,1,10,100,1000],'gamma':[1,0.1,0.01,0.001,0.0001]}

In [17]:
grid = GridSearchCV(SVC(),param_grid,verbose=3)

In [18]:
grid.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 35 candidates, totalling 105 fits
[CV] C=0.001, gamma=1 ................................................
[CV] ................ C=0.001, gamma=1, score=0.6171875, total=   0.0s
[CV] C=0.001, gamma=1 ................................................
[CV] ....... C=0.001, gamma=1, score=0.6220472440944882, total=   0.0s
[CV] C=0.001, gamma=1 ................................................
[CV] ....... C=0.001, gamma=1, score=0.6190476190476191, total=   0.0s
[CV] C=0.001, gamma=0.1 ..............................................
[CV] .............. C=0.001, gamma=0.1, score=0.6171875, total=   0.0s
[CV] C=0.001, gamma=0.1 ..............................................
[CV] ..... C=0.001, gamma=0.1, score=0.6220472440944882, total=   0.0s
[CV] C=0.001, gamma=0.1 ..............................................
[CV] ..... C=0.001, gamma=0.1, score=0.6190476190476191, total=   0.0s
[CV] C=0.001, gamma=0.01 .............................................
[CV] ..........

[CV] ................... C=10, gamma=1, score=0.6171875, total=   0.0s
[CV] C=10, gamma=1 ...................................................
[CV] .......... C=10, gamma=1, score=0.6220472440944882, total=   0.0s
[CV] C=10, gamma=1 ...................................................
[CV] .......... C=10, gamma=1, score=0.6190476190476191, total=   0.0s
[CV] C=10, gamma=0.1 .................................................
[CV] ................. C=10, gamma=0.1, score=0.6171875, total=   0.0s
[CV] C=10, gamma=0.1 .................................................
[CV] ........ C=10, gamma=0.1, score=0.6220472440944882, total=   0.0s
[CV] C=10, gamma=0.1 .................................................
[CV] ........ C=10, gamma=0.1, score=0.6190476190476191, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................ C=10, gamma=0.01, score=0.6171875, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done 105 out of 105 | elapsed:    2.0s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [19]:
print (grid.best_params_)
print (grid.best_estimator_)

{'C': 10, 'gamma': 0.0001}
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [20]:
grid_predictions = grid.predict(X_test)

In [21]:
print (confusion_matrix(y_test,grid_predictions))
print (classification_report(y_test,grid_predictions))

[[ 63   4]
 [  3 118]]
              precision    recall  f1-score   support

           0       0.95      0.94      0.95        67
           1       0.97      0.98      0.97       121

   micro avg       0.96      0.96      0.96       188
   macro avg       0.96      0.96      0.96       188
weighted avg       0.96      0.96      0.96       188



In [None]:
#C= Soft Margin Cost
"""
A standard SVM seeks to find a margin that separates all positive and negative examples.
However, this can lead to poorly fit models if any examples are mislabeled or extremely unusual. 
To account for this, in 1995, Cortes and Vapnik proposed the idea of a "soft margin" SVM that allows 
some examples to be "ignored" or placed on the wrong side of the margin; this innovation often leads
to a better overall fit. C is the parameter for the soft margin cost function, 
which controls the influence of each individual support vector; this process involves trading error penalty for stability.
"""
#Gamma = Distance between Points - Optimal Point
"""
large gamma leads to high bias and low variance models, and vice-versa.
"""

#Kernel Trick - Expanding Data to Multiple Dimensions
"""
 The idea is mapping the non-linear separable data-set into a higher dimensional space where we can find a 
 hyperplane that can separate the samples.
"""
#Hyperplane - Line between two border lines
#Support Vectors - Border points of each label