In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, make_scorer, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform
from scipy import stats

In [2]:
file_path = './data./Data Set for Chapter - Sheet1.csv'
df = pd.read_csv(file_path)
df.dropna(inplace=True)

In [3]:
df

Unnamed: 0,S.no,Age,Partners,Protection,Symptoms,Location,Education,Testing,STD Status
0,1,45.0,4,0,0,2,1,1,1
1,2,43.0,4,0,0,2,1,1,1
2,3,39.0,5,0,0,3,1,1,1
3,4,35.0,5,2,0,3,1,0,1
4,5,32.0,1,1,1,2,1,0,0
...,...,...,...,...,...,...,...,...,...
540,541,36.0,2,0,1,2,0,1,1
541,542,45.0,4,0,0,2,0,1,1
542,543,43.0,4,0,0,3,0,0,0
543,544,39.0,5,0,0,3,0,0,0


for protection usage:
0: never
1: sometimes
2: always

for STD testing history:
0: no
1: yes


In [4]:

x = df.loc[:,'Age':'Testing']
y = df['STD Status']

scaler = StandardScaler()
x = scaler.fit_transform(x)
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=41)
X_cv,y_cv = X_test[:int(len(X_test)/2)],y_test[:int(len(y_test)/2)]
X_test,y_test = X_test[int(len(X_test)/2):],y_test[int(len(y_test)/2):]
print("length of training set: ", len(X_train))
print("length of test set: ", len(X_test))
print("length of cv set: ", len(X_cv))

length of training set:  380
length of test set:  82
length of cv set:  82


In [5]:
param_grid = {'C': [0.1,0.3,0.05,0.001,0.01,1,3,5, 10, 50,75,100, 1000],
              'gamma': [1, 0.1, 0.01, 0.05,0.001, 0.0001,0.02,0.009,0.008,0.03],
              'kernel': ['rbf','linear'],
              }

rbf_linear_grid = GridSearchCV(SVC(), param_grid,verbose=10)

# fitting the model for grid search
rbf_linear_grid.fit(X_train, y_train)
print("\n \n")
print(rbf_linear_grid.best_params_)


Fitting 5 folds for each of 260 candidates, totalling 1300 fits
[CV 1/5; 1/260] START C=0.1, gamma=1, kernel=rbf................................
[CV 1/5; 1/260] END .C=0.1, gamma=1, kernel=rbf;, score=0.632 total time=   0.0s
[CV 2/5; 1/260] START C=0.1, gamma=1, kernel=rbf................................
[CV 2/5; 1/260] END .C=0.1, gamma=1, kernel=rbf;, score=0.632 total time=   0.0s
[CV 3/5; 1/260] START C=0.1, gamma=1, kernel=rbf................................
[CV 3/5; 1/260] END .C=0.1, gamma=1, kernel=rbf;, score=0.645 total time=   0.0s
[CV 4/5; 1/260] START C=0.1, gamma=1, kernel=rbf................................
[CV 4/5; 1/260] END .C=0.1, gamma=1, kernel=rbf;, score=0.658 total time=   0.0s
[CV 5/5; 1/260] START C=0.1, gamma=1, kernel=rbf................................
[CV 5/5; 1/260] END .C=0.1, gamma=1, kernel=rbf;, score=0.671 total time=   0.0s
[CV 1/5; 2/260] START C=0.1, gamma=1, kernel=linear.............................
[CV 1/5; 2/260] END C=0.1, gamma=1, kernel=li

In [6]:
param_grid = {'C': [0.001,0.01,1,5,10,100,500,300,600,250,750],
              'kernel': ['poly',],
              'degree': [1,3,4,5,6,7,8,9,10],}

poly_grid = GridSearchCV(SVC(), param_grid,verbose=10)

# fitting the model for grid search
poly_grid.fit(X_train, y_train)
print("\n \n")
print(poly_grid.best_params_)
print(poly_grid.best_estimator_)

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV 1/5; 1/99] START C=0.001, degree=1, kernel=poly.............................
[CV 1/5; 1/99] END C=0.001, degree=1, kernel=poly;, score=0.513 total time=   0.0s
[CV 2/5; 1/99] START C=0.001, degree=1, kernel=poly.............................
[CV 2/5; 1/99] END C=0.001, degree=1, kernel=poly;, score=0.526 total time=   0.0s
[CV 3/5; 1/99] START C=0.001, degree=1, kernel=poly.............................
[CV 3/5; 1/99] END C=0.001, degree=1, kernel=poly;, score=0.526 total time=   0.0s
[CV 4/5; 1/99] START C=0.001, degree=1, kernel=poly.............................
[CV 4/5; 1/99] END C=0.001, degree=1, kernel=poly;, score=0.526 total time=   0.0s
[CV 5/5; 1/99] START C=0.001, degree=1, kernel=poly.............................
[CV 5/5; 1/99] END C=0.001, degree=1, kernel=poly;, score=0.526 total time=   0.0s
[CV 1/5; 2/99] START C=0.001, degree=3, kernel=poly.............................
[CV 1/5; 2/99] END C=0.001, degree=3,

In [7]:
print("rbf and linear kernel grid")
rbf_linear_grid_predictions = rbf_linear_grid.predict(X_test)
# print classification report
print(classification_report(y_test, rbf_linear_grid_predictions))

rbf and linear kernel grid
              precision    recall  f1-score   support

           0       0.76      0.91      0.83        34
           1       0.93      0.79      0.85        48

    accuracy                           0.84        82
   macro avg       0.84      0.85      0.84        82
weighted avg       0.86      0.84      0.84        82



In [8]:
print("polynomial kernel grid")
polynomial_grid_predictions = poly_grid.predict(X_test)

# print classification report
print(classification_report(y_test,polynomial_grid_predictions))

polynomial kernel grid
              precision    recall  f1-score   support

           0       0.79      0.88      0.83        34
           1       0.91      0.83      0.87        48

    accuracy                           0.85        82
   macro avg       0.85      0.86      0.85        82
weighted avg       0.86      0.85      0.85        82



In [9]:
auc = make_scorer(roc_auc_score)
rand_params = {'C': [0.1,0.3,0.05,0.001,0.01,1, 10, 100, 80,90,70],
            'gamma': [1, 0.1, 0.01,0.009 ,0.02,0.03,0.001, 0.0001,5,10,15],
            'kernel': ['linear', 'poly', 'rbf']
           }

rand_search = RandomizedSearchCV(SVC(), param_distributions = rand_params, n_iter = 30)
rand_search.fit(X_train, y_train)
print(rand_search.best_params_)



{'gamma': 0.01, 'C': 100}


In [10]:
print(rand_search.best_estimator_)

SVC(C=100, gamma=0.01)


In [11]:
print("random search")
rand_search_predictions = rand_search.predict(X_test)

# print classification report
print(classification_report(y_test, rand_search_predictions))

random search
              precision    recall  f1-score   support

           0       0.76      0.91      0.83        34
           1       0.93      0.79      0.85        48

    accuracy                           0.84        82
   macro avg       0.84      0.85      0.84        82
weighted avg       0.86      0.84      0.84        82



In [12]:
#check accuracies
print("accuracy scores:")
print("polynomial grid: ", accuracy_score(y_test, polynomial_grid_predictions))
print("rbf and linear grid: ", accuracy_score(y_test, rbf_linear_grid_predictions))
print("random search: ", accuracy_score(y_test, rand_search_predictions))

accuracy scores:
polynomial grid:  0.8536585365853658
rbf and linear grid:  0.8414634146341463
random search:  0.8414634146341463
