## Classification using Support Vector Machine (SVM)


### Required libraries

In [120]:
# Data manipulation
import pandas as pd

# Model
from sklearn import svm
from sklearn.model_selection import train_test_split

# Model evaluation
from sklearn import metrics

# Resampling - Cross validation
from sklearn.model_selection import KFold

### Interactive shell requirements

In [122]:
# Make notebook interactive
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Import Data

In [58]:
# Read file and convert to a Dataframe
data_orig = pd.read_csv('/Users/test/Documents/git_workspace/object_classification_sonar_data/sonar.all-data.csv')
print("Data structure type: ", type(data_orig))

Data structure type:  <class 'pandas.core.frame.DataFrame'>


### Explore Data

In [59]:
# Explore few data set records
data_orig.head(20)

Unnamed: 0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
0,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
1,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
2,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
3,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R
4,0.0286,0.0453,0.0277,0.0174,0.0384,0.099,0.1201,0.1833,0.2105,0.3039,...,0.0045,0.0014,0.0038,0.0013,0.0089,0.0057,0.0027,0.0051,0.0062,R
5,0.0317,0.0956,0.1321,0.1408,0.1674,0.171,0.0731,0.1401,0.2083,0.3513,...,0.0201,0.0248,0.0131,0.007,0.0138,0.0092,0.0143,0.0036,0.0103,R
6,0.0519,0.0548,0.0842,0.0319,0.1158,0.0922,0.1027,0.0613,0.1465,0.2838,...,0.0081,0.012,0.0045,0.0121,0.0097,0.0085,0.0047,0.0048,0.0053,R
7,0.0223,0.0375,0.0484,0.0475,0.0647,0.0591,0.0753,0.0098,0.0684,0.1487,...,0.0145,0.0128,0.0145,0.0058,0.0049,0.0065,0.0093,0.0059,0.0022,R
8,0.0164,0.0173,0.0347,0.007,0.0187,0.0671,0.1056,0.0697,0.0962,0.0251,...,0.009,0.0223,0.0179,0.0084,0.0068,0.0032,0.0035,0.0056,0.004,R
9,0.0039,0.0063,0.0152,0.0336,0.031,0.0284,0.0396,0.0272,0.0323,0.0452,...,0.0062,0.012,0.0052,0.0056,0.0093,0.0042,0.0003,0.0053,0.0036,R


In [62]:
# Create dummy value for the dependent variable (prediction feature)
data = pd.get_dummies(data_orig)
data.head(20)

Unnamed: 0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R_M,R_R
0,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,0,1
1,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,0,1
2,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,0,1
3,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,0,1
4,0.0286,0.0453,0.0277,0.0174,0.0384,0.099,0.1201,0.1833,0.2105,0.3039,...,0.0014,0.0038,0.0013,0.0089,0.0057,0.0027,0.0051,0.0062,0,1
5,0.0317,0.0956,0.1321,0.1408,0.1674,0.171,0.0731,0.1401,0.2083,0.3513,...,0.0248,0.0131,0.007,0.0138,0.0092,0.0143,0.0036,0.0103,0,1
6,0.0519,0.0548,0.0842,0.0319,0.1158,0.0922,0.1027,0.0613,0.1465,0.2838,...,0.012,0.0045,0.0121,0.0097,0.0085,0.0047,0.0048,0.0053,0,1
7,0.0223,0.0375,0.0484,0.0475,0.0647,0.0591,0.0753,0.0098,0.0684,0.1487,...,0.0128,0.0145,0.0058,0.0049,0.0065,0.0093,0.0059,0.0022,0,1
8,0.0164,0.0173,0.0347,0.007,0.0187,0.0671,0.1056,0.0697,0.0962,0.0251,...,0.0223,0.0179,0.0084,0.0068,0.0032,0.0035,0.0056,0.004,0,1
9,0.0039,0.0063,0.0152,0.0336,0.031,0.0284,0.0396,0.0272,0.0323,0.0452,...,0.012,0.0052,0.0056,0.0093,0.0042,0.0003,0.0053,0.0036,0,1


In [63]:
# Get shape of the dataset
data.shape

(207, 62)

In [64]:
# Dividing between dependent and independent variables

# Converting DataFrame to Numpy Array
data_array = data.values
print("Data structure type: ", type(data_w_dummy_array))

# Divide dataset into Dependent and Independent variables
x = data_array[:, 0:60].astype(float)
y = data_array[:, 60].astype(float)

Data structure type:  <class 'numpy.ndarray'>


### Split Data into Train and Test

In [101]:
# Test data size (percentage)
test_size = 0.2

# Seed for random state - To ensure same dataset creation
split_seed = 8

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=split_seed)

In [102]:
# Response rate
response_rate = sum(y) / len(y)
print("Response rate: ", response_rate)

Response rate:  0.5362318840579711


### Resampling - Cross validation

In [103]:
# Cross validation
folds_num = 10
cross_validation_seed = 8
scoring = 'accuracy'

cv_folds = KFold(n_splits=folds_num, random_state=cross_validation_seed)

### Model creation and fitting

In [104]:
# Model creation
model = svm.SVC()

# Fitting / Running model
model_fit = model.fit(x_train, y_train)

# Model parameters
model_fit



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

### Model Accuracy and Evaluation

In [105]:
# Model accuracy
model_score = model.score(x_train, y_train)
print("Model 1 Score: ", model_score)

Model 1 Score:  0.703030303030303


In [117]:
# Model predicitons

# 1. Prediction on Training data
y_train_pred = model.predict(x_train)
accuracy_train = metrics.accuracy_score(y_true=y_train, y_pred=y_train_pred)
accuracy_train = float(format(accuracy_train * 100, '0.2f'))

# 2. Prediction on Test data
y_test_pred = model.predict(x_test)
accuracy_test = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
accuracy_test = float(format(accuracy_test * 100, '0.2f'))

### Results

In [119]:
print("Accuracy - Training data: {}%".format(accuracy_train))
print("Accuracy - Test data: {}%".format(accuracy_test))

Accuracy - Training data: 70.3%
Accuracy - Test data: 71.43%
