In [15]:
#Import all the important libraries and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC #This is where we get our svm package
from sklearn.metrics import confusion_matrix, classification_report #get the confusion matrix and reporting packages from sklearn
from sklearn import preprocessing as sk_preprocessing
from sklearn.model_selection import train_test_split #This allows us to split between our testing and training data

In [16]:
# Load the UCI "seeds" dataset from the archive URL into a DataFrame called DF.
# The file is whitespace-delimited and has no header row, so the separator is a
# regular expression. It is written as a raw string so that "\s" reaches pandas
# unchanged instead of being parsed as an (invalid) string escape sequence.
DF = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt",
    sep=r'\s+',
    header=None,
)
# Name the columns, since the file does not come with headers.
# NOTE(review): 'Length of Kernal' is a typo for 'Kernel'; it is left unchanged here
# because it is the column label used in this notebook's recorded outputs.
DF.columns = [
    'Area',
    'Perimeter',
    'Compactness',
    'Length of Kernal',
    'Width of Kernel',
    'Asymmetry Coefficient',
    'Length of kernel groove',
    'Class',
]

In [17]:
# Sanity-check the loaded frame: per-column summary statistics
# (count, mean, std, min, quartiles, max) should look plausible for every column
DF.describe()

Unnamed: 0,Area,Perimeter,Compactness,Length of Kernal,Width of Kernel,Asymmetry Coefficient,Length of kernel groove,Class
count,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0
mean,14.847524,14.559286,0.870999,5.628533,3.258605,3.700201,5.408071,2.0
std,2.909699,1.305959,0.023629,0.443063,0.377714,1.503557,0.49148,0.818448
min,10.59,12.41,0.8081,4.899,2.63,0.7651,4.519,1.0
25%,12.27,13.45,0.8569,5.26225,2.944,2.5615,5.045,1.0
50%,14.355,14.32,0.87345,5.5235,3.237,3.599,5.223,2.0
75%,17.305,15.715,0.887775,5.97975,3.56175,4.76875,5.877,3.0
max,21.18,17.25,0.9183,6.675,4.033,8.456,6.55,3.0


In [18]:
DF.info() # More data exploration: column dtypes and non-null counts, to confirm nothing is missing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Area                     210 non-null    float64
 1   Perimeter                210 non-null    float64
 2   Compactness              210 non-null    float64
 3   Length of Kernal         210 non-null    float64
 4   Width of Kernel          210 non-null    float64
 5   Asymmetry Coefficient    210 non-null    float64
 6   Length of kernel groove  210 non-null    float64
 7   Class                    210 non-null    int64  
dtypes: float64(7), int64(1)
memory usage: 13.2 KB


In [19]:
# Separate the model inputs from the target.
# X holds every column except 'Class'; drop() returns a new frame, so DF itself is unchanged
X = DF.drop(columns=['Class'])
# Y is the target variable: the 'Class' column we want to predict
Y = DF['Class']

In [20]:
# Use the train_test_split function to separate the training set from the test set.
# train_size=5/7 keeps 150 of the 210 records for training and leaves 60 for testing.
# stratify=Y asks for the class proportions to be preserved in both parts.
# random_state fixes the shuffle so that Restart & Run All reproduces the same split
# (and therefore the same metrics) instead of a different one on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=5/7, stratify=Y, random_state=42
)

In [21]:
X_train.shape # (records, attributes) in the training set

(150, 7)

In [22]:
X_test.shape # (records, attributes) in the testing set

(60, 7)

In [23]:
# SVMs are a distance-based model, so put all features on a common scale first.
# The scaler is fitted on the training set only; the same fitted scaler is then
# applied to both the training and the testing set.
scaler = sk_preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [24]:
# Build the SVM classifier (default settings) from the SVC class imported at the top
svc = SVC()
# "Fitting" trains the model on the training data; the labels are passed as a flat 1-D array
svc.fit(X_train, Y_train.to_numpy().ravel())
# Predict a class for every record in the testing set
Y_pred = svc.predict(X_test)
# Print the confusion matrix first, then the per-class precision/recall/f1 report
for report in (confusion_matrix(Y_test, Y_pred), classification_report(Y_test, Y_pred)):
    print(report)

[[19  1  0]
 [ 1 19  0]
 [ 1  0 19]]
              precision    recall  f1-score   support

           1       0.90      0.95      0.93        20
           2       0.95      0.95      0.95        20
           3       1.00      0.95      0.97        20

    accuracy                           0.95        60
   macro avg       0.95      0.95      0.95        60
weighted avg       0.95      0.95      0.95        60



In [None]:
#Accuracy is 95%
#What this means is that the model correctly classified 57 of the 60 seeds in the testing set
#(the diagonal of the confusion matrix: 19 + 19 + 19), i.e. an accuracy of 95%