# Detect flower species with support vector machines (SVM) in Python
12-detect-flower-species-w-svm-in-python

In [19]:
# Set-up libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [4]:
# Load the Iris dataset from its CSV file into a dataframe
iris_csv_path = '../00-Datasets/iris.csv'
df = pd.read_csv(iris_csv_path)

In [5]:
# Inspect column names, dtypes, and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [58]:
# Preview the first rows of the dataframe
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Preview the last rows of the dataframe
df.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [7]:
# Show six randomly chosen rows; the fixed seed keeps the preview identical across re-runs
df.sample(6, random_state=0)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
8,9,4.4,2.9,1.4,0.2,Iris-setosa
117,118,7.7,3.8,6.7,2.2,Iris-virginica
11,12,4.8,3.4,1.6,0.2,Iris-setosa
90,91,5.5,2.6,4.4,1.2,Iris-versicolor
33,34,5.5,4.2,1.4,0.2,Iris-setosa
116,117,6.5,3.0,5.5,1.8,Iris-virginica


In [8]:
# Count the missing (NaN) entries in each column
missing_mask = df.isna()
missing_mask.sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [10]:
# Check for duplicates
# NOTE(review): duplicated() compares every column, including Id. Id looks like a
# per-row identifier (1..150 in the previews), in which case no two rows can match
# and this count is always 0 — consider df.drop(columns='Id').duplicated().sum()
# to detect repeated measurements. Confirm before relying on this check.
df.duplicated().sum()

0

In [11]:
# Explore tabular summary (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [20]:
# Prepare the data: keep the two sepal measurements as features plus the Species
# label; Id and the petal measurements are dropped from df from here on
selected_columns = ['SepalLengthCm', 'SepalWidthCm', 'Species']
df = df[selected_columns]

In [21]:
# Separate the target from the features, then hold out 20% of the rows as a
# test set (the remaining 80% is used for training)
y = df['Species']
X = df.drop(columns='Species')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [22]:
# Build model: an SVC created with no arguments (library defaults), fitted on the training split
model = SVC()
model.fit(X=X_train, y=y_train)

In [23]:
# Predict the species for the held-out test features
y_predict = model.predict(X_test)

# Put ground truth and predictions side by side for inspection
actual_vs_predict = pd.DataFrame({'Actual': y_test,
                                  'Predict': y_predict
                                  })

# Spot-check a random subset; the fixed seed keeps the displayed rows stable across re-runs
actual_vs_predict.sample(12, random_state=0)

Unnamed: 0,Actual,Predict
107,Iris-virginica,Iris-virginica
40,Iris-setosa,Iris-setosa
90,Iris-versicolor,Iris-versicolor
37,Iris-setosa,Iris-setosa
73,Iris-versicolor,Iris-versicolor
16,Iris-setosa,Iris-setosa
44,Iris-setosa,Iris-setosa
76,Iris-versicolor,Iris-virginica
93,Iris-versicolor,Iris-versicolor
8,Iris-setosa,Iris-setosa


In [24]:
# Evaluate model: per-class precision, recall, and f1-score on the test set
report = classification_report(y_test, y_predict)
print(report)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       0.73      0.62      0.67        13
 Iris-virginica       0.38      0.50      0.43         6

       accuracy                           0.73        30
      macro avg       0.70      0.71      0.70        30
   weighted avg       0.76      0.73      0.74        30



## Notes
* We wanted to detect species of flowers.
* Our dataset had 150 records and 6 columns: 4 measurement columns of data type float, 1 integer `Id` column, and 1 `Species` column of type object (the target).
* A quick look showed that there were no missing or duplicated values.
* We used all 150 records (no sub-sampling) and kept only the two sepal measurements (`SepalLengthCm`, `SepalWidthCm`) as features for our model.
* The dataset was split into 80% train and 20% test sets. No validation set was created or used. No hyperparameter tuning occurred.
* We built, trained, and tested an SVM model.
* We evaluated our model using ground truth and several metrics including precision, recall, and f1-score. 