In [None]:
# Based on: medium.com/@dspaulolima/data-science-para-iniciantes-iris-ec17b205f0d6

In [1]:
import pandas as pd

In [2]:
iris = pd.read_csv('iris_transformed', sep=',')
iris.drop('species', inplace=True, axis=1)

In [3]:
iris.head()

Unnamed: 0,sepal_lenght,sepal_width,petal_lenght,petal_width,species_code
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
## We want to discover species and this is a classification problem.
## So, we're going to use four models and compare then: SVM, Logistic Regression, KNN and Decision Tree.

In [4]:
# Models to test and compare
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

# Training-test division
from sklearn.model_selection import train_test_split

# Performance and comparison metrics
from sklearn import metrics

In [5]:
# We need to separate the target and predictor variables for testing and trainig purposes
predictors = iris.iloc[:,:-1]
target = iris['species_code']

print('Predictors:')
print(predictors.iloc[[10,40,60,80,100,101]])
print('\nTarget:')
print(target.iloc[[10,40,60,80,100,101]])

Predictors:
     sepal_lenght  sepal_width  petal_lenght  petal_width
10            5.4          3.7           1.5          0.2
40            5.0          3.5           1.3          0.3
60            5.0          2.0           3.5          1.0
80            5.5          2.4           3.8          1.1
100           6.3          3.3           6.0          2.5
101           5.8          2.7           5.1          1.9

Target:
10     0
40     0
60     1
80     1
100    2
101    2
Name: species_code, dtype: int64


In [6]:
# Dataset division: 30% test and 70% training
x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3)

print('Training data - 70%: ', x_train.shape, y_train.shape)
print('Testing data - 30%: ', x_test.shape, y_test.shape)

Training data - 70%:  (105, 4) (105,)
Testing data - 30%:  (45, 4) (45,)


## Model 1: SVM - Support Vector Machine

In [8]:
svm_model = svm.SVC(kernel='linear', C=1)
svm_fit = svm_model.fit(x_train, y_train)
svm_prediction = svm_fit.predict(x_test)

svm_metric = metrics.accuracy_score(svm_prediction, y_test)
print('Accuracy: ', svm_metric)

Accuracy:  0.9555555555555556


In [19]:
# Confusion matrix: to verify the correctness and erros in the prediction, comparing it with the original target
pd.DataFrame(metrics.confusion_matrix(y_test, svm_prediction, labels=[0,1,2])
            ,columns=['setosa_predicted', 'versicolor_predicted', 'virginica_predicted']
            ,index=['setosa_original', 'versicolor_original', 'virginica_original'])

Unnamed: 0,setosa_predicted,versicolor_predicted,virginica_predicted
setosa_original,15,0,0
versicolor_original,0,12,1
virginica_original,0,1,16


## Only one versicolor was predicted as virginica, that is 97% of accuracy.

## Model 2 - Logistic Regression

In [10]:
lr_model = LogisticRegression()
lr_fit = lr_model.fit(x_train, y_train)
lr_prediction = lr_fit.predict(x_test)

lr_metric = metrics.accuracy_score(lr_prediction, y_test)
print('Accuracy: ', lr_metric)

Accuracy:  0.9777777777777777


In [11]:
pd.DataFrame(metrics.confusion_matrix(y_test, lr_prediction, labels=[0,1,2])
            ,columns=['setosa_predicted', 'versicolor_predicted', 'virginica_predicted']
            ,index=['setosa_original', 'versicolor_original', 'virginica_original'])

Unnamed: 0,setosa_predicted,versicolor_predicted,virginica_predicted
setosa_original,19,0,0
versicolor_original,0,14,1
virginica_original,0,0,11


## Only one versicolor was predicted as virginica, that is 97% of accuracy.

## Model 3 - Decision Tree

In [18]:
dt_model = DecisionTreeClassifier()
dt_fit = dt_model.fit(x_train, y_train)
dt_prediction = dt_fit.predict(x_test)

dt_metric = metrics.accuracy_score(dt_prediction, y_test)
print('Accuracy: ', dt_metric)

Accuracy:  0.9333333333333333


In [14]:
pd.DataFrame(metrics.confusion_matrix(y_test, dt_prediction, labels=[0,1,2])
            ,columns=['setosa_predicted', 'versicolor_predicted', 'virginica_predicted']
            ,index=['setosa_original', 'versicolor_original', 'virginica_original'])

Unnamed: 0,setosa_predicted,versicolor_predicted,virginica_predicted
setosa_original,19,0,0
versicolor_original,0,14,1
virginica_original,0,1,10


## One versicolor was predicted as virginica and one virginica was predicted as versicolor, so, we have 95% accurary.

## Model 4 - KNN - K-Nearst Neighbor

In [15]:
knn_model = KNeighborsClassifier()
knn_fit = knn_model.fit(x_train, y_train)
knn_prediction = knn_fit.predict(x_test)

knn_metric = metrics.accuracy_score(knn_prediction, y_test)
print('Accuracy: ', knn_metric)

Accuracy:  0.9333333333333333


In [16]:
pd.DataFrame(metrics.confusion_matrix(y_test, knn_prediction, labels=[0,1,2])
            ,columns=['setosa_predicted', 'versicolor_predicted', 'virginica_predicted']
            ,index=['setosa_original', 'versicolor_original', 'virginica_original'])

Unnamed: 0,setosa_predicted,versicolor_predicted,virginica_predicted
setosa_original,19,0,0
versicolor_original,0,12,3
virginica_original,0,0,11


## Three versicolor were predicted as virginica. So, we have 93% accuracy.

## Comparison of accuracy between models

In [17]:
print('Support Vector Machine: ', round(svm_metric,3))
print('Logistic Regression: ', round(lr_metric,3))
print('Decision Tree; ', round(dt_metric,3))
print('KNeighbor; ', round(knn_metric,3))

Support Vector Machine:  0.978
Logistic Regression:  0.978
Decision Tree;  0.956
KNeighbor;  0.933


In [18]:
# We use pickle to export our model object as a binary, which can be used by the web service
import pickle
pickle.dump(svm_fit, open('../deploy/svm_model.pkl', 'wb'))