In [1]:
# import libraries
import pandas as pd
import numpy as np

# import machine learning libraries/modelling
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
iris = pd.read_csv('Iris.csv')
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
iris.shape

(150, 6)

In [4]:
iris.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [5]:
iris.duplicated().sum()

0

## data preprocessing

In [6]:
# drop id column
iris.drop(columns='Id', inplace=True)

In [7]:
# check unique values
iris['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [8]:
# convert species to numerical
mapping = {'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}

In [9]:
iris['Species'].replace(mapping, inplace=True)

In [10]:
iris.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


# modelling

In [11]:
# split data
x = iris.drop('Species', axis=1)
y = iris['Species']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [12]:
# function for training the model
def model_trainer(data, models):
    x_train, x_test, y_train, y_test = data # unpacking the tuple 
    for model in models:
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        accuracy = accuracy_score(y_test, preds)
        print(f'Model: {model}, accuracy: {accuracy}')

In [13]:
# define the parameters

# define data
data = (x_train, x_test, y_train, y_test)

# define our models
svc = SVC()
forest = RandomForestClassifier()
dt = DecisionTreeClassifier()

models = [svc, forest, dt]



In [14]:
# train my models
model_trainer(data=data, models=models)

Model: SVC(), accuracy: 0.9666666666666667
Model: RandomForestClassifier(), accuracy: 0.9666666666666667
Model: DecisionTreeClassifier(), accuracy: 0.9666666666666667


In [15]:
# dt was 93%
# save the model. The best model (how big the model is and how fast they make predictions)
import joblib

# save Random Forest
joblib.dump(forest, 'forest.pkl')

# save svc
joblib.dump(svc, 'svc.pkl')

# we'll go with svc coz its smaller in size than forest

['svc.pkl']

In [16]:
# streamlit and flask used for deployment

In [17]:
iris.columns

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')