In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# Suppress library warnings to keep the notebook output clean
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Read the Iris CSV file into a pandas DataFrame.
# NOTE(review): this is an absolute path — presumably it has to be adjusted
# when the notebook runs in a different environment.
csv_path = '/content/Iris Flower.csv'
iris_data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the raw dataset
iris_data.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Number of rows and columns, shown as a (rows, columns) tuple
iris_data.shape

(150, 6)

In [5]:
# Dataset info: column names, non-null counts, dtypes and memory usage
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [6]:
# Count the rows that are exact duplicates of an earlier row
duplicate_mask = iris_data.duplicated()
dup = duplicate_mask.sum()
print('Number of duplicate values:', dup)

Number of duplicate values: 0


In [7]:
# Missing values per column (isna is the canonical spelling of isnull)
iris_data.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [8]:
# Dataset variables: the column labels of the raw frame
iris_data.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [9]:
# Describe --> summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the data
iris_data.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [10]:
# Number of distinct values in each column
iris_data.nunique()

Id               150
SepalLengthCm     35
SepalWidthCm      23
PetalLengthCm     43
PetalWidthCm      22
Species            3
dtype: int64

In [11]:
# The 'Id' column is only a row identifier, not a feature, so drop it.
# Dropping by name (rather than by position) stays correct if the column
# order ever changes, and drop() returns a new DataFrame, so later column
# assignments on new_data do not write into a slice of iris_data.
new_data = iris_data.drop(columns=['Id'])

In [12]:
# Preview the updated dataset (first five rows, without the Id column)
new_data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
# Encode the categorical target column
# Create a LabelEncoder object
le = LabelEncoder()

# Encode the species names as numerical labels.
# The encoder is fitted on the untouched raw labels in iris_data, so re-running
# this cell always sees the species names and le.classes_ keeps the
# name <-> code mapping. assign() builds a new frame instead of writing a
# column into what may be a slice of iris_data.
new_data = new_data.assign(Species=le.fit_transform(iris_data['Species']))

# Check the unique values in the 'Species' column after encoding
unique_species = new_data['Species'].unique()

# Display the unique encoded values
print("Encoded Species Values:")
print(unique_species) # 'Iris-setosa' == 0, 'Iris-versicolor' == 1, 'Iris-virginica' == 2

Encoded Species Values:
[0 1 2]


In [14]:
# Split the frame into the target labels and the feature matrix
Y = new_data['Species']
X = new_data.drop(columns='Species')

In [15]:
# Show the feature matrix followed by the target labels
print(X, Y, sep='\n')

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0              5.1           3.5            1.4           0.2
1              4.9           3.0            1.4           0.2
2              4.7           3.2            1.3           0.2
3              4.6           3.1            1.5           0.2
4              5.0           3.6            1.4           0.2
..             ...           ...            ...           ...
145            6.7           3.0            5.2           2.3
146            6.3           2.5            5.0           1.9
147            6.5           3.0            5.2           2.0
148            6.2           3.4            5.4           2.3
149            5.9           3.0            5.1           1.8

[150 rows x 4 columns]
0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: Species, Length: 150, dtype: int64


In [16]:
# Hold out 10% of the rows for testing. stratify=Y splits in a stratified
# fashion using the class labels, and random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=1,
)
# Shapes of the full, training and test feature matrices
print(X.shape, X_train.shape, X_test.shape)

(150, 4) (135, 4) (15, 4)


In [17]:
# Show the training features, then the training labels
for training_part in (X_train, Y_train):
    print(training_part)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
77             6.7           3.0            5.0           1.7
114            5.8           2.8            5.1           2.4
110            6.5           3.2            5.1           2.0
139            6.9           3.1            5.4           2.1
39             5.1           3.4            1.5           0.2
..             ...           ...            ...           ...
45             4.8           3.0            1.4           0.3
118            7.7           2.6            6.9           2.3
41             4.5           2.3            1.3           0.3
127            6.1           3.0            4.9           1.8
42             4.4           3.2            1.3           0.2

[135 rows x 4 columns]
77     1
114    2
110    2
139    2
39     0
      ..
45     0
118    2
41     0
127    2
42     0
Name: Species, Length: 135, dtype: int64


In [18]:
# Build a logistic-regression classifier with scikit-learn's default settings
model = LogisticRegression()
# Fit it on the training split; the fitted estimator is the cell's displayed value
model.fit(X=X_train, y=Y_train)

In [19]:
# Accuracy on the training data
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred): ground-truth labels first, then
# predictions. Accuracy is symmetric, so the value is unchanged, but the
# documented order keeps the call correct if the metric is ever swapped.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [20]:
# Report the accuracy score computed on the training split
print('Accuracy on training data : ', training_data_accuracy)

Accuracy on training data :  0.9703703703703703


In [21]:
# Accuracy on the held-out test data
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground-truth labels first, then
# predictions. Accuracy is symmetric, so the value is unchanged, but the
# documented order keeps the call correct if the metric is ever swapped.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [22]:
# Report the accuracy score computed on the test split
print('Accuracy on test data : ', test_data_accuracy)

Accuracy on test data :  0.9333333333333333


In [23]:
# Display names for the predicted classes, looked up by predicted label
Category = [
    'Iris-Setosa',
    'Iris-Versicolor',
    'Iris-Virginica',
]

In [24]:
# One flower to classify: sepal length, sepal width, petal length, petal width
# (same order as the columns of X)
input_data = (5.1,3.5,1.4,0.2)

# changing the input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the np array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The model was fitted on a DataFrame, so pass the sample with the same column
# names. Predicting on a bare array makes scikit-learn emit a feature-name
# mismatch warning, and nothing would check the column order.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = model.predict(input_frame)
print(prediction)

# prediction holds one encoded label; use it to look up the display name
print('The predicted category is:', Category[prediction[0]])

[0]
The predicted category is: Iris-Setosa
