In [216]:
# Importing all the relevant libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Handling Categorical Attributes
from sklearn.preprocessing import OneHotEncoder

In [217]:
# Loading the csv file

df = pd.read_csv("Iris.csv")

df.head()


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [218]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [219]:
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


## Identifying the numerical features and extracting the dataframe

In [220]:
# The independent variables will be SepalLengthCm, SepalWidthCm, PetalLengthCm and PetalWidthCm

numerical_features = list(df.columns)
numerical_features.remove('Species')

print(numerical_features)

# Applying standard scaler to the numerical features.
scaler = StandardScaler()

numerical_features_scaled=pd.DataFrame(scaler.fit_transform(df[numerical_features]),columns=numerical_features)
numerical_features_scaled


['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-1.720542,-0.900681,1.032057,-1.341272,-1.312977
1,-1.697448,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.674353,-1.385353,0.337848,-1.398138,-1.312977
3,-1.651258,-1.506521,0.106445,-1.284407,-1.312977
4,-1.628164,-1.021849,1.263460,-1.341272,-1.312977
...,...,...,...,...,...
145,1.628164,1.038005,-0.124958,0.819624,1.447956
146,1.651258,0.553333,-1.281972,0.705893,0.922064
147,1.674353,0.795669,-0.124958,0.819624,1.053537
148,1.697448,0.432165,0.800654,0.933356,1.447956


## Categorical Encoding

In [221]:
#Checking the columns with categorical values.
object_columns = df.columns[df.dtypes == 'object']
object_columns

Index(['Species'], dtype='object')

In [222]:
print(df['Species'].unique())
df['Species'].value_counts()

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [311]:
# Inspecting the Categorical attribute
species_cat = df[["Species"]]
species_cat


Unnamed: 0,Species
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa
...,...
145,Iris-virginica
146,Iris-virginica
147,Iris-virginica
148,Iris-virginica


In [323]:
encoded_species=species_cat.replace(to_replace=["Iris-versicolor","Iris-virginica"],value=0)
tidied_species=encoded_species.replace(to_replace=["Iris-setosa"],value=1)
tidied_species

  tidied_species=encoded_species.replace(to_replace=["Iris-setosa"],value=1)


Unnamed: 0,Species
0,1
1,1
2,1
3,1
4,1
...,...
145,0
146,0
147,0
148,0


In [306]:
iris_setosa=df[df["Species"]=="Iris-setosa"]

# Replacing both "Iris-versicolor","Iris-virginica" to "Not Iris-setosa"
sorted_values = df[df["Species"]!="Iris-setosa"]
not_iris_setosa=sorted_values.replace(to_replace=["Iris-versicolor","Iris-virginica"],value="Not Iris-setosa")
not_iris_setosa.shape

(100, 6)

In [305]:
# Concatenating the 2 dataframes together to form an amended dataframe that only has 2 unique
#values for the species: 'Iris-setosa' and 'Not Iris-setosa'

df_amended=pd.concat([iris_setosa,not_iris_setosa])
df_amended

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,0
146,147,6.3,2.5,5.0,1.9,0
147,148,6.5,3.0,5.2,2.0,0
148,149,6.2,3.4,5.4,2.3,0


In [226]:
df_amended.shape

(150, 6)

## Encoding the categorical column with One-Hot Encoding

In [280]:
# Inspecting the Categorical attribute
species_cat=df_amended[["Species"]]

# Instantiating the OneHot Encoder class
ohe = OneHotEncoder(sparse_output=False)

# Encoding the Text
species_cat_onehot = ohe.fit_transform(species_cat)

# # Converting into numpy arrays.
print(species_cat_onehot[:5])
print("\n")
print(ohe.categories_)

# Create a DataFrame with the one-hot encoded columns
categorical_columns = species_cat.select_dtypes(include=['object']).columns.tolist()

# We use get_feature_names_out() to get the column names for the encoded data
species_ohe = pd.DataFrame(species_cat_onehot, columns=ohe.get_feature_names_out(categorical_columns))

[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]


[array(['Iris-setosa', 'Not Iris-setosa'], dtype=object)]
['Species']


In [281]:
# verifying that we have the correct data frame with the one-hot-encoding.
species_ohe

Unnamed: 0,Species_Iris-setosa,Species_Not Iris-setosa
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
145,0.0,1.0
146,0.0,1.0
147,0.0,1.0
148,0.0,1.0


## Features and Targets

In [289]:
# merging the numerical features and the species one-hot-encoding data frame together
merged_data = pd.concat([numerical_features_scaled,encoded_species],axis=1)
merged_data

# Dropping columns that are not relevant for the feature dataframe
X=merged_data.drop(columns=['Id','Species'])
X


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.263460,-1.341272,-1.312977
...,...,...,...,...
145,1.038005,-0.124958,0.819624,1.447956
146,0.553333,-1.281972,0.705893,0.922064
147,0.795669,-0.124958,0.819624,1.053537
148,0.432165,0.800654,0.933356,1.447956


In [325]:
# Target Variable
y = tidied_species['Species']
y

0      1
1      1
2      1
3      1
4      1
      ..
145    0
146    0
147    0
148    0
149    0
Name: Species, Length: 150, dtype: int64

## Train Test Split

In [326]:
#random state
r = 42
# 75% training and 25% test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=r)

In [327]:
# Inspecting the Dataset:
print ("Training data:",X_train.shape, y_train.shape)
print ("Test data:",X_test.shape, y_test.shape)

Training data: (112, 4) (112,)
Test data: (38, 4) (38,)


In [328]:
# fit a model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# make predictions on test data
y_pred = log_reg.predict(X_test).reshape(-1,1)

In [329]:
y_pred[:10]

array([[0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [330]:
X_test[:10]

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
73,0.310998,-0.587764,0.535296,0.001753
18,-0.173674,1.726266,-1.170675,-1.181504
118,2.249683,-1.050569,1.786341,1.447956
78,0.18983,-0.356361,0.421564,0.396172
76,1.159173,-0.587764,0.592162,0.264699
31,-0.537178,0.800654,-1.284407,-1.050031
64,-0.294842,-0.356361,-0.090227,0.133226
141,1.28034,0.106445,0.762759,1.447956
68,0.432165,-1.976181,0.421564,0.396172
82,-0.052506,-0.819166,0.08037,0.001753


In [331]:
# use score method to get accuracy of model
score = log_reg.score(X_test, y_test)

print('Accuracy: {}'.format(score))

Accuracy: 1.0


In [332]:
from sklearn.metrics import confusion_matrix

classes=['Iris-setosa','Not Iris-setosa']
conf_mat = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(conf_mat, columns=classes, index=classes)
cm_df

Unnamed: 0,Iris-setosa,Not Iris-setosa
Iris-setosa,23,0
Not Iris-setosa,0,15


In [333]:
# Extracting the the TP, FN, FP, TN Values from the confusion matrix

TP=cm_df['Iris-setosa'].loc[cm_df.index[0]]
print(TP)

FN=cm_df['Iris-setosa'].loc[cm_df.index[1]]
print(FN)

FP=cm_df['Not Iris-setosa'].loc[cm_df.index[0]]
print(FP)

TN=cm_df['Not Iris-setosa'].loc[cm_df.index[1]]
print(TN)



23
0
0
15


## Accuracy:

In [334]:
# This cell will work out the accuracy of the logistic regression model

accuracy=(TP+TN)/(TP+FN+TN+FP)
print(f"The accuracy of the linear regression more is: {accuracy}")

The accuracy of the linear regression more is: 1.0


## Precision:

In [335]:
# This cell will work out the precision of the logistic regression model

precision=TP/(TP+FP)
print(f"The precision of the linear regression more is: {precision}")

The precision of the linear regression more is: 1.0


## Recall:

In [336]:
# This cell will work out the recall of the logistic regression model

recall=TP/(TP+FN)
print(f"The recall of the linear regression more is: {recall}")

The recall of the linear regression more is: 1.0


## F1 Score:

In [337]:
# This cell will work out the F1 score of the logistic regression model

f1_score=2*((precision*recall)/(precision+recall))
print(f"The F1 Score of the linear regression more is: {f1_score}")

The F1 Score of the linear regression more is: 1.0
