In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the Iris measurements. Column labels are supplied explicitly through `names`,
# so pandas treats every line of the file as data (equivalent to header=None).
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'Class',
]
iris = pd.read_csv('/content/iris.data', header=None, names=names)

In [None]:
# Preview the first five rows to sanity-check the load
iris.head(n=5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# Structural overview: number of rows, column names, non-null counts and dtypes
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal-length  150 non-null    float64
 1   sepal-width   150 non-null    float64
 2   petal-length  150 non-null    float64
 3   petal-width   150 non-null    float64
 4   Class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
iris.describe()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
# Count missing entries per column (isna performs the same check as isnull)
iris.isna().sum()

sepal-length    0
sepal-width     0
petal-length    0
petal-width     0
Class           0
dtype: int64

The dataset has no null values in any column.

In [None]:
# Count rows that exactly repeat an earlier row; the first occurrence is not counted
iris.duplicated(keep='first').sum()

3

The dataset contains 3 rows that duplicate an earlier row.

In [None]:
# Display the rows flagged as duplicates. Each one repeats some earlier row on every
# column; the earlier (first) occurrences themselves are not part of this view.
iris.loc[iris.duplicated()]

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
34,4.9,3.1,1.5,0.1,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa
142,5.8,2.7,5.1,1.9,Iris-virginica


In [None]:
# Column labels, for reference when selecting the features and the target below
iris.columns

Index(['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class'], dtype='object')

## Modeling

Model 1: KNN with a fixed k = 3 on standardized features

In [None]:
# Split the frame into the feature matrix X and the target vector y.
# Both sides are selected by the 'Class' column name rather than by position
# (previously ":-1" for the features and a hard-coded 4 for the target), so the
# two selections cannot drift apart if the column layout ever changes.
X = iris.drop(columns='Class').values
y = iris['Class'].values

In [None]:
# Hold out a test set. 0.25 is the share scikit-learn uses when no size is given;
# random_state pins the shuffle so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=43,
)

In [None]:
# Standardize the features.
# The scaling parameters are learned from the training split only. Those same
# parameters are then applied to both splits, so the test data is scaled with
# what was learned on the training data and never influences the scaler.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a 3-nearest-neighbours classifier on the scaled training split,
# then label the scaled test split with it.
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
knn_pred = classifier.predict(X_test_scaled)

In [None]:
# Model 1 on the test split: overall accuracy, confusion matrix, per-class report
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, knn_pred))


0.9736842105263158
[[14  0  0]
 [ 0 12  0]
 [ 0  1 11]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       0.92      1.00      0.96        12
 Iris-virginica       1.00      0.92      0.96        12

       accuracy                           0.97        38
      macro avg       0.97      0.97      0.97        38
   weighted avg       0.98      0.97      0.97        38



## GridSearch

Model 2: KNN with hyperparameters tuned by grid search (5-fold cross-validation)

In [None]:
# Rebuild the features/target from `iris`, the same way as for Model 1.
# Both are selected by the 'Class' column name instead of by position.
# (train_test_split, StandardScaler and KNeighborsClassifier are imported in the
# Model 1 cells above.)
X = iris.drop(columns='Class').values
y = iris['Class'].values

# Split into a training set and a test set (same random_state as Model 1)
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=43)

# Fit the scaler on the training split only, then apply it to both splits.
# From here on X_train / X_test hold the SCALED values.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): the scaler is fitted on the whole training split before the
# cross-validation below, so every validation fold has contributed to the scaling
# statistics. Wrapping the scaler and classifier in a sklearn Pipeline would avoid
# that; kept as-is here so the reported parameter names stay the same - confirm.

# Base estimator; its hyperparameters are supplied by the grid search
classifier = KNeighborsClassifier()

# Search space. 'minkowski' is intentionally not listed: with the default p=2 it is
# the same distance as 'euclidean', so it only added duplicate candidates to fit.
param_grid = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Create and fit the grid search using 5-fold cross-validation, scored by accuracy
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Report the best hyperparameter combination found
best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

Best parameters found: {'metric': 'euclidean', 'n_neighbors': 8, 'weights': 'distance'}


In [None]:
# Take the best model from the grid search. best_estimator_ is the estimator that
# GridSearchCV already refitted with the winning hyperparameters, so no further
# training call is made in this cell.
best_knn = grid_search.best_estimator_

# Predict on the test split (X_test was scaled in the grid-search cell)
y_pred_grid = best_knn.predict(X_test)

# Test-set performance: overall accuracy, confusion matrix, per-class report
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred_grid))



1.0
[[14  0  0]
 [ 0 12  0]
 [ 0  0 12]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       1.00      1.00      1.00        12
 Iris-virginica       1.00      1.00      1.00        12

       accuracy                           1.00        38
      macro avg       1.00      1.00      1.00        38
   weighted avg       1.00      1.00      1.00        38

