# Iris Species Classification

In [1]:
# Import libraries
import warnings
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")

In [2]:
# Loading dataset
iris_df = pd.read_csv("/kaggle/input/iris/Iris.csv")
iris_df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Everything about the dataset
print(f"Shape: {iris_df.shape}")
print(f"\nFeatures: {iris_df.columns.to_list()}")
print("\nInformation:")
print(iris_df.info())
print("\n Description:")
print(iris_df.describe())

Shape: (150, 6)

Features: ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None

 Description:
               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000 

In [4]:
# Checking dependent feature
iris_df['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [5]:
# Encoding dependent feature
iris_df['Species'] = iris_df['Species'].map({"Iris-setosa": 0, "Iris-versicolor":1, "Iris-virginica":2})

In [6]:
# Splitting data into independent & dependent
X = iris_df.drop(columns=['Id','Species'])
y = iris_df['Species']

In [7]:
# Splitting dataset into training & testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(f"Training Samples - {X_train.shape[0]}")
print(f"Testing Samples - {X_test.shape[0]}")

Training Samples - 112
Testing Samples - 38


In [8]:
# Performing Data Scaling (To make all features in same scale)
scaler = StandardScaler()
scaler.fit(X_train)

# transforming training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# converting scaled data into pandas dataframe
X_train = pd.DataFrame(data=X_train_scaled, columns=iris_df.columns[1:-1])
X_test = pd.DataFrame(data=X_test_scaled, columns=iris_df.columns[1:-1])

#After standardization, let's have a look at training data
X_train.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-1.018271,1.306365,-1.39489,-1.358652
1,-0.77301,2.463904,-1.336964,-1.492722
2,-0.037227,-0.777205,0.748389,0.920532
3,0.208034,0.84335,0.40083,0.518323
4,1.066448,0.148826,0.516683,0.384253


## Logistic Regression

In [9]:
# Applying Logistic Regression
lr = LogisticRegression(C=3, random_state=42)
lr.fit(X_train, y_train)

print(f"Training Accuracy- {round(lr.score(X_train, y_train)*100)}%")
print(f"Testing Accuracy- {round(lr.score(X_test, y_test)*100)}%")

Training Accuracy- 96%
Testing Accuracy- 100%


## SVM: Support Vector Classifier

In [10]:
# Training & Evaluation different SVM with different Kernel Models
svm_classifiers = {"Linear SVM": SVC(kernel='linear', random_state=42, gamma=.10, C=1.0),
                   "RBF SVM": SVC(kernel='rbf', random_state=42, gamma=.10, C=1.0),
                   "Poly SVM": SVC(kernel='poly', random_state=42, gamma=.10, C=1.0)}

for name, clf in svm_classifiers.items():
    print("--"*40)
    print(name)
    clf.fit(X_train, y_train)
    print(f"Training Accuracy- {round(clf.score(X_train, y_train)*100)}%")
    print(f"Testing Accuracy- {round(clf.score(X_test, y_test)*100)}%")

print("--"*40)

--------------------------------------------------------------------------------
Linear SVM
Training Accuracy- 97%
Testing Accuracy- 97%
--------------------------------------------------------------------------------
RBF SVM
Training Accuracy- 96%
Testing Accuracy- 100%
--------------------------------------------------------------------------------
Poly SVM
Training Accuracy- 82%
Testing Accuracy- 84%
--------------------------------------------------------------------------------


## KNN: K-Nearest Neighbor Classifier

In [11]:
# Applying KNN
knn = KNeighborsClassifier(n_neighbors=6, metric='euclidean')
knn.fit(X_train, y_train)

print(f"Training Accuracy- {round(knn.score(X_train, y_train)*100)}%")
print(f"Testing Accuracy- {round(knn.score(X_test, y_test)*100)}%")

Training Accuracy- 96%
Testing Accuracy- 100%


## Summary of the model performances:

- **Logistic Regression** and **KNN** both achieved 100% testing accuracy, indicating they performed exceptionally well on unseen data. Logistic Regression had a 96% training accuracy, while KNN had a 96% training accuracy as well.

- **Linear SVM** demonstrated strong performance with a 97% training accuracy and 97% testing accuracy, showing good generalization.

- **RBF SVM** also performed well, with 96% training accuracy and 100% testing accuracy, effectively capturing complex patterns.

Overall, Logistic Regression, KNN, and RBF SVM showed the highest performance, with Poly SVM having relatively lower accuracy.