In [51]:
import pandas as pd

In [52]:
# loading data
iris_data = pd.read_csv("Iris.csv")

In [53]:
# displaying a small portion of dataset
iris_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [54]:
# check information and missing values
data_info = iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [55]:
# check duplicate entries
duplicate_entries = iris_data.duplicated().sum()

In [56]:
#statistics for numerical columns
statistics = iris_data.describe()

In [57]:
data_info, duplicate_entries, statistics

(None,
 0,
                Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
 count  150.000000     150.000000    150.000000     150.000000    150.000000
 mean    75.500000       5.843333      3.054000       3.758667      1.198667
 std     43.445368       0.828066      0.433594       1.764420      0.763161
 min      1.000000       4.300000      2.000000       1.000000      0.100000
 25%     38.250000       5.100000      2.800000       1.600000      0.300000
 50%     75.500000       5.800000      3.000000       4.350000      1.300000
 75%    112.750000       6.400000      3.300000       5.100000      1.800000
 max    150.000000       7.900000      4.400000       6.900000      2.500000)

In [None]:
# findings: 150 entries, 6 columns. No missing values. No duplicate entries. No outliers

In [58]:
# removing column "Id" because it is not useful
iris_dataclean = iris_data.drop('Id', axis=1)

In [59]:
# converting into numeric values
from sklearn.preprocessing import LabelEncoder

In [60]:
label_encoder = LabelEncoder()

In [61]:
# fit and transform "Species"
iris_dataclean['Species'] = label_encoder.fit_transform(iris_dataclean['Species'])

In [62]:
# split data
x = iris_dataclean.drop('Species', axis=1)
y = iris_dataclean['Species']

In [63]:
#split data to train and test
from sklearn.model_selection import train_test_split

In [64]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [65]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [None]:
# train - 120 samples, test - 30 samples

In [None]:
# first, Random Forest Classifier

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [67]:
# initialize
rf_classifier = RandomForestClassifier(random_state=42)

In [68]:
# train model
rf_classifier.fit(x_train,y_train)

In [69]:
# make a prediction
rf_prediction = rf_classifier.predict(x_test)

In [70]:
# evaluating
rf_accuracy = accuracy_score(y_test, rf_prediction)
rf_classification_report = classification_report(y_test, rf_prediction)

In [71]:
rf_accuracy, rf_classification_report

(1.0,
 '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00        10\n           1       1.00      1.00      1.00         9\n           2       1.00      1.00      1.00        11\n\n    accuracy                           1.00        30\n   macro avg       1.00      1.00      1.00        30\nweighted avg       1.00      1.00      1.00        30\n')

In [None]:
# findings: accuracy, precision, recall, f1-score = 100%

In [None]:
# now, K-Nearest Neighbour

In [72]:
from sklearn.neighbors import KNeighborsClassifier

In [73]:
# initialize
knn_classifier = KNeighborsClassifier(n_neighbors=3)

In [74]:
# train model
knn_classifier.fit(x_train,y_train)

In [75]:
# make a prediction
knn_prediction = knn_classifier.predict(x_test)

In [76]:
# evaluating
knn_accuracy = accuracy_score(y_test, knn_prediction)
knn_classification_report = classification_report(y_test, knn_prediction)

In [77]:
knn_accuracy, knn_classification_report

(1.0,
 '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00        10\n           1       1.00      1.00      1.00         9\n           2       1.00      1.00      1.00        11\n\n    accuracy                           1.00        30\n   macro avg       1.00      1.00      1.00        30\nweighted avg       1.00      1.00      1.00        30\n')

In [77]:
# findings: Accuracy, precision, recall, f1-score = 100%, as well as in RFC. RFC and KNN showed perfect accuracy. Both models achieved perfect accuracy and classification metrics, indicating their high effectiveness for this particular dataset. This reflects the simplicity and clear separability of the Iris species in the dataset. In practice, the choice between these models might depend on factors like data complexity, dimensionality, and the need for model interpretability. For this dataset, either model could be confidently used for accurate classification. The high performance of both models suggests that the Iris dataset is relatively straightforward with clear distinctions between species. This might not be the case with more complex or overlapping datasets.