In [2]:
import pandas as pd

# Load the Iris dataset into a DataFrame
iris_df = pd.read_csv('Iris.csv')

# Display all rows of the dataset
print("All rows of the Iris dataset:")
print(iris_df)


All rows of the Iris dataset:
      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0      1            5.1           3.5            1.4           0.2   
1      2            4.9           3.0            1.4           0.2   
2      3            4.7           3.2            1.3           0.2   
3      4            4.6           3.1            1.5           0.2   
4      5            5.0           3.6            1.4           0.2   
..   ...            ...           ...            ...           ...   
145  146            6.7           3.0            5.2           2.3   
146  147            6.3           2.5            5.0           1.9   
147  148            6.5           3.0            5.2           2.0   
148  149            6.2           3.4            5.4           2.3   
149  150            5.9           3.0            5.1           1.8   

            Species  
0       Iris-setosa  
1       Iris-setosa  
2       Iris-setosa  
3       Iris-setosa  
4       Iris-setosa

In [3]:
# 1. Understanding the Structure and Contents
print("First few rows of the dataset:")
print(iris_df.head())

print("\nColumn names:")
print(iris_df.columns)

print("\nShape of the dataset:")
print(iris_df.shape)

# 2. Checking for Missing Values
print("\nMissing values in each column:")
print(iris_df.isnull().sum())

# 3. Checking Data Types
print("\nData types and memory usage:")
print(iris_df.info())

# 4. Basic Statistics
print("\nBasic statistics for numerical columns:")
print(iris_df.describe())


First few rows of the dataset:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Column names:
Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

Shape of the dataset:
(150, 6)

Missing values in each column:
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Data types and memory usage:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Check for missing values
missing_values = iris_df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Handle missing values (if any)
if missing_values.sum() > 0:
    # Strategy: Impute missing values with mean for numerical columns
    numerical_features = iris_df.select_dtypes(include=['float64', 'int64']).columns
    numerical_imputer = SimpleImputer(strategy='mean')
    numerical_imputer.fit(iris_df[numerical_features])
    
    iris_df[numerical_features] = numerical_imputer.transform(iris_df[numerical_features])

# Encode categorical variables using one-hot encoding
iris_df_encoded = pd.get_dummies(iris_df)

# Standardize numerical features (if necessary)
numerical_features = iris_df_encoded.select_dtypes(include=['float64', 'int64']).columns
numerical_scaler = StandardScaler()
iris_df_encoded[numerical_features] = numerical_scaler.fit_transform(iris_df_encoded[numerical_features])

# Display preprocessed dataset
print("\nPreprocessed dataset:")
print(iris_df_encoded.head())


Missing values in each column:
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Preprocessed dataset:
         Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0 -1.720542      -0.900681      1.032057      -1.341272     -1.312977   
1 -1.697448      -1.143017     -0.124958      -1.341272     -1.312977   
2 -1.674353      -1.385353      0.337848      -1.398138     -1.312977   
3 -1.651258      -1.506521      0.106445      -1.284407     -1.312977   
4 -1.628164      -1.021849      1.263460      -1.341272     -1.312977   

   Species_Iris-setosa  Species_Iris-versicolor  Species_Iris-virginica  
0                    1                        0                       0  
1                    1                        0                       0  
2                    1                        0                       0  
3                    1                        0                       0  
4         

In [11]:
from sklearn.model_selection import train_test_split

# Check for missing values
missing_values = iris_df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Handle missing values (if any)
if missing_values.sum() > 0:
    # Strategy: Impute missing values with mean for numerical columns
    numerical_features = iris_df.select_dtypes(include=['float64', 'int64']).columns
    numerical_imputer = SimpleImputer(strategy='mean')
    numerical_imputer.fit(iris_df[numerical_features])
    
    iris_df[numerical_features] = numerical_imputer.transform(iris_df[numerical_features])

# Get the target variable
y = iris_df['Species']

# Drop the target variable from the original DataFrame to get the features
X = iris_df.drop(columns=['Species'])

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Missing values in each column:
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64
Shape of X_train: (120, 5)
Shape of X_test: (30, 5)
Shape of y_train: (120,)
Shape of y_test: (30,)


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
iris_df = pd.read_csv("Iris.csv")

# Explore the dataset
print("Dataset Overview:")
print(iris_df.head())
print("\nDataset Information:")
print(iris_df.info())
print("\nSummary Statistics:")
print(iris_df.describe())

# Check for missing values
print("\nMissing values in each column:")
print(iris_df.isnull().sum())

# Preprocess the data
X = iris_df.drop(columns=['Species'])  # Features
y = iris_df['Species']  # Target variable
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the KNN model
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Make predictions with KNN
y_pred_knn = knn_classifier.predict(X_test)

# Evaluate KNN model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("\nKNN Accuracy:", accuracy_knn)

print("\nKNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

print("\nKNN Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))

# Train the Decision Tree model
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions with Decision Tree
y_pred_dt = dt_classifier.predict(X_test)

# Evaluate Decision Tree model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("\nDecision Tree Accuracy:", accuracy_dt)

print("\nDecision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

print("\nDecision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))


Dataset Overview:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
me

In [15]:
# Evaluate the SVM model
print("\nEvaluation Metrics for SVM Model:")
accuracy_svm = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy_svm)

print("\nClassification Report for SVM Model:")
print(classification_report(y_test, y_pred))

# Evaluate the KNN model
print("\nEvaluation Metrics for KNN Model:")
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy:", accuracy_knn)

print("\nClassification Report for KNN Model:")
print(classification_report(y_test, y_pred_knn))

# Evaluate the Decision Tree model
print("\nEvaluation Metrics for Decision Tree Model:")
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy_dt)

print("\nClassification Report for Decision Tree Model:")
print(classification_report(y_test, y_pred_dt))



Evaluation Metrics for SVM Model:
Accuracy: 1.0

Classification Report for SVM Model:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       1.00      1.00      1.00        11

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30


Evaluation Metrics for KNN Model:
Accuracy: 1.0

Classification Report for KNN Model:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       1.00      1.00      1.00        11

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30


Evaluation Metrics fo

In [20]:
from sklearn.metrics import confusion_matrix

# Train the SVM model
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = svm_classifier.predict(X_test)

# Evaluate the SVM model
print("\nEvaluation Metrics for SVM Model:")
accuracy_svm = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy_svm)

print("\nClassification Report for SVM Model:")
print(classification_report(y_test, y_pred))

# Obtain the confusion matrix for SVM Model
conf_matrix_svm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix for SVM Model:")
print(conf_matrix_svm)



Evaluation Metrics for SVM Model:
Accuracy: 1.0

Classification Report for SVM Model:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       1.00      1.00      1.00        11

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30


Confusion Matrix for SVM Model:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [26]:
import numpy as np

# Generate random data for testing
num_samples = 4
num_features = 4  # Assuming 4 features: SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm

# Generate random values for each feature
X_new = np.random.rand(num_samples, num_features) * 10  # Scale the random values to a range of 0 to 10

print("Sample New Data:")
print(X_new)


Sample New Data:
[[8.87356029 5.94635796 6.05468319 8.16400432]
 [1.49478335 0.45646617 8.57540739 4.4047873 ]
 [3.27646298 2.12464147 4.31310585 3.54617175]
 [2.62689379 8.97743014 5.34101102 8.23145929]]


In [5]:
# Step 1: Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 2: Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train an initial model (e.g., SVM)
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Step 5: Evaluate the initial model (e.g., SVM)
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Initial Model (SVM) Accuracy:", accuracy_svm)
print("Initial Model (SVM) Classification Report:\n", classification_report(y_test, y_pred_svm))

# Step 6: Experiment with a different algorithm (e.g., Random Forest)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Step 7: Evaluate the alternative algorithm (e.g., Random Forest)
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Alternative Algorithm (Random Forest) Accuracy:", accuracy_rf)
print("Alternative Algorithm (Random Forest) Classification Report:\n", classification_report(y_test, y_pred_rf))

# Step 14: Finalize the model and deploy it
# (Finalization and deployment code not included in this example)


Initial Model (SVM) Accuracy: 1.0
Initial Model (SVM) Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Alternative Algorithm (Random Forest) Accuracy: 1.0
Alternative Algorithm (Random Forest) Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Step 1: Initialize the SVM classifier
svm_classifier = SVC()

# Step 2: Perform k-fold cross-validation
cv_scores = cross_val_score(svm_classifier, X_train_scaled, y_train, cv=5)

# Step 3: Calculate the mean and standard deviation of the cross-validation scores
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()

# Step 4: Print the mean and standard deviation of the cross-validation scores
print("Mean Cross-Validation Score:", mean_cv_score)
print("Standard Deviation of Cross-Validation Scores:", std_cv_score)


Mean Cross-Validation Score: 0.95
Standard Deviation of Cross-Validation Scores: 0.06123724356957944


In [12]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 2: Train the Random Forest classifier on the training data
rf_classifier.fit(X_train_scaled, y_train)

# Step 3: Make predictions on the test data
y_pred_rf = rf_classifier.predict(X_test_scaled)

# Step 4: Evaluate the Random Forest classifier
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy of Random Forest Classifier:", accuracy_rf)


Accuracy of Random Forest Classifier: 1.0
