# Iris Flower Classification with FastAPI


## Step 1: Import Libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.datasets import load_iris

## Step 2: Load the Dataset

Load the Iris.csv dataset and inspect its structure.

In [9]:
# Load the dataset
df = pd.read_csv('Iris.csv')

# Drop the 'Id' column as it's not a feature
df = df.drop(columns=['Id'])

# Display first few rows
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Step 3: Data Cleaning

Check for missing values, duplicates, and outliers in numerical features.

In [11]:
# Check for missing values
print('Missing values:\n', df.isnull().sum())

# Check for duplicates
print('Duplicate rows:', df.duplicated().sum())

# Remove duplicates
df = df.drop_duplicates()

# Handle outliers using IQR for numerical features
numerical_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

df = remove_outliers(df, numerical_cols)


Missing values:
 SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64
Duplicate rows: 0


## Step 4: Feature Engineering

Add new features (e.g., sepal area, petal area) and scale numerical features.

In [12]:
# Create new features
df['sepal_area'] = df['SepalLengthCm'] * df['SepalWidthCm']
df['petal_area'] = df['PetalLengthCm'] * df['PetalWidthCm']

# Features and target
features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'sepal_area', 'petal_area']
X = df[features]
y = df['Species']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

## Step 5: Train-Test Split

Split the data into training and testing sets.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print('Training set shape:', X_train.shape)
print('Test set shape:', X_test.shape)

Training set shape: (114, 6)
Test set shape: (29, 6)


## Step 6: Train the Model

Train a RandomForestClassifier and evaluate with cross-validation.

In [14]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Cross-validation
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean()}')

# Evaluate on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy}')

Cross-Validation Accuracy Scores: [0.96551724 0.93103448 0.96551724 0.96428571 1.        ]
Mean CV Accuracy: 0.9652709359605911
Test Accuracy: 0.896551724137931


## Step 7: Save the Model

Save the trained model for FastAPI.

In [15]:
joblib.dump(model, 'model.pkl')
print('Model saved as model.pkl')

Model saved as model.pkl
