In [3]:
import pandas as pd  # data manipulation  
from sklearn.model_selection import train_test_split  # train/test split  
from sklearn.impute import SimpleImputer  # missing‐value imputation  
from sklearn.compose import ColumnTransformer  # column‐wise transformations  
from sklearn.ensemble import RandomForestClassifier  # classifier  
from sklearn.pipeline import Pipeline  # pipeline builder  
from sklearn import metrics  # evaluation metrics  
import joblib  # persistence  

# Read the iris CSV (path is relative to the working directory) into a DataFrame.
# NOTE(review): the rendered frame carries an 'Unnamed: 0' column, which looks
# like a row index that was written into the CSV — confirm against the file.
data = pd.read_csv(filepath_or_buffer='iris.csv')

# Keep the frame as the cell's last expression so the notebook renders it.
data


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [4]:
# Split into features and target.
# Select the four measurement columns explicitly. Dropping only 'species'
# would also keep the 'Unnamed: 0' column that read_csv produced from the
# CSV's unnamed first column. In the displayed frame that column is a running
# row number and the rows appear grouped by species (setosa first, virginica
# last), so using it as a feature would leak the target into the model and
# force every future caller of the saved pipeline to supply a meaningless column.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = data[feature_columns]  # Feature matrix: measurements only
y = data['species']  # 'species' is the target variable

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)

# Median imputation: missing values are filled with the column median.
median_imputer = SimpleImputer(strategy='median')

# Wrap the imputer in a ColumnTransformer that targets every column of X.
# remainder='passthrough' forwards any column not named in a transformer unchanged.
preprocessor = ColumnTransformer(
    transformers=[('num_imputer', median_imputer, X.columns)],
    remainder='passthrough',
)

# Random forest classifier; random_state is fixed for reproducibility.
rf_classifier = RandomForestClassifier(random_state=23)

# Chain the stages in order: preprocessing first, then classification.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit the whole pipeline (imputer, then classifier) on the training split.
pipeline.fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred = pipeline.predict(X_test)

# Print precision, recall and f1-score for each class.
report = metrics.classification_report(y_test, y_pred)
print(report)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        18
Iris-versicolor       0.93      1.00      0.97        14
 Iris-virginica       1.00      0.92      0.96        13

       accuracy                           0.98        45
      macro avg       0.98      0.97      0.98        45
   weighted avg       0.98      0.98      0.98        45



In [8]:
import os  # filesystem utilities

# Make sure the directory the model is written to exists before saving.
# The pipeline is dumped to '../app/iris.mdl', so the directory to create is
# '../app' (one level above the working directory), not './app'.
os.makedirs('../app', exist_ok=True)  # No error if the directory already exists

In [10]:
# Persist the fitted pipeline into the app folder so it can be loaded later.
model_path = '../app/iris.mdl'
joblib.dump(pipeline, model_path)

['../app/iris.mdl']