# Iris Decision Tree - Preprocessing & Training

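This notebook loads the Iris dataset, checks for missing values, label-encodes
the species column, and trains a Decision Tree classifier on a stratified 80/20 train/test split.
It then evaluates the model using accuracy, macro-averaged precision and recall, and a per-class classification report.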

Steps are commented inline for clarity.

In [None]:
# 1. Imports and loading the Iris dataset
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load the bundled Iris dataset and unpack the pieces used by later cells.
iris = datasets.load_iris()
X, y = iris['data'], iris['target']
feature_names = iris['feature_names']
target_names = iris['target_names']

# Assemble the working DataFrame: one column per measurement, plus a
# categorical `species` column built from the integer target codes.
df = pd.DataFrame(X, columns=feature_names).assign(
    species=pd.Categorical.from_codes(y, target_names)
)

# Preview the first rows as the cell output.
df.head()

In [None]:
# 2. Check & handle missing values
print('Missing values per column:')
print(df.isnull().sum())

# Impute if necessary (Iris dataset has no missing values by default).
# The feature columns are selected by name rather than by position
# (`iloc[:, :-1]`): once the label-encoding cell has appended
# `species_encoded`, a positional slice would also include the categorical
# `species` column, which a mean imputer cannot process. Selecting by name
# keeps this cell safe to re-run in any order.
# NOTE: the imputer is fit on the full DataFrame, i.e. before the
# train/test split performed later in the notebook.
if df.isnull().values.any():
    imputer = SimpleImputer(strategy='mean')
    df[feature_names] = imputer.fit_transform(df[feature_names])
    print('Imputation done (mean).')
else:
    print('No missing values detected.')

In [None]:
# 3. Map each species label to an integer code
# Fit the encoder on the species column first, then apply it to the same column.
le = LabelEncoder().fit(df['species'])
df['species_encoded'] = le.transform(df['species'])
print('Classes:', le.classes_)

# Show the distinct (label, code) pairs as the cell output.
df.loc[:, ['species', 'species_encoded']].drop_duplicates()

In [None]:
# 4. Build the feature matrix and target vector, then split into train/test
X = df[feature_names].to_numpy()
y_enc = df['species_encoded'].to_numpy()

# Hold out 20% of the rows for testing; stratifying on the encoded target
# keeps the class proportions the same in both splits, and the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)

print('Train size:', len(X_train), 'Test size:', len(X_test))

In [None]:
# 5. Fit a Decision Tree on the training split and predict the test split
# The fixed random_state keeps the fitted tree reproducible across runs.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Predictions on test set:', y_pred)

In [None]:
# 6. Evaluate: accuracy, precision, recall, and full report
acc = accuracy_score(y_test, y_pred)
# Macro averaging weights every class equally; zero_division=0 makes a class
# with an undefined score contribute 0.
prec_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
rec_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)

print(f'Accuracy: {acc:.4f}')
print(f'Precision (macro): {prec_macro:.4f}')
print(f'Recall (macro): {rec_macro:.4f}')

# A single-backslash '\n' emits a blank line before the header; the previous
# '\\n' printed the two literal characters "\n" instead.
print('\nClassification report:')
# Per-class precision/recall/F1, labelled with the original species names.
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))