<a href="https://colab.research.google.com/github/kimanirobbi/wk-3-ai/blob/main/Task_1_Classical_ML_(Iris_Decision_Tree).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.impute import SimpleImputer

# -----------------------------------------------------------
# 1. Load the Iris data and take a first look
# -----------------------------------------------------------
print("--- Task 1: Classical ML (Decision Tree on Iris) ---")
iris = load_iris()

# Species names; later code looks these up by integer class label.
target_names = iris.target_names
# Targets go into a Series, features into a DataFrame with named columns.
y = pd.Series(iris.target)
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Sanity report: dimensions, class names, and a preview of the feature table.
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Target Species: {target_names}")
print("\nFirst 5 rows of data:")
print(X.head(n=5))

# -----------------------------------------------------------
# 2. Data Preprocessing (Simulated Missing Value & Encoding)
# -----------------------------------------------------------

# 2a. Simulating a Missing Value (Best Practice Example)
# Iris dataset is clean, so we simulate a missing value for demonstration:
# one NaN in the first feature of the second row.
X_with_nans = X.copy()
X_with_nans.iloc[1, 0] = np.nan

# 2b. Label Encoding (Iris targets are already numeric, but we demonstrate)
# If the target was a string array (e.g., ['setosa', 'versicolor', 'virginica']),
# we would use LabelEncoder or OneHotEncoder. Since y is already numeric, this
# maps the integers to themselves but confirms the encoding pipeline.
# Encoding happens before the split because the split stratifies on it.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# -----------------------------------------------------------
# 3. Data Splitting, Imputation and Model Training
# -----------------------------------------------------------

# Split the data into training and testing sets (70/30 split) BEFORE imputing.
# Fitting the imputer on the full dataset would let test rows influence the
# fill value (data leakage), so the raw frame with the NaN is split first.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_with_nans, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Initialize Imputer (using median for robustness against outliers).
# The median is learned from the training rows only and then reused,
# unchanged, to fill any gaps in the test rows.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)
# Recombine the two imputed parts in the original row order so the name
# X_imputed stays available to any later code that expects the full frame.
X_imputed = pd.concat([X_train, X_test]).sort_index()
print("\nMissing values imputation complete (median strategy applied).")
print(f"Encoded classes: {le.classes_}")
print(f"\nTraining samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

# Initialize and train the Decision Tree Classifier
# (fixed random_state keeps the fitted tree reproducible between runs).
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
print("Decision Tree Classifier training complete.")

# -----------------------------------------------------------
# 4. Score the trained tree on the held-out test set
# -----------------------------------------------------------

# Predicted class labels for every test row
y_pred = dt_classifier.predict(X_test)

# Precision and recall share the same averaging options: 'macro' scores each
# class separately and takes the unweighted mean (this is a multi-class
# problem), and zero_division=0 is passed through to both metrics.
macro_options = {'average': 'macro', 'zero_division': 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro_options)
recall = recall_score(y_test, y_pred, **macro_options)

# Report all three metrics to four decimal places
print("\n--- Model Evaluation Results (Decision Tree) ---")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision (Macro): {precision:.4f}")
print(f"Recall (Macro):    {recall:.4f}")

# Spot-check: compare the prediction with the true label for one test row
sample_index = 0
# Selecting with a list of positions keeps the row as a one-row DataFrame
# that carries the feature column names into predict().
sample_features = X_test.iloc[[sample_index]]
sample_prediction = dt_classifier.predict(sample_features)[0]
# y_test is indexed by position here, matching the .iloc position used above
true_label = y_test[sample_index]

print(f"\nTest Sample Index {sample_index}:")
print(f"Features: {X_test.iloc[sample_index].values}")
# Integer labels are turned back into species names via target_names
print(f"True Species: {target_names[true_label]} (Label {true_label})")
print(f"Predicted Species: {target_names[sample_prediction]} (Label {sample_prediction})")

--- Task 1: Classical ML (Decision Tree on Iris) ---
Features (X) shape: (150, 4)
Target (y) shape: (150,)
Target Species: ['setosa' 'versicolor' 'virginica']

First 5 rows of data:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Missing values imputation complete (median strategy applied).
Encoded classes: [0 1 2]

Training samples: 105, Testing samples: 45
Decision Tree Classifier training complete.

--- Model Evaluation Results (Decision Tree) ---
Accuracy:  0.9333
Precision (Macro): 0.9444
Recall (Macro):    0.9333

Test Sample Index 0:
Features: [7.3 2.9 6.3 1.8]
True Species: vi