In [2]:
# Import essential libraries for data handling, visualization, and machine learning
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [3]:
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names

# Split the raw data first (80% train, 20% test) so that no statistics
# computed from the test rows can leak into the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features: learn mean/std from the training rows only, then
# apply that same fitted transform to the held-out test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset under the train-fitted scaling. Kept so that any cell that
# still refers to X_scaled keeps working; do not use it to fit models, since
# it contains the test rows.
X_scaled = scaler.transform(X)


In [4]:
# This function trains the model, predicts on test data, prints key metrics,
# and returns them so results can be compared programmatically
def evaluate_model(model, X_train, X_test, y_train, y_test, name="Model"):
    """Fit ``model`` on the training split and report test-set metrics.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place, so the caller's object is (re)trained by this call.
    X_train, X_test : feature matrices for the training and test splits.
    y_train, y_test : class labels for the training and test splits.
    name : str, label printed in the report header.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision_macro"``, ``"recall_macro"``,
        ``"f1_macro"`` and ``"confusion_matrix"``, holding exactly the values
        that are printed.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute every metric once so the printed report and the returned
    # dictionary can never disagree.
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision_macro": precision_score(y_test, y_pred, average='macro'),
        "recall_macro": recall_score(y_test, y_pred, average='macro'),
        "f1_macro": f1_score(y_test, y_pred, average='macro'),
        "confusion_matrix": confusion_matrix(y_test, y_pred),
    }

    print(f"\n=== {name} ===")
    print("Accuracy:", metrics["accuracy"])
    print("Precision (macro):", metrics["precision_macro"])
    print("Recall (macro):", metrics["recall_macro"])
    print("F1 Score (macro):", metrics["f1_macro"])
    print("Confusion Matrix:\n", metrics["confusion_matrix"])

    return metrics


In [5]:
# Define models
# The tree-based estimators draw random numbers while training, so they are
# seeded to make a fresh "Restart & Run All" reproduce the same metrics.
# The seed matches the one used for the train/test split.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model on the full (unreduced) feature set
for name, model in models.items():
    evaluate_model(model, X_train, X_test, y_train, y_test, name)



=== Logistic Regression ===
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 Score (macro): 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

=== Decision Tree ===
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 Score (macro): 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

=== Random Forest ===
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 Score (macro): 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

=== SVM ===
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 Score (macro): 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [6]:
# Project the features onto two principal components. PCA is unsupervised:
# the projection is learned from the training features alone and then
# applied unchanged to the test features.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Re-run every classifier on the 2-D PCA representation
for name, model in models.items():
    evaluate_model(
        model,
        X_train_pca,
        X_test_pca,
        y_train,
        y_test,
        name=f"{name} + PCA",
    )



=== Logistic Regression + PCA ===
Accuracy: 0.9
Precision (macro): 0.9027777777777778
Recall (macro): 0.8956228956228957
F1 Score (macro): 0.8976982097186701
Confusion Matrix:
 [[10  0  0]
 [ 0  7  2]
 [ 0  1 10]]

=== Decision Tree + PCA ===
Accuracy: 0.9333333333333333
Precision (macro): 0.9326599326599326
Recall (macro): 0.9326599326599326
F1 Score (macro): 0.9326599326599326
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  1 10]]

=== Random Forest + PCA ===
Accuracy: 0.9333333333333333
Precision (macro): 0.9326599326599326
Recall (macro): 0.9326599326599326
F1 Score (macro): 0.9326599326599326
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  1 10]]

=== SVM + PCA ===
Accuracy: 0.9
Precision (macro): 0.9027777777777778
Recall (macro): 0.8956228956228957
F1 Score (macro): 0.8976982097186701
Confusion Matrix:
 [[10  0  0]
 [ 0  7  2]
 [ 0  1 10]]


In [7]:
# Project the features onto two linear discriminants. LDA is supervised:
# fitting uses the training labels, and the learned projection is then
# applied unchanged to the test features.
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

# Re-run every classifier on the 2-D LDA representation
for name, model in models.items():
    evaluate_model(
        model,
        X_train_lda,
        X_test_lda,
        y_train,
        y_test,
        name=f"{name} + LDA",
    )



=== Logistic Regression + LDA ===
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 Score (macro): 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

=== Decision Tree + LDA ===
Accuracy: 0.9666666666666667
Precision (macro): 0.9722222222222222
Recall (macro): 0.9629629629629629
F1 Score (macro): 0.9658994032395567
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]

=== Random Forest + LDA ===
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 Score (macro): 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

=== SVM + LDA ===
Accuracy: 1.0
Precision (macro): 1.0
Recall (macro): 1.0
F1 Score (macro): 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


Model Performance in Three Scenarios
Models were evaluated in three settings:

Without dimensionality reduction

With PCA

With LDA

Without reduction, all four models (Logistic Regression, Decision Tree, Random Forest, and SVM) classified every sample in the 30-sample test set correctly, so no model stood out from the others.

With PCA (2 components), performance dropped slightly across all models, since PCA is unsupervised and may discard class-relevant variance. In the run shown above, the tree-based models (accuracy ≈ 0.93) handled it slightly better than SVM and Logistic Regression (accuracy 0.90).

With LDA, performance matched the unreduced baseline for Logistic Regression, Random Forest, and SVM (accuracy 1.0), while the Decision Tree dropped slightly (accuracy ≈ 0.97). Since LDA uses class labels to maximize separability, it’s highly effective for classification.



Effect of Dimensionality Reduction
PCA may reduce accuracy due to information loss, as it doesn't consider class labels.

LDA generally improves or maintains metrics by preserving class distinctions.

Tree-based models are less impacted by dimensionality reduction but don’t benefit as much from LDA.