In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
def print_section(title):
    """Print ``title`` as a banner framed by two 80-character '=' rules.

    A blank line is emitted before the top rule so consecutive sections are
    visually separated in the cell output.
    """
    rule = "=" * 80
    # The leading empty string combined with sep="\n" yields the blank line
    # that precedes the top rule.
    print("", rule, title, rule, sep="\n")

def load_and_prepare_data():
    """Load the Iris dataset and build a labelled DataFrame view of it.

    As a side effect, prints the section banner, the shape of the feature
    matrix, and the first rows of the DataFrame.

    Returns:
        tuple: ``(X, y, feature_names, target_names, df)`` where ``X`` and
        ``y`` are ``iris.data`` and ``iris.target``, ``feature_names`` and
        ``target_names`` are taken unchanged from the loaded dataset, and
        ``df`` holds one column per feature plus ``target`` (the integer
        label) and ``target_name`` (that label looked up in
        ``target_names``).
    """
    print_section("1. Loading and Preparing the Data")

    # Load the Iris dataset
    iris = load_iris()
    X = iris.data
    y = iris.target
    feature_names = iris.feature_names
    target_names = iris.target_names

    # Create a DataFrame for better visualization; `target_name` is what the
    # pair plot later uses as its hue column.
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y
    df['target_name'] = [target_names[i] for i in y]

    print("Dataset Shape:", X.shape)
    print("\nFirst few rows of the dataset:")
    print(df.head())

    return X, y, feature_names, target_names, df

def visualize_original_data(df):
    """Save a seaborn pair plot of the dataset to ``pair_plot.png``.

    Args:
        df: DataFrame handed to ``sns.pairplot``. It must contain a
            ``target_name`` column, which is used as the hue.

    The plot is written to the current working directory and then closed,
    so nothing is left open for the notebook to render inline.
    """
    print_section("2. Visualizing Original Data")
    print("Creating pair plot to show relationships between features...")

    # pairplot builds its own figure, so no plt.figure() call is made first:
    # a pre-created figure would stay empty, survive the close below, and be
    # rendered by the notebook as "<Figure size ... with 0 Axes>".
    grid = sns.pairplot(df, hue='target_name', diag_kind='hist')
    grid.figure.savefig('pair_plot.png')
    # Close exactly the figure that was drawn instead of relying on whichever
    # figure pyplot currently treats as active.
    plt.close(grid.figure)
    print("Pair plot saved as 'pair_plot.png'")

def perform_pca(X, y, target_names):
    """Standardize ``X``, fit PCA on it, and save two diagnostic plots.

    Side effects:
        * writes ``pca_variance_ratio.png`` (bar chart of each component's
          explained-variance ratio with the cumulative ratio as a step line);
        * prints the per-component ratios and the cumulative ratio of the
          first two components;
        * writes ``pca_2d_plot.png`` (samples projected on the first two
          components, one scatter series per entry of ``target_names``).

    Args:
        X: Feature matrix passed to ``StandardScaler.fit_transform``.
        y: Labels; samples with ``y == i`` are drawn under ``target_names[i]``.
        target_names: Sequence of class labels used for the legend.

    Returns:
        tuple: ``(X_scaled, X_pca)`` -- the standardized features and their
        PCA projection.
    """
    print_section("3. Principal Component Analysis (PCA)")

    # Scale first, then project the scaled features.
    X_scaled = StandardScaler().fit_transform(X)
    pca = PCA()
    X_pca = pca.fit_transform(X_scaled)

    ratios = pca.explained_variance_ratio_
    running_total = np.cumsum(ratios)
    component_ids = range(1, len(ratios) + 1)

    # Figure 1: individual (bars) and cumulative (step) explained variance.
    var_fig, var_ax = plt.subplots(figsize=(10, 6))
    var_ax.bar(component_ids, ratios, alpha=0.5, align='center')
    var_ax.step(component_ids, running_total, where='mid')
    var_ax.set_ylabel('Explained variance ratio')
    var_ax.set_xlabel('Principal components')
    var_ax.set_title('Explained Variance Ratio by Principal Components')
    var_fig.savefig('pca_variance_ratio.png')
    plt.close(var_fig)

    # Text summary of the same numbers.
    print("Explained variance ratio by component:")
    for number, ratio in enumerate(ratios, start=1):
        print(f"PC{number}: {ratio:.3f}")
    print(f"\nTotal variance explained by first 2 components: {running_total[1]:.3f}")

    # Figure 2: samples on the first two components, one series per class.
    proj_fig, proj_ax = plt.subplots(figsize=(10, 8))
    for class_idx, class_label in enumerate(target_names):
        members = y == class_idx
        proj_ax.scatter(X_pca[members, 0], X_pca[members, 1], label=class_label)
    proj_ax.set_xlabel('First Principal Component')
    proj_ax.set_ylabel('Second Principal Component')
    proj_ax.set_title('PCA of Iris Dataset')
    proj_ax.legend()
    proj_fig.savefig('pca_2d_plot.png')
    plt.close(proj_fig)

    return X_scaled, X_pca

def perform_lda(X_scaled, y, target_names):
    """Fit a two-component LDA on ``X_scaled`` and save a scatter plot.

    Writes ``lda_2d_plot.png`` showing the samples on the two discriminant
    axes, one scatter series per entry of ``target_names``.

    Args:
        X_scaled: Feature matrix the LDA is fitted on.
        y: Class labels used both for fitting and for grouping the points
            (``y == i`` is drawn under ``target_names[i]``).
        target_names: Sequence of class labels used for the legend.

    Returns:
        The LDA-transformed data produced by ``fit_transform``.
    """
    print_section("4. Linear Discriminant Analysis (LDA)")

    # Supervised projection onto two discriminant axes.
    projector = LinearDiscriminantAnalysis(n_components=2)
    X_lda = projector.fit_transform(X_scaled, y)

    # One scatter series per class on a single axes.
    fig, ax = plt.subplots(figsize=(10, 8))
    for class_idx, class_label in enumerate(target_names):
        members = y == class_idx
        ax.scatter(X_lda[members, 0], X_lda[members, 1], label=class_label)
    ax.set_xlabel('First Linear Discriminant')
    ax.set_ylabel('Second Linear Discriminant')
    ax.set_title('LDA of Iris Dataset')
    ax.legend()
    fig.savefig('lda_2d_plot.png')
    plt.close(fig)

    return X_lda

def compare_methods(X_pca, X_lda, y, target_names):
    """Draw the PCA and LDA projections side by side and save the figure.

    Writes ``comparison_plot.png``: the left panel uses the first two columns
    of ``X_pca``, the right panel the first two columns of ``X_lda``. In both
    panels samples with ``y == i`` are labelled ``target_names[i]``.
    """
    print_section("5. Comparison of PCA and LDA")

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Each entry describes one panel: (data, x-label, y-label, title).
    panels = (
        (X_pca, 'First Principal Component', 'Second Principal Component', 'PCA'),
        (X_lda, 'First Linear Discriminant', 'Second Linear Discriminant', 'LDA'),
    )

    for ax, (points, x_label, y_label, heading) in zip(axes, panels):
        for class_idx, class_label in enumerate(target_names):
            members = y == class_idx
            ax.scatter(points[members, 0], points[members, 1], label=class_label)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(heading)
        ax.legend()

    fig.tight_layout()
    fig.savefig('comparison_plot.png')
    plt.close(fig)

## Load and prepare data

In [3]:
# Load Iris once: X, y and target_names feed the PCA/LDA cells below, and df
# feeds the pair plot.
X, y, feature_names, target_names, df = load_and_prepare_data()
    
    


1. Loading and Preparing the Data
Dataset Shape: (150, 4)

First few rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target target_name  
0       0      setosa  
1       0      setosa  
2       0      setosa  
3       0      setosa  
4       0      setosa  


## Visualize original data

In [4]:
# The helper saves the pair plot to 'pair_plot.png' on disk rather than
# returning it.
visualize_original_data(df)


2. Visualizing Original Data
Creating pair plot to show relationships between features...


  self._figure.tight_layout(*args, **kwargs)


Pair plot saved as 'pair_plot.png'


<Figure size 1200x800 with 0 Axes>

## Perform PCA

In [5]:
# perform_pca standardizes X itself; X_scaled is kept because the LDA cell
# below reuses it as its input.
X_scaled, X_pca = perform_pca(X, y, target_names)



3. Principal Component Analysis (PCA)
Explained variance ratio by component:
PC1: 0.730
PC2: 0.229
PC3: 0.037
PC4: 0.005

Total variance explained by first 2 components: 0.958


## Perform LDA

In [6]:
# LDA is fitted on the standardized features returned by the PCA step, with y
# as the class labels.
X_lda = perform_lda(X_scaled, y, target_names)


4. Linear Discriminant Analysis (LDA)


## Compare methods

In [7]:
# Side-by-side PCA vs. LDA scatter plots, written to 'comparison_plot.png'.
compare_methods(X_pca, X_lda, y, target_names)


5. Comparison of PCA and LDA
