In [None]:
# Load a sample dataset (e.g., the Iris dataset) and visualize its features.

import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the Iris bunch and build a table with one column per measurement,
# plus the integer class label stored under 'target'.
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Pairwise scatter plots coloured by class label, with KDE curves on the diagonal.
sns.pairplot(data=data, hue='target', diag_kind="kde", palette='bright')
# y=1.02 lifts the title slightly above the top edge of the figure.
plt.suptitle("Pairplot of Iris Dataset", y=1.02)
plt.show()


In [None]:
# Apply PCA to reduce the dataset's dimensionality while retaining maximum variance.

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize each feature (zero mean, unit variance) so that no single
# measurement dominates the principal components because of its scale.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(iris.data)

# Keep only the first two principal components so the result can be drawn in 2D.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Tabulate the projected coordinates alongside each sample's class label.
pca_df = pd.DataFrame(
    pca_data,
    columns=['Principal Component 1', 'Principal Component 2'],
).assign(target=iris.target)

# Fraction of the total variance carried by each retained component.
print("Explained Variance Ratio:", pca.explained_variance_ratio_)


In [None]:
# Plot the data in the reduced 2D space to observe separability between classes.

# Draw the two principal components against each other, one colour per class
# label, using an explicit figure/axes pair instead of pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=pca_df,
    x='Principal Component 1',
    y='Principal Component 2',
    hue='target',
    palette='bright',
    ax=ax,
)
ax.set_title("PCA-Reduced Data")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
# Rebuild the legend so it carries a title.
ax.legend(title="Target")
plt.show()


In [None]:
# Use the explained variance ratio to determine the number of components needed for PCA.

# Fit PCA with every component kept so the full variance spectrum is available.
pca_full = PCA()
pca_full.fit(scaled_data)
explained_variance_ratio = pca_full.explained_variance_ratio_

# Running total: entry k-1 is the share of variance captured by the first k components.
cumulative_variance = explained_variance_ratio.cumsum()
component_counts = list(range(1, len(explained_variance_ratio) + 1))

# Plot cumulative explained variance against the number of components kept.
plt.figure(figsize=(8, 6))
plt.plot(component_counts, cumulative_variance, marker='o')
# The x-axis is a count of components: pin the ticks to whole numbers so the
# axis does not show meaningless fractional values such as 1.5.
plt.xticks(component_counts)
plt.title("Cumulative Explained Variance by PCA Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.grid()
plt.show()


In [None]:
# Reconstruct the original data from the principal components and compare the reconstruction loss.

from sklearn.metrics import mean_squared_error

# Round trip: compress the scaled data to two components, then map the
# compressed points back into the original (scaled) feature space.
pca_reduced = PCA(n_components=2)
reduced_data = pca_reduced.fit_transform(scaled_data)
reconstructed_data = pca_reduced.inverse_transform(reduced_data)

# Reconstruction loss: mean squared difference between the scaled inputs and
# their reconstruction from the two retained components.
reconstruction_loss = mean_squared_error(y_true=scaled_data, y_pred=reconstructed_data)
print("Reconstruction Loss (MSE):", reconstruction_loss)


In [None]:
# Use PCA as a preprocessing step for classification tasks.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

# Split the RAW measurements first. Splitting the already-projected `pca_data`
# would leak information, because that scaler and PCA were fitted on every
# sample, including the ones held out for testing.
# NOTE: X_train / X_test therefore hold the original features, not the 2D projection.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# Standardize -> 2-component PCA -> random forest, chained so that every step
# is fitted on the training split only. StandardScaler and PCA are the classes
# imported in the PCA cell above.
clf = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    RandomForestClassifier(random_state=42),
)
clf.fit(X_train, y_train)

# Evaluate the classifier: the pipeline applies the training-set scaling and
# projection to X_test before predicting.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on PCA-Reduced Data:", accuracy)
