# 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import random

# 2. Load Dataset

In [None]:
# Read the lab dataset from its fixed location on disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='C:/ML/Labwork2/dataset/data.csv')

# Report the (rows, columns) shape, then preview the first five rows; the
# trailing expression is what the notebook cell displays.
print("Dataset shape:", df.shape)
df.head(n=5)


# 3. Data Preprocessing
Keep only the numeric columns, drop rows with missing values, and standardize the features.

In [None]:
# Keep only the numeric columns and discard every row that has a missing value.
# dropna() is chained rather than called with inplace=True: the in-place form
# mutates the intermediate frame returned by select_dtypes, which pandas may
# flag with SettingWithCopyWarning. The chained call yields an independent frame.
df_numeric = df.select_dtypes(include=[np.number]).dropna()

# Standardize every feature (zero mean, unit variance) so that no single
# column dominates the distance-based steps that follow (PCA, KMeans).
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_numeric)

# 4. PCA to 2D and 3D

In [None]:
# One PCA model per target dimensionality; both are fitted on the same
# standardized matrix.
pca_2d, pca_3d = PCA(n_components=2), PCA(n_components=3)

# Fit each model and keep the projected coordinates for plotting/clustering.
data_pca_2d = pca_2d.fit_transform(data_scaled)
data_pca_3d = pca_3d.fit_transform(data_scaled)

# 5. Visualize in 2D and 3D

In [None]:
# Scatter all samples in the plane of the first two principal components.
plt.figure(figsize=(8, 6))
plt.scatter(
    x=data_pca_2d[:, 0],
    y=data_pca_2d[:, 1],
    alpha=0.5,
)

# Decorations: grid, axis labels and title.
plt.grid(True)
plt.xlabel(xlabel="PC1")
plt.ylabel(ylabel="PC2")
plt.title(label="PCA - 2D Projection")

# Save the figure to disk first, then display it.
plt.savefig("C:/ML/Labwork2/visualizations/PCA_2D_projection.png")
plt.show()

# Optional: the same projection in 3D, using the first three components.
# NOTE(review): Axes3D is never referenced by name; the import is presumably
# kept so that older Matplotlib releases register the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One point per sample, positioned by its PC1/PC2/PC3 coordinates.
ax.scatter(
    data_pca_3d[:, 0],
    data_pca_3d[:, 1],
    data_pca_3d[:, 2],
    alpha=0.5,
)

# Title and axis labels applied in a single call.
ax.set(
    title="PCA - 3D Projection",
    xlabel="PC1",
    ylabel="PC2",
    zlabel="PC3",
)

# Save the figure to disk first, then display it.
plt.savefig("C:/ML/Labwork2/visualizations/PCA_3D_projection.png")
plt.show()

# 6. KMeans Clustering Before PCA

In [None]:
# Baseline: cluster the full standardized feature space into three groups.
# fit_predict fits the model and returns the same assignments that are
# stored on the estimator as labels_.
kmeans_orig = KMeans(n_clusters=3, random_state=42)
labels_orig = kmeans_orig.fit_predict(data_scaled)

# Silhouette score is computed in the same space the clustering was run in.
silhouette_orig = silhouette_score(X=data_scaled, labels=labels_orig)
print("Silhouette Score (Original Data):", silhouette_orig)

# 7. KMeans After PCA

In [None]:
# Same clustering setup (k=3, fixed seed), but on the 2D PCA projection.
# fit_predict fits the model and returns the same assignments that are
# stored on the estimator as labels_.
kmeans_pca = KMeans(n_clusters=3, random_state=42)
labels_pca = kmeans_pca.fit_predict(data_pca_2d)

# Score the clusters in the reduced space they were found in.
silhouette_pca = silhouette_score(X=data_pca_2d, labels=labels_pca)
print("Silhouette Score (After PCA - 2D):", silhouette_pca)

# 8. Visualize PCA Clusters

In [None]:
# 2D PCA scatter, colored by the KMeans cluster assigned to each sample.
plt.figure(figsize=(8, 6))
plt.scatter(
    x=data_pca_2d[:, 0],
    y=data_pca_2d[:, 1],
    c=labels_pca,
    cmap='viridis',
    alpha=0.6,
)

# Decorations: grid, axis labels and title.
plt.grid(True)
plt.xlabel(xlabel="PC1")
plt.ylabel(ylabel="PC2")
plt.title(label="KMeans Clusters after PCA")

# Save the figure to disk first, then display it.
plt.savefig("C:/ML/Labwork2/visualizations/KMeans_Clusters_after_PCA.png")
plt.show()

# 9. Random Subspace Clustering

In [None]:
# Draw a reproducible random subset of the numeric features.
random.seed(42)
numeric_columns = list(df_numeric.columns)

# random.sample raises ValueError when k exceeds the population size, so the
# subspace size is capped at the number of available columns. With 20 or more
# columns the draw is unchanged.
subspace_size = min(20, len(numeric_columns))
subspace_features = random.sample(numeric_columns, k=subspace_size)
data_subspace = df_numeric[subspace_features]

# Use a dedicated scaler: refitting the shared `scaler` here would overwrite
# the statistics it learned on the full feature set and break any later
# scaler.transform / inverse_transform call on the full data.
subspace_scaler = StandardScaler()
data_subspace_scaled = subspace_scaler.fit_transform(data_subspace)

# Cluster in the random subspace with the same settings as the other runs
# (k=3, fixed seed) so the silhouette scores are comparable.
kmeans_subspace = KMeans(n_clusters=3, random_state=42)
kmeans_subspace.fit(data_subspace_scaled)
labels_subspace = kmeans_subspace.labels_
silhouette_subspace = silhouette_score(data_subspace_scaled, labels_subspace)
print("Silhouette Score (Random Subspace):", silhouette_subspace)

# 10. Summary

In [None]:
# Print the three silhouette scores as one aligned table, rounded to four
# decimals. A single print with a newline separator emits the same text as
# one print call per row.
print(
    "\n--- Summary of Silhouette Scores ---",
    f"Original Data        : {silhouette_orig:.4f}",
    f"After PCA (2D)       : {silhouette_pca:.4f}",
    f"Random Subspace (20) : {silhouette_subspace:.4f}",
    sep="\n",
)