<a href="https://colab.research.google.com/github/jmarcano101/data110/blob/main/Week11_Clustring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Generate reproducible random data: 300 points drawn uniformly from [0, 1) in 2D.
# A seeded Generator is used so the figure is identical after Restart & Run All.
rng = np.random.default_rng(42)
data = rng.random((300, 2))

# Initialize the KMeans model with 3 clusters.
# - random_state fixes the centroid initialisation so cluster labels are repeatable.
# - n_init is spelled out because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit the model and get the cluster label of every point in a single step
y_kmeans = kmeans.fit_predict(data)

# Plot the points, coloured by their assigned cluster
fig, ax = plt.subplots()
ax.scatter(data[:, 0], data[:, 1], c=y_kmeans, s=50, cmap='viridis')

# Overlay the fitted centroids in red
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75)
ax.set_title("K-means Clustering")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the dataset and drop rows with missing values in the two bill columns.
# dropna() returns a new frame, so nothing is mutated in place.
penguins = sns.load_dataset('penguins').dropna(subset=['bill_length_mm', 'bill_depth_mm'])

# Select features (raw millimetre values; intentionally not standardized here)
features = penguins[['bill_length_mm', 'bill_depth_mm']]

# Apply K-means clustering (you can experiment with the number of clusters).
# random_state makes the labels repeatable; n_init is explicit because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features)

# Plot the results
fig, ax = plt.subplots(figsize=(10, 6))
# Keep a handle on the cluster-coloured scatter: the colorbar must be built
# from this artist, not from whichever scatter happened to be drawn last.
cluster_points = ax.scatter(features['bill_length_mm'], features['bill_depth_mm'],
                            c=clusters, cmap='viridis')
centroids = kmeans.cluster_centers_
ax.scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, marker='X')  # mark centroids
ax.set_title('Penguins Clustering based on Bill Dimensions (Non-standardized)')
ax.set_xlabel('Bill Length (mm)')
ax.set_ylabel('Bill Depth (mm)')
# One tick per cluster id, since the labels are discrete integers
fig.colorbar(cluster_points, ax=ax, label='Cluster', ticks=list(range(kmeans.n_clusters)))
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load the dataset: X holds the feature matrix, y the true species labels
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Apply K-means clustering on the full feature matrix.
# random_state makes the cluster assignment repeatable; n_init is explicit
# because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Dimensionality reduction for visualization only: clustering above used all
# features, PCA just projects the points onto two axes for plotting.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X)

# Plot the results side by side
fig, (ax_truth, ax_kmeans) = plt.subplots(1, 2, figsize=(12, 6))
colors = ['navy', 'turquoise', 'darkorange']

# Left panel: actual species
for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
    species_mask = y == i
    ax_truth.scatter(reduced_data[species_mask, 0], reduced_data[species_mask, 1],
                     color=color, alpha=0.8, label=target_name)
ax_truth.set_title('Ground Truth (Iris Species)')
ax_truth.set_xlabel('PCA 1')
ax_truth.set_ylabel('PCA 2')
ax_truth.legend()

# Right panel: K-means clusters.
# Cluster ids are arbitrary, so a colour here does not necessarily correspond
# to the species drawn in the same colour on the left.
for color, i in zip(colors, range(3)):
    cluster_mask = clusters == i
    ax_kmeans.scatter(reduced_data[cluster_mask, 0], reduced_data[cluster_mask, 1],
                      color=color, alpha=0.8, label=f'Cluster {i+1}')
ax_kmeans.set_title('K-means Clustering')
ax_kmeans.set_xlabel('PCA 1')
ax_kmeans.set_ylabel('PCA 2')  # was missing on this panel
ax_kmeans.legend()

fig.tight_layout()
plt.show()


In [None]:
# Peek at a small batch of uniform [0, 1) samples: 10 rows, 2 columns.
preview = np.random.rand(10, 2)
print(preview)