<a href="https://colab.research.google.com/github/kavyasriarum/Machine_leaning_Models/blob/main/KMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 3.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=40e902305cac117ed9aaf0d4101cc315bed300ee23b01f5cf31b75a44a5a966d
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from urllib.request import urlretrieve

In [None]:
# Step 1: Download and load the Iris dataset
# The file has no header row, so column names and types are declared up front.
# An explicit schema avoids the extra inference pass over the file and keeps the
# column types fixed regardless of the data that happens to be downloaded.
IRIS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
IRIS_PATH = "iris.csv"
IRIS_SCHEMA = (
    "sepal_length DOUBLE, sepal_width DOUBLE, "
    "petal_length DOUBLE, petal_width DOUBLE, `class` STRING"
)

urlretrieve(IRIS_URL, IRIS_PATH)
spark = SparkSession.builder.appName("IrisKMeans").getOrCreate()
# Single assignment (no `df = df.toDF(...)` rebind) so the cell is idempotent on rerun.
df = spark.read.csv(IRIS_PATH, header=False, schema=IRIS_SCHEMA)

In [None]:
# Step 2: Build a K-means model
# k=3 is only an initial guess; other values of k are compared in Step 4.
k = 3
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
# Pack the four numeric columns into one vector column named "features",
# which is the column KMeans reads when no featuresCol is given.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_assembled = assembler.transform(df)
kmeans = KMeans(k=k, seed=1)  # fixed seed so reruns give the same clustering
model = kmeans.fit(df_assembled)

# Print the cluster centers and cluster sizes
print("Cluster Centers:")
for center in model.clusterCenters():
    print(center)
# The training summary reports how many rows were assigned to each cluster.
print("Cluster Sizes:")
print(model.summary.clusterSizes)

Cluster Centers:
[5.9016129  2.7483871  4.39354839 1.43387097]
[5.006 3.418 1.464 0.244]
[6.85       3.07368421 5.74210526 2.07105263]


In [None]:
# Step 3: Report the original performance using Silhouette score
# Create the evaluator first (it is reused when other k values are compared),
# then score the cluster assignments produced by the model fitted in Step 2.
evaluator = ClusteringEvaluator()
predictions = model.transform(df_assembled)
silhouette = evaluator.evaluate(predictions)
print("Original Silhouette Score:", silhouette)

Original Silhouette Score: 0.7354567373091194


In [None]:
# Step 4: Try different k values to improve performance
# Fit one model per candidate k and remember the k with the highest silhouette.
# The loop uses candidate_* names so it does not overwrite `k`, `kmeans`,
# `model`, `predictions` or `silhouette` from Steps 2 and 3; otherwise rerunning
# Step 3 after this cell would report the last candidate as the "original" score.
best_k = 0
best_silhouette = -1.0  # lower bound of the silhouette range, so the first score wins
for candidate_k in range(2, 13):  # Trying k values from 2 to 12
    candidate_model = KMeans(k=candidate_k, seed=1).fit(df_assembled)
    candidate_predictions = candidate_model.transform(df_assembled)
    candidate_silhouette = evaluator.evaluate(candidate_predictions)
    print("k =", candidate_k, "Silhouette Score:", candidate_silhouette)
    if candidate_silhouette > best_silhouette:
        best_k = candidate_k
        best_silhouette = candidate_silhouette

k = 2 Silhouette Score: 0.8501515983265806
k = 3 Silhouette Score: 0.7354567373091194
k = 4 Silhouette Score: 0.6720731409257744
k = 5 Silhouette Score: 0.6155691231448028
k = 6 Silhouette Score: 0.5517166229578094
k = 7 Silhouette Score: 0.4932559686632637
k = 8 Silhouette Score: 0.512000408959464
k = 9 Silhouette Score: 0.5211325007507351
k = 10 Silhouette Score: 0.6017516470706848
k = 11 Silhouette Score: 0.44993721397644093
k = 12 Silhouette Score: 0.4863667747325271


In [None]:
# Step 5: Select the best k and print out the result
# Report the winning k and its silhouette score found by the sweep in Step 4.
best_result_parts = (
    "Best k =",
    best_k,
    "gives the best performance, Silhouette =",
    best_silhouette,
)
print(*best_result_parts)

Best k = 2 gives the best performance, Silhouette = 0.8501515983265806
