In [1]:
import findspark
# Run findspark before importing pyspark so the local Spark install is found.
findspark.init()

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Python Spark Clustering")
    .getOrCreate()
)

## 1. Import Data and Query

In [2]:
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
license_csv_path = "D:/Spark/tugas cluster/driver-license-permit-and-non-driver-identification-cards-issued-as-of-august-30-2017.csv"

# First row is the header; column types are inferred from the data.
df = spark.read.csv(license_csv_path, header=True, inferSchema=True)

In [3]:
# Inspect the column names and the types produced by inferSchema.
df.schema

StructType(List(StructField(Year of Birth,IntegerType,true),StructField(Sex,StringType,true),StructField(City,StringType,true),StructField(State,StringType,true),StructField(Zip,StringType,true),StructField(Residence County,StringType,true),StructField(License Class,StringType,true),StructField(Status,StringType,true),StructField(Privilege,StringType,true),StructField(Year of Expiration,IntegerType,true)))

In [4]:
# Print a preview of the loaded rows.
df.show()

+-------------+---+---------------+-----+-----+----------------+-------------+---------+---------+------------------+
|Year of Birth|Sex|           City|State|  Zip|Residence County|License Class|   Status|Privilege|Year of Expiration|
+-------------+---+---------------+-----+-----+----------------+-------------+---------+---------+------------------+
|         1950|  F|       GLENMONT|   NY|12077|          ALBANY|            D|    VALID|     FULL|              2020|
|         1977|  F| WEST HEMPSTEAD|   NY|11552|          NASSAU|            D|    VALID|     FULL|              2019|
|         1967|  M|      SMITHTOWN|   NY|11787|         SUFFOLK|            D|    VALID|     FULL|              2019|
|         1989|  M|       BROOKLYN|   NY|11213|           KINGS|            D|    VALID|     FULL|              2020|
|         1994|  F|        BXVILLE|   NY|10708|     WESTCHESTER|            D|    VALID|     FULL|              2023|
|         1960|  M|       NEW YORK|   NY|10025|        N

In [5]:
# Register the DataFrame as the temporary view "License" so spark.sql can query it.
df.createOrReplaceTempView("License")

In [78]:
# Male licence holders born between 1980 and 2000 (inclusive), keeping only
# the columns used by the later cells.
license_sql = "SELECT `City`, `Year of Birth`, `Year of Expiration`, `License Class` \
                    FROM License \
                    WHERE `Year of Birth` BETWEEN 1980 AND 2000 \
                    AND Sex='M'"
query = spark.sql(license_sql)

In [79]:
# Print a preview of the rows selected by the SQL query.
query.show()

+------------+-------------+------------------+-------------+
|        City|Year of Birth|Year of Expiration|License Class|
+------------+-------------+------------------+-------------+
|    BROOKLYN|         1989|              2020|            D|
|    NEW YORK|         1981|              2020|            I|
|       BRONX|         1996|              2020|     D PERMIT|
|   EAST OTTO|         2000|              2021|    DJ PERMIT|
|LONG IS CITY|         1994|              2019|            D|
|   BELLEROSE|         1986|              2023|            D|
|     MAHOPAC|         1995|              2024|            D|
| RANSOMVILLE|         1993|              2022|            D|
| STEPHENTOWN|         1984|              2021|           DM|
|    BROOKLYN|         1980|              2022|            D|
|    BROOKLYN|         1982|              2024|            D|
|   HAUPPAUGE|         1991|              2017|     D PERMIT|
|WATKINS GLEN|         1993|              2022|            D|
|       

In [80]:
# Number of rows selected by the SQL query.
query.count()

2774999

## 2. Vector Assembler

In [81]:
from pyspark.ml.feature import VectorAssembler

# Pack the two numeric year columns into a single vector column named
# 'features', which the clustering step below is fitted on.
assembler = VectorAssembler(
    inputCols=["Year of Birth", "Year of Expiration"],
    outputCol='features')

# Drop any existing output column before transforming so this cell can be
# re-run: VectorAssembler raises if the output column already exists.
# drop() is a no-op when the column is absent, so the first run is unchanged.
query = assembler.transform(query.drop(assembler.getOutputCol()))

In [82]:
# Print a preview of the rows, now including the assembled 'features' column.
query.show()

+------------+-------------+------------------+-------------+---------------+
|        City|Year of Birth|Year of Expiration|License Class|       features|
+------------+-------------+------------------+-------------+---------------+
|    BROOKLYN|         1989|              2020|            D|[1989.0,2020.0]|
|    NEW YORK|         1981|              2020|            I|[1981.0,2020.0]|
|       BRONX|         1996|              2020|     D PERMIT|[1996.0,2020.0]|
|   EAST OTTO|         2000|              2021|    DJ PERMIT|[2000.0,2021.0]|
|LONG IS CITY|         1994|              2019|            D|[1994.0,2019.0]|
|   BELLEROSE|         1986|              2023|            D|[1986.0,2023.0]|
|     MAHOPAC|         1995|              2024|            D|[1995.0,2024.0]|
| RANSOMVILLE|         1993|              2022|            D|[1993.0,2022.0]|
| STEPHENTOWN|         1984|              2021|           DM|[1984.0,2021.0]|
|    BROOKLYN|         1980|              2022|            D|[19

## 3. kMeans

In [83]:
from pyspark.ml.clustering import KMeans

# Five clusters, with a fixed seed so the initialisation is repeatable.
kmeans = KMeans(k=5, seed=1)

# Fit on the DataFrame that carries the assembled 'features' column.
model = kmeans.fit(query)

## 4. Prediction

In [84]:
# Apply the fitted model to every row; the output gains a 'prediction' column.
prediction = model.transform(query)
# Print a preview of the rows with their assigned cluster.
prediction.show()

+------------+-------------+------------------+-------------+---------------+----------+
|        City|Year of Birth|Year of Expiration|License Class|       features|prediction|
+------------+-------------+------------------+-------------+---------------+----------+
|    BROOKLYN|         1989|              2020|            D|[1989.0,2020.0]|         3|
|    NEW YORK|         1981|              2020|            I|[1981.0,2020.0]|         4|
|       BRONX|         1996|              2020|     D PERMIT|[1996.0,2020.0]|         1|
|   EAST OTTO|         2000|              2021|    DJ PERMIT|[2000.0,2021.0]|         1|
|LONG IS CITY|         1994|              2019|            D|[1994.0,2019.0]|         1|
|   BELLEROSE|         1986|              2023|            D|[1986.0,2023.0]|         0|
|     MAHOPAC|         1995|              2024|            D|[1995.0,2024.0]|         2|
| RANSOMVILLE|         1993|              2022|            D|[1993.0,2022.0]|         2|
| STEPHENTOWN|       

## 5. Evaluate

In [85]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Score the clustering with the evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(prediction)

# print() joins its arguments with a single space, so the text is unchanged.
print("Silhouette with squared euclidean distance =", silhouette)

Silhouette with squared euclidean distance = 0.6000210625334547


## 6. Cluster Center

In [86]:
# One centre per cluster; in the output each is a
# [Year of Birth, Year of Expiration] pair.
centers = model.clusterCenters()

print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[1985.94496334 2022.76302178]
[1997.47876262 2019.53306976]
[1993.9084316  2023.17896686]
[1989.92296319 2019.39982705]
[1982.03307073 2020.15344342]


## 7. Visualization

In [87]:
import pixiedust

In [88]:
# Render the clustered DataFrame for visual inspection.
# NOTE(review): presumably this is pixiedust's display(), made available by
# the `import pixiedust` cell before it; confirm.
display(prediction)

![cluster](img/img1.png)