# Unsupervised ML

This notebook will be loading in the data, and then running time-series k-means clustering by count on the following:

1. Pickups in Chicago
2. Pickups in Hyde Park (pre-program)
3. Pickups in Hyde Park (program)

Here's the Apache documentation I'll be drawing inspiration from:

https://spark.apache.org/docs/latest/ml-clustering.html
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.clustering.KMeans.html

And here's the article that helped me out:

https://www.influxdata.com/blog/why-use-k-means-for-time-series-data-part-one/
https://www.influxdata.com/blog/why-use-k-means-for-time-series-data-part-two/
https://www.influxdata.com/blog/why-use-k-means-for-time-series-data-part-three/

In [15]:
# read in packages create spark environment
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
%matplotlib inline
import geopandas as gpd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator

spark = SparkSession.builder.appName('unsupervised').getOrCreate()

#change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])

#print spark configuration settings
spark.sparkContext.getConf().getAll()

[('spark.stage.maxConsecutiveAttempts', '10'),
 ('spark.dynamicAllocation.minExecutors', '1'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.submit.pyFiles',
  '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-

### Reading in cleaned data, partitioning

In [2]:
# read in rideshare data for all years, concatenate, create appropriate partitioning
# we are dropping 2020 because covid will affect the performance of our model

df_2018 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2018.csv", inferSchema=True, header=True)
df_2019 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2019.csv", inferSchema=True, header=True)
df_2021 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2021.csv", inferSchema=True, header=True)
df_2022 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2022.csv", inferSchema=True, header=True)
df_2023 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2023.csv", inferSchema=True, header=True)

# dropping new columns in 2023
df_2023 = df_2023.drop('Shared Trip Match','Percent Time Chicago','Percent Distance Chicago')

df_all = df_2018.union(df_2019).union(df_2021).union(df_2022).union(df_2023)
df_all.show(5)

                                                                                

+--------------------+-------------------+-------------------+-------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+-----+------------+----+---+
|                  ID|    start_timestamp|      end_timestamp|seconds|miles|pickup_tract|dropoff_tract|pickup_area|dropoff_area|Fare|Tip|total|   pickup_lat|    pickup_lon|  dropoff_lat|   dropoff_lon|month|day_of_month|hour|day|
+--------------------+-------------------+-------------------+-------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+-----+------------+----+---+
|625e77ae6e0ff7191...|2018-11-06 19:00:00|2018-11-06 19:15:00|   1142|  5.8| 17031063400|  17031010400|          6|           1|12.5|  0| 15.0|41.9346591566|-87.6467297286| 42.004764559| -87.659122427|   11|           6|  19|  3|
|62945fdb2e70957f0...|2018-11-06 19:00:00|2018-11-06 19:00:00|    341|  1.2| 170

In [3]:
#display number of records by partition
def displaypartitions(df):
    #number of records by partition
    num = df.rdd.getNumPartitions()
    print("Partitions:", num)
    df.withColumn("partitionId", F.spark_partition_id())\
        .groupBy("partitionId")\
        .count()\
        .orderBy(F.asc("count"))\
        .show(num)

df_all.rdd.getNumPartitions()
displaypartitions(df_all)

Partitions: 544




+-----------+------+
|partitionId| count|
+-----------+------+
|         42|305254|
|         41|305316|
|         40|305420|
|         38|305471|
|         39|305480|
|         37|305618|
|         36|305676|
|         35|305871|
|         34|305890|
|         33|305962|
|         32|305971|
|         31|306010|
|         29|306031|
|         30|306038|
|         28|306086|
|         27|306127|
|         26|306402|
|         25|306467|
|         24|306633|
|         23|306731|
|         22|307226|
|        243|328837|
|        242|328975|
|        241|329131|
|        240|329163|
|        239|329209|
|        237|329245|
|        235|329263|
|        238|329263|
|        234|329311|
|        236|329315|
|        232|329332|
|        233|329344|
|        231|329373|
|        228|329389|
|        229|329390|
|        227|329399|
|        225|329410|
|        226|329410|
|        224|329418|
|        230|329427|
|        223|329428|
|        220|329461|
|        222|329481|
|        221|

                                                                                

In [4]:
# repartitioning to 600 partitions, seems to be balanced now. 
df_all = df_all.repartition(600)
displaypartitions(df_all)



Partitions: 600




+-----------+------+
|partitionId| count|
+-----------+------+
|         25|362151|
|         33|362151|
|         32|362151|
|         38|362151|
|         39|362152|
|         29|362152|
|        598|362152|
|         15|362152|
|         37|362152|
|        597|362152|
|         17|362152|
|         16|362152|
|         24|362152|
|         26|362152|
|         40|362152|
|          8|362153|
|         13|362153|
|         41|362153|
|         11|362153|
|         34|362153|
|         35|362153|
|         31|362153|
|         18|362153|
|         14|362153|
|         27|362153|
|         30|362153|
|         20|362153|
|         23|362153|
|         63|362154|
|         22|362154|
|         64|362154|
|         10|362154|
|         48|362154|
|         19|362154|
|        599|362154|
|          5|362154|
|         21|362154|
|          9|362154|
|        525|362154|
|          3|362154|
|         42|362154|
|         28|362154|
|        462|362154|
|        596|362154|
|         36|

                                                                                

In [5]:
# we will need a year column in this model:
df_all = df_all.withColumn('year', F.year(df_all.start_timestamp))

## Next steps

In [6]:
# Check packages:
%pip freeze

access @ file:///home/conda/feedstock_root/build_artifacts/access_1696558639912/work
affine @ file:///home/conda/feedstock_root/build_artifacts/affine_1674245120525/work
aiohttp @ file:///home/conda/feedstock_root/build_artifacts/aiohttp_1696765416168/work
aiosignal @ file:///home/conda/feedstock_root/build_artifacts/aiosignal_1667935791922/work
alabaster @ file:///home/conda/feedstock_root/build_artifacts/alabaster_1673645646525/work
alembic @ file:///home/conda/feedstock_root/build_artifacts/alembic_1698347477885/work
amply @ file:///home/conda/feedstock_root/build_artifacts/amply_1687675480808/work
ansiwrap==0.8.4
anyio @ file:///home/conda/feedstock_root/build_artifacts/anyio_1688651106312/work/dist
appdirs @ file:///home/conda/feedstock_root/build_artifacts/appdirs_1603108395799/work
argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1692818318753/work
argon2-cffi-bindings @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi-bindings_16953865480

In [7]:
df_all.printSchema()

root
 |-- ID: string (nullable = true)
 |-- start_timestamp: timestamp (nullable = true)
 |-- end_timestamp: timestamp (nullable = true)
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- pickup_tract: long (nullable = true)
 |-- dropoff_tract: long (nullable = true)
 |-- pickup_area: integer (nullable = true)
 |-- dropoff_area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: integer (nullable = true)
 |-- total: double (nullable = true)
 |-- pickup_lat: double (nullable = true)
 |-- pickup_lon: double (nullable = true)
 |-- dropoff_lat: double (nullable = true)
 |-- dropoff_lon: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- year: integer (nullable = true)



## Clustering Analysis:
### First we're going to run a time series k-means clustering on the entire City of Chicago. 

In [17]:
df_all = df_all.na.drop()

In [19]:
# Clustering by pick_up area. Understanding the most-popular spots in the city to call a rideshare and their locations:
feature_cols = ["pickup_area", "pickup_lat", "pickup_lon"]

# Step 1: Vector Assembly:
feature_assembler = VectorAssembler(inputCols=feature_cols, outputCol="feature_vector")
vector_assembler = VectorAssembler(inputCols=["feature_vector"], outputCol="features")

# Step 2: Normalization:
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)

# Step 3: K-Means Clustering:
kmeans = KMeans(k=3, seed=1, featuresCol="scaled_features", predictionCol="prediction")

# Step 4: Model Training:
pipeline = Pipeline(stages=[feature_assembler, vector_assembler, scaler, kmeans])
model = pipeline.fit(df_all)

# Step 5: Prediction:
predictions = model.transform(df_all)

# Evaluate clustering by computing Silhouette score:
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))



Silhouette with squared euclidean distance = 0.6724456252751287


                                                                                

In [20]:
# Show the resulting clusters
centers = model.stages[-1].clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[    1.80161164   869.01481622 -1588.97018395]
[    4.18828318   871.29960529 -1593.60879614]
[ 5.21132153e-01  8.70095495e+02 -1.58910819e+03]


In [24]:
# Display cluster assignments
chicago_clustering = predictions.select("pickup_area", "pickup_lat", "pickup_lon", "features", "scaled_features", "prediction")

In [25]:
# Seeing 20 of the results:
chicago_clustering.show()

[Stage 303:>                                                        (0 + 1) / 1]

+-----------+-------------+--------------+--------------------+--------------------+----------+
|pickup_area|   pickup_lat|    pickup_lon|            features|     scaled_features|prediction|
+-----------+-------------+--------------+--------------------+--------------------+----------+
|         41|41.8016710371|-87.5942656985|[41.0,41.80167103...|[2.26101319928869...|         0|
|          5|41.9448137543|-87.6907750098|[5.0,41.944813754...|[0.27573331698642...|         2|
|          6|41.9359889065|-87.6709663837|[6.0,41.935988906...|[0.33087998038371...|         2|
|          6| 41.936159071|-87.6612652184|[6.0,41.936159071...|[0.33087998038371...|         2|
|         23|41.9066839592|-87.7103539349|[23.0,41.90668395...|[1.26837325813756...|         2|
|         41|41.8012268363|-87.5853031602|[41.0,41.80122683...|[2.26101319928869...|         0|
|          6| 41.942577185|-87.6470785093|[6.0,41.942577185...|[0.33087998038371...|         2|
|          6| 41.942577185|-87.647078509

                                                                                

In [None]:
# Convert Spark DataFrame to Pandas DataFrame
pandas_df = chicago_clustering.toPandas()

# Plotting chicago_clustering:
gdf = gpd.GeoDataFrame(pandas_df, geometry=gpd.points_from_xy(pandas_df['pickup_lon'].astype(float), pandas_df['pickup_lat'].astype(float)))

# Plot the clusters
fig, ax = plt.subplots(figsize=(10, 8))
gdf.plot(ax=ax, column='prediction', legend=True, markersize=50, cmap='viridis', legend_kwds={'label': "Cluster"})

# Add labels and title
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Clusters of Pickup Areas')

# Show the plot
plt.show()

                                                                                