<a href="https://colab.research.google.com/github/inesclopes/data-eng-proc/blob/main/spark_jobs/extract_carris_vehicles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Authenticate with Google Cloud


In [1]:
!gcloud auth application-default login

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=Kqwsss0DOrMwH6YEeUWRdUHFXlSBB9&prompt=consent&token_usage=remote&access_type=offline&code_challenge=oRaYsRuO7xXkt9DOkdQ0MNg1m7fr4yjA1ucG3VHkqhg&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: 4/0AanRRrtnOzZ2Cs-erxxWDtN9re5rPIk7Oh0Xh9dKbA6mobrOVe0-8wS1OKEtye00RXDH4Q

Credentials saved to file: [/content/.config/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Ca

# Step 2: Install Spark and BigQuery connector

In [2]:
# Install OpenJDK 8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Spark from a reliable source
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz

# Extract the downloaded Spark tarball
!tar xf spark-3.2.1-bin-hadoop3.2.tgz

!rm -rf spark-3.2.1-bin-hadoop3.2.tgz # No longer needed

# Install the GCS Connector
!rm -rf /content/spark-3.2.1-bin-hadoop3.2/jars/gcs-connector-hadoop3-latest.jar
!wget -q https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar -P /content/spark-3.2.1-bin-hadoop3.2/jars/

# Install the BigQuery Connector
!rm -rf /content/spark-3.2.1-bin-hadoop3.2/jars/spark-bigquery-with-dependencies_2.12-0.29.0.jar
!wget -q https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.29.0.jar -P /content/spark-3.2.1-bin-hadoop3.2/jars/

# Install Findspark
!pip install -q findspark


^C
^C


In [18]:
%pip install pyspark



# Step 3: Set environment variables for Spark and Java

In [3]:
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/.config/application_default_credentials.json"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"

# Step 4: Initialize Spark session

In [4]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("GCS to BigQuery Streaming") \
    .config("spark.jars", "/content/spark-3.2.1-bin-hadoop3.2/jars/gcs-connector-hadoop3-latest.jar,/content/spark-3.2.1-bin-hadoop3.2/jars/spark-bigquery-with-dependencies_2.12-0.29.0.jar") \
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .getOrCreate()

print("Spark Session successfully created!")


Spark Session successfully created!


# Step 5: Define GCS input path and output BigQuery table

In [5]:
input_path = "gs://edit-de-project-streaming-data/carris-vehicles"
output_table = "data-eng-dev-437916.data_eng_project_group3_raw"

In [6]:
!rm -rf content/lake/processing/

# Step 6: Read streaming data from GCS

In [7]:
# Define the schema for your JSON files
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType
from pyspark.sql.functions import min, max, first, last, col, window
from pyspark.sql.functions import window, from_unixtime, col, to_timestamp

schema = StructType([
    StructField("bearing", FloatType(), True),
    StructField("block_id", StringType(), True),
    StructField("current_status", StringType(), True),
    StructField("id", StringType(), True),
    StructField("lat", FloatType(), True),
    StructField("line_id", StringType(), True),
    StructField("lon", FloatType(), True),
    StructField("pattern_id", StringType(), True),
    StructField("route_id", StringType(), True),
    StructField("schedule_relationship", StringType(), True),
    StructField("shift_id", StringType(), True),
    StructField("speed", FloatType(), True),
    StructField("stop_id", StringType(), True),
    StructField("timestamp", LongType(), True),
    StructField("trip_id", StringType(), True)
])

# Create the streaming DataFrame with the defined schema
streaming_df = spark.readStream \
    .format("json") \
    .schema(schema) \
    .load(input_path)

In [8]:
from pyspark.sql.functions import from_unixtime, to_timestamp

def transform_data(df, batch_id):

  transformed_df = df.withColumn("timestamp", to_timestamp(from_unixtime("timestamp")))
  window_spec = window("timestamp", "2 minutes")

  # Group by vehicle ID and window, then get the first and last timestamps and lat/lon values
  result_df = (transformed_df.groupBy("id", window_spec)
      .agg(
          min("timestamp").alias("first_timestamp"),
          max("timestamp").alias("last_timestamp"),
          first("lat").alias("first_lat"),
          first("lon").alias("first_lon"),
          last("lat").alias("last_lat"),
          last("lon").alias("last_lon")
      ).orderBy("last_timestamp", ascending=False).dropDuplicates(["id"])
      )

  return result_df.write.mode("append").format("parquet").save("content/lake/processing/vehicles/data")



In [9]:
import math
def haversine(lat1, lon1, lat2, lon2):
    try:
      # Earth radius in kilometers
      R = 6371.0

      # Convert latitude and longitude from degrees to radians
      lat1_rad, lon1_rad = math.radians(lat1), math.radians(lon1)
      lat2_rad, lon2_rad = math.radians(lat2), math.radians(lon2)

      # Differences
      delta_lat = lat2_rad - lat1_rad
      delta_lon = lon2_rad - lon1_rad

      # Haversine formula
      a = math.sin(delta_lat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2)**2
      c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

      # Distance
      distance = R * c
      return distance
    except:
      return 0

# Example usage
lat1, lon1 = 52.2296756, 21.0122287  # Warsaw
lat2, lon2 = 41.8919300, 12.5113300  # Rome
distance = haversine(lat1, lon1, lat2, lon2)

In [11]:
df = (spark.readStream.option("maxFilesPerTrigger", 1)
    .format("json")
    .schema(schema)
    .load(input_path))


query = (df.writeStream
.outputMode('append')
.option('checkpointLocation', 'content/lake/processing/vehicles_checkpoint') #using a common checkpoint for both messages streams.
.trigger(processingTime='5 seconds')
.foreachBatch(transform_data)
.start()
)

query.awaitTermination(60)

query.stop()


ERROR:py4j.clientserver:There was an exception while executing the Python Proxy on the Python Side.
Traceback (most recent call last):
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/clientserver.py", line 581, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/sql/utils.py", line 196, in call
    raise e
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/sql/utils.py", line 193, in call
    self.func(DataFrame(jdf, self.sql_ctx), batch_id)
  File "<ipython-input-8-42739f2eb610>", line 20, in transform_data
    return result_df.write.mode("append").format("parquet").save("content/lake/processing/vehicles/data")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/sql/readwriter.py", line 740,

In [12]:
df = spark.read.format("parquet").load("content/lake/processing/vehicles/data")


In [19]:
df.show()

+--------+--------------------+-------------------+-------------------+---------+---------+---------+---------+
|      id|              window|    first_timestamp|     last_timestamp|first_lat|first_lon| last_lat| last_lon|
+--------+--------------------+-------------------+-------------------+---------+---------+---------+---------+
| 41|1349|{2025-01-16 15:46...|2025-01-16 15:47:41|2025-01-16 15:47:41| 38.75672|-9.250352| 38.75672|-9.250352|
|  41|541|{2025-01-16 15:46...|2025-01-16 15:47:38|2025-01-16 15:47:38|38.802677|-9.361051|38.802677|-9.361051|
| 42|1186|{2025-01-16 15:48...|2025-01-16 15:48:06|2025-01-16 15:48:06|38.824844|-9.166352|38.824844|-9.166352|
| 42|2304|{2025-01-16 15:46...|2025-01-16 15:47:56|2025-01-16 15:47:56| 38.87158|-9.060005| 38.87158|-9.060005|
| 42|2333|{2025-01-16 15:48...|2025-01-16 15:48:08|2025-01-16 15:48:08| 38.81158|-9.118885| 38.81158|-9.118885|
| 42|2405|{2025-01-16 15:48...|2025-01-16 15:48:07|2025-01-16 15:48:07| 38.84585|-9.087123| 38.84585|-9.

In [11]:
# prompt: create a new column that applies haversine to each row to the values of columns first_lat, first_lon, last_lat and last_lon

from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType,   StringType, IntegerType

# Define the UDF for Haversine calculation
def haversine_udf(lat1):

    return "data"

haversine_udf = F.udf(haversine)

# Apply the UDF to create the new column


In [312]:
# prompt: create a new df from de first 10 lines of existing dfs

new_df = df.limit(10)


In [20]:
# UDF examples

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show()
slen = udf(lambda s: len(s), IntegerType())

@udf
def to_upper(s):
    if s is not None:
        return s.upper()

@udf(returnType=IntegerType())
def add_one(x):
    if x is not None:
        return x + 1

df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show()


Traceback (most recent call last):
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 563, in dump
    return Pickler.dump(self, obj)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 653, in reducer_override
    return self._function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 526, in _function_reduce
    return self._dynamic_function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/

PicklingError: Could not serialize object: IndexError: tuple index out of range

In [54]:
data = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))

Traceback (most recent call last):
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 563, in dump
    return Pickler.dump(self, obj)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 653, in reducer_override
    return self._function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 526, in _function_reduce
    return self._dynamic_function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/

PicklingError: Could not serialize object: IndexError: tuple index out of range

In [302]:


new_df.select(add_dpto_name(col('id'))).show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 603, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 449, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 251, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
           ^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: code() argument 13 must be str, not int


In [290]:
df.select(col("id"), \
    convertUDF(col("id")).alias("Name") ) \
   .show(truncate=False)

Traceback (most recent call last):
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 563, in dump
    return Pickler.dump(self, obj)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 653, in reducer_override
    return self._function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/cloudpickle/cloudpickle_fast.py", line 526, in _function_reduce
    return self._dynamic_function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/

PicklingError: Could not serialize object: IndexError: tuple index out of range

In [None]:
# prompt: filter the result_df by id and get only the most recent timestamp

from pyspark.sql.functions import max

# Assuming 'result_df' is the DataFrame from the previous code block
most_recent_df = result_df.groupBy("id").agg(max("last_timestamp").alias("last_timestamp"))

# Join with the original result_df to get other relevant information
final_df = most_recent_df.join(result_df, ["id", "last_timestamp"], "inner")

# Show the final result
final_df.filter("id = '42|2570'").show()

In [None]:
result_df.orderBy("last_timestamp", ascending=False).dropDuplicates(["id"]).filter("id = '42|2570'").show()

+-------+--------------------+-------------------+-------------------+---------+---------+--------+---------+
|     id|              window|    first_timestamp|     last_timestamp|first_lat|first_lon|last_lat| last_lon|
+-------+--------------------+-------------------+-------------------+---------+---------+--------+---------+
|42|2570|{2025-01-16 12:20...|2025-01-16 12:20:22|2025-01-16 12:21:07|38.813778|-9.167581|38.81523|-9.173496|
+-------+--------------------+-------------------+-------------------+---------+---------+--------+---------+



In [None]:
import tempfile

import time

_ = spark.sql("DROP TABLE IF EXISTS my_table2")


    # Create a table with Rate source.
    query = (spark.readStream.option("maxFilesPerTrigger", 1)
    .format("json")
    .schema(schema)
    .load(input_path).withColumn("timestamp", to_timestamp(from_unixtime("timestamp")))
    .writeStream
             .toTable(

            "vehicles_table",

            queryName='read_query',

            outputMode="append",

            format='parquet',

            checkpointLocation=d))

    time.sleep(10)

    query.stop()



In [None]:
spark.read.table("vehicles_table").show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+----------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id| timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+----------+--------------------+
|  183.0|20250116-64010165...| IN_TRANSIT_TO|44|12745| 38.76761|   4720|-9.100393|  4720_0_1|  4720_0|            SCHEDULED|113260234560|      0.0| 060011|1737026213|4720_0_1|2700|111...|
|   58.0|20250116-64010273...| IN_TRANSIT_TO|44|12557| 38.57138|   4641| -9.03899|  4641_0_2|  4641_0|            SCHEDULED|111500234560|      0.0| 150013|1737026203|4641_0_2|2700|102...|
|   12.0|20250116-64010076...|    STOPPED_AT|44|12092| 38.70

In [None]:
with tempfile.TemporaryDirectory() as d:
    # Create a stream from the input data
    df = (spark.readStream
          .option("maxFilesPerTrigger", 1)
          .format("json")
          .schema(schema)  # Define the schema for the input data
          .load(input_path))

    # Apply transformation to the DataFrame
    transformed_df = (df
                       .withColumn("timestamp", to_timestamp(from_unixtime("timestamp"))))  # Example transformation

    # Write the transformed data to the table
    query = (transformed_df.writeStream(
               "vehicles_table",

            queryName='read_query',

            outputMode="append",

            format='parquet',

            checkpointLocation=d ) ) # Define the checkpoint location

    # Sleep for a while to let the stream process some data
    time.sleep(10)

    # Stop the stream query after processing
    query.stop()


TypeError: 'DataStreamWriter' object is not callable

In [None]:
# prompt: convert the column timestamp to timestamp

from pyspark.sql.functions import from_unixtime, to_timestamp


In [None]:
# prompt: group by vehicle in 2 minutes buckets

from pyspark.sql.functions import window, from_unixtime, col, to_timestamp

# Assuming 'vehicles_table' is your table with the vehicle data

# Read the data
vehicle_data = spark.read.parquet("vehicles_table")

# Convert the timestamp to a timestamp type and adjust timezone
vehicle_data = vehicle_data.withColumn("timestamp_readable", to_timestamp(from_unixtime(col("timestamp") / 1000)))

# Group by vehicle and 2-minute intervals
grouped_vehicle_data = vehicle_data.groupBy(
    "id", window("timestamp_readable", "2 minutes")
).count()


# Show the results
grouped_vehicle_data.show(truncate=False)

# Optional: Write the grouped data to a new location
# grouped_vehicle_data.write.mode("overwrite").parquet("grouped_vehicles_data")

In [None]:
  spark.read.table("vehicles_table").withColumn("timestamp", to_timestamp(from_unixtime("timestamp"))).show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|  183.0|20250116-64010165...| IN_TRANSIT_TO|44|12745| 38.76761|   4720|-9.100393|  4720_0_1|  4720_0|            SCHEDULED|113260234560|      0.0| 060011|2025-01-16 11:16:53|4720_0_1|2700|111...|
|   58.0|20250116-64010273...| IN_TRANSIT_TO|44|12557| 38.57138|   4641| -9.03899|  4641_0_2|  4641_0|            SCHEDULED|111500234560|      0.0| 150013|2025-01-16 11:16:43|4641_0_2|2700|102...|
|   12.0|202501

# Step 7: Write streaming data to BigQuery with auto-table creation

In [None]:
streaming_query = streaming_df.writeStream \
    .format("bigquery") \
    .option("table", output_table) \
    .option("checkpointLocation", "gs://edit-data-eng-project-group3/streaming_data/checkpoints") \
    .option("temporaryGcsBucket", "gs://edit-data-eng-project-group3/streaming_data/temp-bucket") \
    .option("writeDisposition", "WRITE_APPEND") \
    .option("createDisposition", "CREATE_IF_NEEDED") \
    .outputMode("append") \
    .start()

streaming_query.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
streaming_query.stop()