<a href="https://colab.research.google.com/github/vaniamv/final-project-edit/blob/main/Streaming_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Application in Real Time to Read Carris API - group 1


1. Authentication to gcloud

In [1]:
# Authenticate to gcloud with an interactive login (Application Default Credentials)

!gcloud auth application-default login

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=bkX1TEEMxAk1ufk0TNMXLpxTBzMkvh&prompt=consent&token_usage=remote&access_type=offline&code_challenge=Th6LJRA5MLhPhEsNsIjFgdVCpHXQVdLv9MzdbZcQTCU&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: <redacted>

Credentials saved to file: [/content/.config/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Ca

In [2]:
# download connector and save it local

!wget https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.7/gcs-connector-hadoop3-2.2.7-shaded.jar -P /usr/local/lib/

--2025-01-22 13:58:52--  https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.7/gcs-connector-hadoop3-2.2.7-shaded.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33831577 (32M) [application/java-archive]
Saving to: ‘/usr/local/lib/gcs-connector-hadoop3-2.2.7-shaded.jar’


2025-01-22 13:58:52 (172 MB/s) - ‘/usr/local/lib/gcs-connector-hadoop3-2.2.7-shaded.jar’ saved [33831577/33831577]



In [3]:
# import libraries

import os
from pyspark.sql import SparkSession

# Spark session with the GCS connector jar on the classpath
gcs_connector_jar = '/usr/local/lib/gcs-connector-hadoop3-2.2.7-shaded.jar'
gcs_filesystem_class = 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem'

spark = (
    SparkSession.builder
    .appName('GCS_Spark')
    .config('spark.jars', gcs_connector_jar)
    .config('spark.hadoop.fs.gs.impl', gcs_filesystem_class)
    .getOrCreate()
)

# Point GOOGLE_APPLICATION_CREDENTIALS at the credentials file
adc_keyfile = '/content/.config/application_default_credentials.json'
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = adc_keyfile

# Hadoop settings PySpark needs to access GCS, applied one by one in a fixed order
hadoop_conf = spark._jsc.hadoopConfiguration()
gcs_hadoop_settings = [
    ("fs.gs.impl", gcs_filesystem_class),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.auth.service.account.enable", "true"),
    ("google.cloud.auth.service.account.json.keyfile", adc_keyfile),
]
for setting_name, setting_value in gcs_hadoop_settings:
    hadoop_conf.set(setting_name, setting_value)

2. Read Stream

In [77]:
from pyspark.sql.types import *

# Schema of the vehicle records as (field name, Spark type) pairs; every field is nullable
vehicle_fields = [
    ('bearing', IntegerType()),
    ('block_id', StringType()),
    ('current_status', StringType()),
    ('id', StringType()),
    ('lat', FloatType()),
    ('line_id', StringType()),
    ('lon', FloatType()),
    ('pattern_id', StringType()),
    ('route_id', StringType()),
    ('schedule_relationship', StringType()),
    ('shift_id', StringType()),
    ('speed', FloatType()),
    ('stop_id', StringType()),
    ('timestamp', TimestampType()),
    ('trip_id', StringType()),
]
vehicle_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in vehicle_fields]
)


# Streaming JSON reader over the vehicles folder, parsed with the schema above
stream = (
    spark.readStream
    .format("json")
    .schema(vehicle_schema)
    .load("gs://edit-de-project-streaming-data/carris-vehicles")
)

3. Read from API endpoint stops

In [78]:
# Stops reference table: keep only the stop id and its coordinates
stops_path = 'gs://edit-data-eng-project-group1/LandingZone/GTFS/stops.txt'
df_stops = (
    spark.read
    .option("header", "true")
    .csv(stops_path)
    .select('stop_id', 'stop_lat', 'stop_lon')
)

# Cast both coordinate columns to float
for coordinate in ('stop_lat', 'stop_lon'):
    df_stops = df_stops.withColumn(coordinate, df_stops[coordinate].cast("float"))

In [79]:
# Keep only the vehicle fields used downstream, then attach the stop
# coordinates with a left join on stop_id
vehicle_columns = ['id', 'speed', 'timestamp', 'line_id', 'route_id', 'stop_id', 'lat', 'lon']
transform = (
    stream
    .select(*vehicle_columns)
    .join(df_stops, on='stop_id', how='left')
)

In [80]:

from pyspark.sql.functions import col, lag,coalesce, current_timestamp, window
from pyspark.sql.window import Window
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.sql import functions as F

# A watermark is necessary because of the streaming aggregation below
transformed = transform.withWatermark("timestamp", "60 seconds")

# One row per vehicle, stop and 2-minute event-time window.
# F.first / F.last return whichever row happens to be processed first / last,
# which is not guaranteed to follow event time. The earliest and latest
# observations are therefore picked explicitly by their timestamp
# (F.min_by / F.max_by require PySpark >= 3.3).
windowed_transform = transformed.groupBy("id", "stop_id", F.window("timestamp", "2 minutes")).agg(
    F.min_by(col("lat"), col("timestamp")).alias("previous_lat"),
    F.min_by(col("lon"), col("timestamp")).alias("previous_lon"),
    F.max_by(col("lat"), col("timestamp")).alias("lat"),
    F.max_by(col("lon"), col("timestamp")).alias("lon"),
    F.max_by(col("stop_lat"), col("timestamp")).alias("stop_lat"),
    F.max_by(col("stop_lon"), col("timestamp")).alias("stop_lon")
    )

In [81]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        The haversine distance in kilometers, or 0.0 when any coordinate is None.
    """
    # Imported here so the function does not depend on a notebook-level
    # `import math` having been run before it is called.
    import math

    if any(x is None for x in [lat1, lon1, lat2, lon2]):
        return 0.0
    R = 6371  # Earth's radius in kilometers

    # Convert latitude and longitude to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Calculate differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Apply Haversine formula
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    # Rounding can push `a` marginally outside [0, 1], which would make
    # sqrt/asin raise a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))

    # Calculate distance
    distance = R * c

    return distance

# Wrap the haversine function as a Spark UDF that returns a float
distance_udf = udf(haversine_distance, FloatType())

# distance: from the window's previous position to its current position
# distance_to_stop: from the window's current position to the stop
windowed_transform = (
    windowed_transform
    .withColumn(
        "distance",
        distance_udf(col("previous_lat"), col("previous_lon"), col("lat"), col("lon")),
    )
    .withColumn(
        "distance_to_stop",
        distance_udf(col("lat"), col("lon"), col("stop_lat"), col("stop_lon")),
    )
)


In [82]:
# Speed in km/h: distance (km) divided by the 2-minute window expressed in hours
window_hours = 2/60
agg = windowed_transform.withColumn('speed', col('distance') / window_hours)

# Keep only rows with a positive distance to the stop and a positive speed,
# then compute the time to the stop in seconds
has_distance_to_stop = col('distance_to_stop').isNotNull() & (col('distance_to_stop') > 0)
has_speed = col('speed').isNotNull() & (col('speed') > 0)
seconds_to_stop = col('distance_to_stop') / col('speed') * 3600
agg = (
    agg
    .filter(has_distance_to_stop & has_speed)
    .withColumn('time_to_stop', seconds_to_stop)
)

# Render the seconds as HH:mm:ss by adding them to a 00:00:00 timestamp
midnight = F.unix_timestamp(F.lit('00:00:00'), 'HH:mm:ss')
agg = agg.withColumn(
    'time_to_stop',
    F.from_unixtime(midnight + col('time_to_stop'), 'HH:mm:ss'),
)

5. Write Stream

In [83]:
# select folder
# `folder` is the relative path used below for the parquet output and its checkpoint
folder = 'stream/vehicles2'
# GCS datalake path; not referenced by any other visible cell
gc_folder = 'gs://edit-de-project-streaming-data/datalake/stream/vehicles'


# foreachBatch sink: called once per micro-batch of the windowed stream
def insert_windowed_vehicles(df, batch_id):
    """Print the batch id and append the micro-batch as parquet under `folder`."""
    print(f"Batch ID: {batch_id}")
    batch_writer = df.write.mode("append")
    batch_writer.parquet(folder)


# Start the streaming query: append output mode, each micro-batch handed to
# insert_windowed_vehicles, triggered every 10 seconds
# NOTE(review): the checkpoint lives inside the parquet output folder; confirm
# that batch reads of that folder are not affected by the checkpoint files.
checkpoint_dir = f'{folder}/checkpoint'
stream_writer = agg.writeStream.outputMode("append")
stream_writer = stream_writer.foreachBatch(insert_windowed_vehicles)
stream_writer = stream_writer.option('checkpointLocation', checkpoint_dir)
stream_writer = stream_writer.trigger(processingTime='10 seconds')
windowed_query = stream_writer.start()

# Wait up to 30 seconds for the query to terminate before moving on
windowed_query.awaitTermination(30)

False

In [84]:
# Check whether the streaming query is still active
windowed_query.isActive

True

In [85]:
# Show the current status of the streaming query
windowed_query.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [86]:
# Show the most recent progress updates reported by the streaming query
windowed_query.recentProgress

[]

In [76]:
# Uncomment to stop the streaming query
#windowed_query.stop()

6. Check the written output

In [59]:
# Show the most recent progress updates reported by the streaming query
windowed_query.recentProgress

[]

In [69]:
# Folder holding the Parquet files to inspect
parquet_path = "stream/vehicles2"

# Load the Parquet files into a batch DataFrame
parquet_df = spark.read.format("parquet").load(parquet_path)

# Preview the rows without truncating cell values
parquet_df.show(truncate=False)

# Display the column names and types
parquet_df.printSchema()

+---+-------+------+------------+------------+---+---+--------+--------+--------+----------------+-----+------------+
|id |stop_id|window|previous_lat|previous_lon|lat|lon|stop_lat|stop_lon|distance|distance_to_stop|speed|time_to_stop|
+---+-------+------+------------+------------+---+---+--------+--------+--------+----------------+-----+------------+
+---+-------+------+------------+------------+---+---+--------+--------+--------+----------------+-----+------------+

root
 |-- id: string (nullable = true)
 |-- stop_id: string (nullable = true)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- previous_lat: float (nullable = true)
 |-- previous_lon: float (nullable = true)
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- stop_lat: float (nullable = true)
 |-- stop_lon: float (nullable = true)
 |-- distance: float (nullable = true)
 |-- distance_to_stop: float (nullable = true)
 |-- 