<a href="https://colab.research.google.com/github/jorgeneves16/dataeng-dataprocessing/blob/main/spark_streaming/examples/coinbase_consumer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Consumer Coinbase

- To be executed in Google Colab
- Connect to GCLOUD
- Read data from GCS as streaming
- Analyze data

In [1]:
from google.colab import auth
auth.authenticate_user()

project_id = 'data-eng-dev-437916'
!gcloud config set project {project_id}

Updated property [core/project].


In [2]:
!apt-get install openjdk-11-jdk -y
!pip install pyspark gcsfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-11-jdk is already the newest version (11.0.27+6~us1-0ubuntu1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [147]:
from pyspark.sql import SparkSession

GCS_JAR = "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.9/gcs-connector-hadoop3-2.2.9-shaded.jar"

spark = SparkSession.builder \
    .appName("GCSStreamingDemo") \
    .config("spark.jars", GCS_JAR) \
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .getOrCreate()

In [320]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

schema = "type STRING, sequence LONG, product_id STRING, price STRING, time STRING"

df = spark.readStream.schema(schema).json("gs://edit-data-eng-dev/datalake/landing/btc/")

# write to memory
# stream = df.select("time", "product_id", col("price").cast("double")) \
#   .writeStream \
#   .outputMode("append") \
#   .queryName("btc_price_stream10") \
#   .format("memory") \
#   .start()

# write parquet
stream = df.select("time", "product_id", col("price").cast("double")) \
  .writeStream \
  .outputMode("append") \
  .format("parquet") \
  .option("path", "gs://edit-data-eng-dev/datalake/bronze/btc/") \
  .option("checkpointLocation", "gs://edit-data-eng-dev/datalake/bronze/btc/checkpoint") \
  .start()

In [323]:
stream.stop()

In [149]:
!gsutil ls gs://edit-data-eng-dev/datalake/landing/btc/ | wc

    598     598   44252


In [304]:
stream.isActive

True

In [302]:
from pyspark.sql.functions import *
df = spark.sql("select * from btc_price_stream10")
df.show()

+--------------------+----------+---------+
|                time|product_id|    price|
+--------------------+----------+---------+
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108156.85|
|2025-07-05T10:21:...|   BTC-USD|108156.85|
|2025-07-05T10:21:...|   BTC-USD|108156.85|
|2025-07-05T10:21:...|   BTC-USD|108156.85|
|2025-07-05T10:21:...|   BTC-USD|108156.85|
|2025-07-05T10:22:...|   BTC-USD|108156.85|
|2025-07-05T10:22:...|   BTC-USD|108171.17|
|2025-07-05T10:22:...|   BTC-USD|108171.17|
|2025-07-05T10:21:...|   BTC-USD

In [None]:
# Analysis

# Latest Bitcoin price
# Calculate average BTC price per minute
# Calculate standard deviation of price over time
# How many price tickets per minute?
# Find anomalies (price == nulls or with strange values)

In [172]:
from pyspark.sql.functions import *

In [309]:
import pyspark.sql.functions as f
from pyspark.sql.window import Window

# Latest Bitcoin price
df = spark.sql("select * from btc_price_stream10 where product_id = 'BTC-USD' order by time desc limit 1");
df.show()

import pyspark.sql.functions as f
from pyspark.sql.window import Window
windowSpec = Window.partitionBy("product_id").orderBy("time")
df2 = df.filter("product_id=='BTC-USD'").withColumn("last_price", f.first("price").over(windowSpec))
df2.filter(f.col("last_price") == f.col("price"))
df2.show()

+--------------------+----------+---------+
|                time|product_id|    price|
+--------------------+----------+---------+
|2025-07-05T10:23:...|   BTC-USD|108172.43|
+--------------------+----------+---------+

+--------------------+----------+---------+----------+
|                time|product_id|    price|last_price|
+--------------------+----------+---------+----------+
|2025-07-05T10:23:...|   BTC-USD|108172.33| 108172.33|
+--------------------+----------+---------+----------+



In [306]:
# Calculate average BTC price per minute
df.filter("product_id=='BTC-USD'").groupBy(window("time", "1 minute")).agg(avg("price")).show()

+--------------------+----------+
|              window|avg(price)|
+--------------------+----------+
|{2025-07-05 10:23...| 108172.44|
+--------------------+----------+



In [308]:
#Calculate standard deviation of price over time

df.filter("product_id=='BTC-USD'").groupBy(window("time", "1 minute")).agg(stddev("price")).show(10, False)

+------------------------------------------+-------------+
|window                                    |stddev(price)|
+------------------------------------------+-------------+
|{2025-07-05 10:23:00, 2025-07-05 10:24:00}|NULL         |
+------------------------------------------+-------------+



In [None]:
#How many price tickets per minute?

df.groupBy(window("time", "1 minute"), "product_id").agg(count(lit("1")).alias("count")).orderBy("window", "product_id").show(10, False)

In [311]:
#Find anomalies (price == nulls or with strange values)
df.filter("product_id=='BTC-USD'").filter((f.col("price").isNull()) | (col("price") > 1000000)).show()

+----+----------+-----+
|time|product_id|price|
+----+----------+-----+
+----+----------+-----+



In [134]:
# count and average per product_id
df.groupBy("product_id").agg(count(lit("1")).alias("count"), avg("price").alias("avg_price")).show()

+----------+-----+---------+
|product_id|count|avg_price|
+----------+-----+---------+
|   BTC-USD|    1|108168.51|
+----------+-----+---------+

