In [1]:
import os

# Project root that the relative paths used later in this notebook (e.g. "config/...")
# are resolved against. Set the CRYPTO_BIG_DATA_ROOT environment variable to run the
# notebook from another checkout; the default is the original hard-coded location.
PROJECT_ROOT = os.environ.get(
    "CRYPTO_BIG_DATA_ROOT",
    "/Users/ibulmnie/Documents/20241/BigData/crypto-big-data/",
)
os.chdir(PROJECT_ROOT)

In [2]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql.functions import col, from_json, to_date, avg, lit
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import time
import shutil
from datetime import datetime, timedelta


import os

# Absolute path to the GCS connector jar (resolved against the current working directory).
gcs_jar_path = os.path.abspath("config/gcs-connector-hadoop3-latest.jar")

# The Kafka source artifact has to match the Spark version that is actually running,
# so derive it from the imported pyspark package instead of pinning a fixed release.
# NOTE(review): the Scala suffix (_2.12) is still fixed — confirm it matches your Spark build.
kafka_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"

# Create (or reuse) the SparkSession with the GCS file system and the Kafka source enabled.
spark = (
    SparkSession.builder
    .appName("hehee")
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .config("spark.hadoop.fs.gs.auth.service.account.enable", "true")
    .config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", "config/key/btcanalysishust-495a3a227f22.json")
    .config("spark.jars", gcs_jar_path)
    .config("spark.jars.packages", kafka_package)
    .config("spark.hadoop.fs.gs.project.id", "btcanalysishust")
    .getOrCreate()
)


:: loading settings :: url = jar:file:/Users/ibulmnie/Documents/20241/BigData/crypto-big-data/spark-env/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/ibulmnie/.ivy2/cache
The jars for the packages stored in: /Users/ibulmnie/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-78050fd1-d1e4-4da8-80e8-528192a9e2fe;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.0 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.2 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:

In [30]:
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType


In [35]:
# Closing-price column names for the coins tracked in the streaming data,
# generated as "<TICKER>_CLOSE" from the ticker symbols.
# NOTE(review): the Kafka parsing cell also emits a "BNB_CLOSE" column that is not
# listed here — confirm whether that omission is intentional.
crypto_columns = [
    f"{ticker}_CLOSE"
    for ticker in ("BTC", "ETH", "USDT", "USDC", "XRP", "ADA", "MATIC", "DOGE", "SOL")
]


In [49]:
# Schema of the JSON messages read from Kafka: a timestamp string plus a "prices"
# struct holding one float per coin id.
schema = StructType([
    StructField("timestamp", StringType()),
    StructField("prices", StructType([
        StructField("bitcoin", FloatType()),
        StructField("ethereum", FloatType()),
        StructField("tether", FloatType()),
        StructField("binancecoin", FloatType()),
        StructField("usd-coin", FloatType()),
        StructField("ripple", FloatType()),
        StructField("cardano", FloatType()),
        StructField("dogecoin", FloatType()),
        StructField("matic-network", FloatType()),
        StructField("solana", FloatType()),
    ]))
])

# Read the price stream from Kafka, starting at the newest offsets.
# NOTE(review): the topic is spelled "crypto-pricess" — confirm it matches the producer.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "35.206.252.44:9092")
    .option("subscribe", "crypto-pricess")
    .option("startingOffsets", "latest")
    .load()
)

# Decode the Kafka message value as a string and parse it as JSON with the schema above.
parsed_df = kafka_df.selectExpr("CAST(value AS STRING)") \
    .select(F.from_json(F.col("value"), schema).alias("data"))

# Flatten the parsed struct: DATE (timestamp) plus one <TICKER>_CLOSE column per coin.
crypto_parsed_df = parsed_df.select(
    F.to_timestamp(F.col("data.timestamp"), "yyyy-MM-dd'T'HH:mm:ss.SSSSSS").alias("DATE"),
    F.col("data.prices.bitcoin").alias("BTC_CLOSE"),
    F.col("data.prices.ethereum").alias("ETH_CLOSE"),
    F.col("data.prices.tether").alias("USDT_CLOSE"),
    F.col("data.prices.binancecoin").alias("BNB_CLOSE"),
    F.col("data.prices.usd-coin").alias("USDC_CLOSE"),
    F.col("data.prices.ripple").alias("XRP_CLOSE"),
    F.col("data.prices.cardano").alias("ADA_CLOSE"),
    F.col("data.prices.matic-network").alias("MATIC_CLOSE"),
    F.col("data.prices.dogecoin").alias("DOGE_CLOSE"),
    F.col("data.prices.solana").alias("SOL_CLOSE")
)

# Keep only rows whose DATE is at most 10 minutes (600 seconds) behind the current time.
crypto_parsed_df = crypto_parsed_df.filter(
    F.unix_timestamp(F.current_timestamp()) - F.unix_timestamp(F.col("DATE")) <= 600
)

# Row-based windows for the simple moving averages: the current row plus the N-1 rows
# before it, ordered by DATE. No partitioning is defined, so Spark evaluates them on a
# single partition (hence the WindowExec warnings in the log).
window_spec_5 = Window.orderBy("DATE").rowsBetween(-4, 0)
window_spec_10 = Window.orderBy("DATE").rowsBetween(-9, 0)
window_spec_20 = Window.orderBy("DATE").rowsBetween(-19, 0)
window_spec_50 = Window.orderBy("DATE").rowsBetween(-49, 0)
window_spec_100 = Window.orderBy("DATE").rowsBetween(-99, 0)
window_spec_200 = Window.orderBy("DATE").rowsBetween(-199, 0)

# SMA period -> window spec, and ticker -> source column, used to generate the SMA columns.
sma_windows = {
    5: window_spec_5,
    10: window_spec_10,
    20: window_spec_20,
    50: window_spec_50,
    100: window_spec_100,
    200: window_spec_200,
}
sma_sources = {"BTC": "BTC_CLOSE", "ETH": "ETH_CLOSE"}


def add_sma_columns(df):
    """Return `df` with SMA<period>_<ticker> columns appended.

    For every ticker in `sma_sources` and every period in `sma_windows`, adds the
    average of the ticker's close column over the matching row-based window.
    Columns are added in the order SMA5_BTC ... SMA200_BTC, SMA5_ETH ... SMA200_ETH.
    """
    for ticker, close_col in sma_sources.items():
        for period, window_spec in sma_windows.items():
            df = df.withColumn(f"SMA{period}_{ticker}", F.avg(close_col).over(window_spec))
    return df


def show_batch_with_sma(batch_df, batch_id):
    """foreachBatch handler: compute the SMA columns for one micro-batch and print it.

    The window functions run here, on the static per-batch DataFrame, because
    non-time-based windows are not supported directly on a streaming DataFrame.
    The averages therefore only cover the rows of the current micro-batch.
    TODO: keep a rolling history across batches if longer look-backs are required.
    """
    add_sma_columns(batch_df).show()


# Kept so that code referring to `crypto_sma_df` still resolves. It is only a lazy plan;
# the streaming query below does not read from it.
# NOTE(review): starting a stream directly from this frame is expected to be rejected by
# Spark (non-time-based window on a streaming DataFrame) — verify before reusing it.
crypto_sma_df = add_sma_columns(crypto_parsed_df)

# Start the streaming query; each micro-batch is enriched with SMAs and shown on the console.
query = (
    crypto_parsed_df.writeStream
    .outputMode("append")
    .foreachBatch(show_batch_with_sma)
    .start()
)

# Block until the query ends. Always stop the query on the way out so that interrupting
# the cell does not leave the stream running in the background.
try:
    query.awaitTermination()
finally:
    query.stop()

24/12/12 00:15:26 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/k9/xss4jl9s24sg7_7zhcmrp2g00000gn/T/temporary-104a5ba6-2d7d-451a-8c09-b1f3da067309. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/12 00:15:26 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/12 00:15:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/12 00:15:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/12 00:15:26 WARN WindowExec: No Partition Defined for Window operation! Moving all 

KeyboardInterrupt: 

24/12/12 00:20:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/12 00:20:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/12 00:20:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/12 00:20:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/12 00:20:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/12 00:20:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/12 0