In [4]:
import os
from getpass import getpass

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, lpad, lit, max, min, expr, when, count, regexp_extract, date_format, row_number, struct
from pyspark.sql.window import Window

# Create (or reuse) the Spark session for this notebook, with explicit
# limits on total cores and per-executor memory.
spark = (
    SparkSession.builder
    .appName("Aroon_Up/Aroon_Down")
    .config("spark.cores.max", "2")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Load the stock-code table from Iceberg.
df_machungkhoan = spark.read.format("iceberg").load("stock_db.datn_machungkhoan")

# Load the price-history table from Iceberg and normalise it in one pass.
df_lichsugia = (
    spark.read.format("iceberg").load("stock_db.datn_lichsugia")
    # Parse 'ngay' (text in dd/MM/yyyy form) into a DateType column.
    .withColumn("ngay", to_date(col("ngay"), "dd/MM/yyyy"))
    # First run of digits / dots / minus signs found in 'thaydoi'.
    .withColumn(
        "thaydoi_value",
        regexp_extract(col("thaydoi"), r'([\d\.-]+)', 1).cast("float"),
    )
    # Number found between "(" and "%)" in 'thaydoi'.
    .withColumn(
        "thaydoi_percent",
        regexp_extract(col("thaydoi"), r'\(([\d\.-]+)%\)', 1).cast("float"),
    )
    # Integer key built from the date rendered as ddMMyyyy; the int cast
    # drops a leading zero of the day part.
    .withColumn("dateid", date_format(col("ngay"), "ddMMyyyy").cast("int"))
)

# Keep only stock codes whose category name is present and non-empty.
has_category = col('categoryname').isNotNull() & (col('categoryname') != "")
dim_stock_df = df_machungkhoan.filter(has_category)

# Collapse to one row per symbol.
df_stock_drop_duplicate = dim_stock_df.dropDuplicates(['symbol'])

# Projection with the stock-dimension column names. Nothing is written to
# a table here; this only builds the DataFrame.
dim_stock_data = (
    df_stock_drop_duplicate
    .select(col("symbol").alias("stocksymbol"), "companyname", "categoryid")
    .distinct()
)

# Bring the distinct symbols to the driver as a plain Python list.
symbol_rows = dim_stock_data.select("stocksymbol").distinct().collect()
valid_stocksymbols = [symbol_row.stocksymbol for symbol_row in symbol_rows]

# Project the price history onto the column names and types used by
# 'fact_price_history'. Each source column is cast first and then named,
# so the output column names are stated explicitly.
price_type = "decimal(18, 2)"
df_fact_price_history = df_lichsugia.select(
    col("symbol").alias("stocksymbol"),
    col("dateid"),
    col("giamocua").cast(price_type).alias("openprice"),
    col("giadongcua").cast(price_type).alias("closeprice"),
    col("giacaonhat").cast(price_type).alias("highprice"),
    col("giathapnhat").cast(price_type).alias("lowprice"),
    col("khoiluongkhoplenh").cast("bigint").alias("volume"),
)

# Keep only rows whose symbol appears in the list of valid stock symbols.
symbol_is_valid = col("stocksymbol").isin(valid_stocksymbols)

# 'dateid' is an integer, so left-pad it back to 8 characters before it is
# parsed as ddMMyyyy.
dateid_text = lpad(col('dateid').cast('string'), 8, '0')

df_fact_price_history_filtered = (
    df_fact_price_history
    .filter(symbol_is_valid)
    .withColumn('dateid_padded', dateid_text)
    .withColumn('date', to_date(col('dateid_padded'), 'ddMMyyyy'))
)


                                                                                

In [5]:
# Look-back length of the Aroon indicator, counted in rows (one row per
# symbol and date in the input).
AROON_PERIOD = 14

# Per-symbol chronological ordering. The DATE column is used directly:
# Spark defines no numeric cast for DateType (the cast yields NULL, or is
# rejected under ANSI mode), so ordering by `date` cast to long would leave
# the row order inside each symbol undefined.
symbol_timeline = Window.partitionBy("stocksymbol").orderBy(col("date"))

# Trailing frame: the current row plus the previous AROON_PERIOD - 1 rows.
# NOTE(review): some Aroon definitions look back AROON_PERIOD + 1 rows so the
# value can reach 0; the frame size used so far is kept here - confirm.
windowSpec = symbol_timeline.rowsBetween(-(AROON_PERIOD - 1), 0)

# (price, position) pairs. Structs compare field by field, so the max / min
# of these pairs over the frame is the row holding the extreme price, and on
# equal prices the most recent row wins (largest bar_index, i.e. smallest
# negated bar_index). Rows with a NULL price produce a NULL pair, which the
# aggregate ignores.
high_marker = when(
    col("highprice").isNotNull(),
    struct(col("highprice"), col("bar_index")),
)
low_marker = when(
    col("lowprice").isNotNull(),
    struct(col("lowprice"), (-col("bar_index")).alias("neg_bar_index")),
)

df_with_aroon = (
    df_fact_price_history_filtered
    # 1-based position of the row within its symbol's timeline.
    .withColumn("bar_index", row_number().over(symbol_timeline))
    # Rolling extremes over the trailing frame.
    .withColumn("rolling_max", max("highprice").over(windowSpec))
    .withColumn("rolling_min", min("lowprice").over(windowSpec))
    # Rows elapsed since the most recent highest high / lowest low in the
    # frame (0 when the current row itself is the extreme).
    .withColumn(
        "days_since_max",
        col("bar_index") - max(high_marker).over(windowSpec).getField("bar_index"),
    )
    .withColumn(
        "days_since_min",
        col("bar_index") + min(low_marker).over(windowSpec).getField("neg_bar_index"),
    )
    # Aroon = 100 * (period - rows since extreme) / period.
    # The first rows of each symbol use a shorter frame (fewer than
    # AROON_PERIOD rows available); they are still computed, not nulled.
    .withColumn("Aroon_Up", expr(f"100 * ({AROON_PERIOD} - days_since_max) / {AROON_PERIOD}"))
    .withColumn("Aroon_Down", expr(f"100 * ({AROON_PERIOD} - days_since_min) / {AROON_PERIOD}"))
    # Helper column only; keep the output schema free of it.
    .drop("bar_index")
)

def _indicator_rows(value_column, indicator_id):
    """Project df_with_aroon onto (stocksymbol, dateid, indicatorvalue, indicatorid).

    value_column: name of the df_with_aroon column holding the indicator value.
    indicator_id: integer literal stored in the `indicatorid` column.
    """
    return df_with_aroon.select(
        col("stocksymbol"),
        col("dateid"),
        col(value_column).alias("indicatorvalue"),
        lit(indicator_id).alias("indicatorid"),
    )


# Stack the two indicators into one long-format DataFrame. `union` matches
# columns by position, which is why both sides share the helper's layout.
# Assumes 3 is the indicator ID for Aroon Up and 4 the ID for Aroon Down.
aroon_data = _indicator_rows("Aroon_Up", 3).union(_indicator_rows("Aroon_Down", 4))



In [6]:
# Write result to `fact_stock_indicator` over JDBC.
#
# Connection settings come from the environment so that no secret is stored
# in the notebook. URL and user fall back to the values used so far; the
# password has no fallback and is prompted for when the variable is unset.
jdbc_url = os.environ.get("DTM_STOCK_JDBC_URL", "jdbc:mysql://10.168.6.106:3306/dtm_stock")
jdbc_user = os.environ.get("DTM_STOCK_DB_USER", "acc_etl")
jdbc_password = os.environ.get("DTM_STOCK_DB_PASSWORD") or getpass(
    "Password for the dtm_stock database: "
)

# NOTE: mode("append") adds rows on every run, so re-running this cell
# inserts the same indicator rows again.
(
    aroon_data.write
    .format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("url", jdbc_url)
    .option("dbtable", "fact_stock_indicator")
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .mode("append")
    .save()
)


                                                                                