In [1]:
import os
from datetime import datetime, timedelta

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import to_timestamp, to_date, split, when, col, sum, mean, round, count, abs, regexp_extract, date_format
from pyspark.sql.window import Window

In [2]:
# Get (or reuse) the Spark session for this job.
# NOTE: when a session already exists, getOrCreate() returns it and Spark warns
# that only runtime SQL configurations take effect, so the core/memory settings
# below may be ignored in that case.
spark = (
    SparkSession.builder
    .appName("lichsugia")
    .config("spark.cores.max", "2")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

24/05/26 16:20:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [8]:
# Read the source data from the Iceberg table stock_db.datn_lichsugia
df_lichsugia = (
    spark.read
    .format("iceberg")
    .load("stock_db.datn_lichsugia")
)

In [9]:
# Parse the 'ngay' column into DateType using the dd/MM/yyyy pattern.
# NOTE(review): df_lichsugia is rebound to its own transformation, so running
# this cell a second time applies to_date to the already-converted column --
# re-run the load cell first if this one has to be repeated.
df_lichsugia = df_lichsugia.withColumn(
    "ngay",
    to_date("ngay", "dd/MM/yyyy"),
)

In [24]:
# Project the source columns onto the schema of 'fact_price_history'.
# Each cast is applied first and the alias last, so the output column name
# never depends on Spark carrying an inner alias through cast().
#   symbol            -> stocksymbol
#   ngay              -> trace_date
#   giamocua          -> open_price   decimal(18, 2)
#   giadongcua        -> close_price  decimal(18, 2)
#   giacaonhat        -> high_price   decimal(18, 2)
#   giathapnhat       -> low_price    decimal(18, 2)
#   khoiluongkhoplenh -> volume       bigint
df_fact_price_history = df_lichsugia.select(
    col("symbol").alias("stocksymbol"),
    col("ngay").alias("trace_date"),
    col("giamocua").cast("decimal(18, 2)").alias("open_price"),
    col("giadongcua").cast("decimal(18, 2)").alias("close_price"),
    col("giacaonhat").cast("decimal(18, 2)").alias("high_price"),
    col("giathapnhat").cast("decimal(18, 2)").alias("low_price"),
    col("khoiluongkhoplenh").cast("bigint").alias("volume"),
)

In [37]:
# Read the source data from the Iceberg table stock_db.datn_machungkhoan
df_machungkhoan = (
    spark.read
    .format("iceberg")
    .load("stock_db.datn_machungkhoan")
)

In [38]:
# Keep only rows that carry a category: categoryname must be non-null and non-empty.
dim_stock_df = df_machungkhoan.where(
    col("categoryname").isNotNull() & (col("categoryname") != "")
)
# Reduce to a single row per symbol.
df_stock_drop_duplicate = dim_stock_df.dropDuplicates(["symbol"])

In [39]:
# Build the stock dimension: symbol (renamed to stocksymbol), company name and category.
dim_stock_data = (
    df_stock_drop_duplicate
    .select(
        df_stock_drop_duplicate["symbol"].alias("stocksymbol"),
        df_stock_drop_duplicate["companyname"],
        df_stock_drop_duplicate["categoryname"],
    )
    .distinct()
)

In [40]:
# Collect the distinct stocksymbol values of dim_stock_data into a Python list on the driver
valid_stocksymbols = [
    record["stocksymbol"]
    for record in dim_stock_data.select("stocksymbol").distinct().collect()
]


                                                                                

In [41]:
# Keep only the price rows whose stocksymbol exists in dim_stock_data.
# A left-semi join does the membership test inside Spark instead of shipping a
# driver-side list into an isin() literal, and it returns only the columns of
# df_fact_price_history. Rows with a NULL stocksymbol never match and are
# dropped, as they were by the isin() filter.
# The dimension holds one row per symbol, so it is broadcast to avoid shuffling
# the fact table.
df_fact_price_history_filtered = df_fact_price_history.join(
    F.broadcast(dim_stock_data.select("stocksymbol")),
    on="stocksymbol",
    how="left_semi",
)

In [42]:
# Append the transformed and filtered rows to the MySQL table 'fact_price_history' over JDBC.
#
# Connection settings are read from the environment so that no credentials are
# stored in the notebook:
#   DTM_STOCK_JDBC_URL       optional, defaults to the dtm_stock_v2 database URL below
#   DTM_STOCK_JDBC_USER      optional, defaults to "acc_etl"
#   DTM_STOCK_JDBC_PASSWORD  required; a missing variable raises KeyError before any write
#
# NOTE: mode("append") adds rows on every execution, so re-running this cell
# writes the same rows again unless the target table enforces uniqueness.
(
    df_fact_price_history_filtered.write
    .format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option(
        "url",
        os.environ.get("DTM_STOCK_JDBC_URL", "jdbc:mysql://10.168.6.106:3306/dtm_stock_v2"),
    )
    .option("dbtable", "fact_price_history")
    .option("user", os.environ.get("DTM_STOCK_JDBC_USER", "acc_etl"))
    .option("password", os.environ["DTM_STOCK_JDBC_PASSWORD"])
    .mode("append")
    .save()
)

                                                                                