In [0]:
from pyspark.sql import functions as F

# Build the gold-layer ratings fact from the silver table: enforce numeric
# types on the keys and the measure, then derive a real timestamp and a
# calendar date from the raw epoch value.
silver_ratings = spark.table("silver_fact_ratings")

df_fact = (
    silver_ratings
    .withColumn("userId", F.col("userId").cast("int"))
    .withColumn("movieId", F.col("movieId").cast("int"))
    .withColumn("rating", F.col("rating").cast("float"))
    # `timestamp` carries Unix epoch seconds. Convert the number directly
    # instead of going through from_unixtime(): that function renders a
    # local-time *string* in the session time zone, and parsing it back is
    # ambiguous for instants inside a DST fall-back hour.
    # timestamp_seconds() requires Spark 3.1+.
    .withColumn(
        "timestamp_ts",
        F.timestamp_seconds(F.col("timestamp").cast("long")),
    )
    # Calendar date of the rating, evaluated in the session time zone.
    .withColumn("rating_date", F.to_date("timestamp_ts"))
)


In [0]:
# Persist the fact DataFrame as the Delta table `gold_fact_ratings`,
# replacing whatever the table held before and laying files out by
# `rating_date`.
# NOTE(review): one partition per calendar day can mean many tiny files for
# a table of this size -- confirm the partitioning is intentional.
(
    df_fact.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("rating_date")
    .saveAsTable("gold_fact_ratings")
)


In [0]:
# Read the table back from the catalog and print its schema to confirm the
# casts and derived columns were persisted as expected.
(
    spark.table("gold_fact_ratings")
    .printSchema()
)


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timestamp_ts: timestamp (nullable = true)
 |-- rating_date: date (nullable = true)



In [0]:
# Spot-check the epoch -> date derivation: show the raw epoch value next to
# the derived calendar date for a handful of rows, without truncating cells.
(
    spark.table("gold_fact_ratings")
    .select("timestamp", "rating_date")
    .show(10, truncate=False)
)


+---------+-----------+
|timestamp|rating_date|
+---------+-----------+
|943015211|1999-11-19 |
|943015062|1999-11-19 |
|943013896|1999-11-19 |
|943015701|1999-11-19 |
|943013896|1999-11-19 |
|943015256|1999-11-19 |
|943015099|1999-11-19 |
|943013896|1999-11-19 |
|943013896|1999-11-19 |
|943015701|1999-11-19 |
+---------+-----------+
only showing top 10 rows


In [0]:
# Total number of rating rows stored in the gold table.
(
    spark.table("gold_fact_ratings")
    .count()
)


100836

In [0]:
from pyspark.sql import functions as F

# Cardinality check on the two key columns of the gold fact table.
df = spark.table("gold_fact_ratings")

# A global aggregation (no grouping keys) yields a single row holding the
# number of distinct users and the number of distinct movies.
df.agg(
    F.countDistinct("userId"),
    F.countDistinct("movieId"),
).show()


+----------------------+-----------------------+
|count(DISTINCT userId)|count(DISTINCT movieId)|
+----------------------+-----------------------+
|                   610|                   9724|
+----------------------+-----------------------+



In [0]:
# List every table in the current catalog database whose name carries the
# "gold_" prefix, printing its name, table type and temporary flag.
tables = spark.catalog.listTables()
for t in filter(lambda entry: entry.name.startswith("gold_"), tables):
    print(f"{t.name}  |  {t.tableType}  |  {t.isTemporary}")


gold_fact_ratings  |  MANAGED  |  False
