In [0]:
from pyspark.sql.functions import current_timestamp, lit, concat, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Explicit schema for the movie-cast JSON input, so Spark does not have to
# infer column types. Every field is declared nullable (third argument True).
movies_cast_schema = StructType(fields=[
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("movieId", IntegerType()),
        ("personId", IntegerType()),
        ("characterName", StringType()),
        ("genderId", IntegerType()),
        ("castOrder", IntegerType()),
    )
])

In [0]:
# Load the bronze-layer movie cast file using the predefined schema.
# multiLine mode lets a JSON record span several lines instead of requiring
# one record per line.
movie_cast_df = (
    spark.read
    .schema(movies_cast_schema)
    .option("multiLine", "True")
    .json('/mnt/storymovie/bronze/movie_cast.json')
)

In [0]:
# Rename the camelCase source columns to snake_case, then append two audit
# columns: the ingestion timestamp and a constant "env" marker.
movie_cast_with_columns_df = (
    movie_cast_df
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("personId", "person_id")
    .withColumnRenamed("characterName", "character_name")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("env", lit("python"))
)

In [0]:
# Drop the source columns that are not carried into the silver dataset.
# Column names are passed as strings: PySpark releases before 3.4 raise
# TypeError when DataFrame.drop() receives more than one Column object,
# whereas multiple string names are accepted by every version.
movies_cast_final_df = movie_cast_with_columns_df.drop("genderId", "castOrder")

In [0]:
# Render the final DataFrame in the notebook output for a visual check
# before it is written out.
# NOTE(review): `display` is not defined or imported in this file —
# presumably the notebook runtime's built-in helper; confirm.
display(movies_cast_final_df)

In [0]:
# Persist the final DataFrame as Parquet in the silver layer; "overwrite"
# mode replaces whatever already exists at the target path.
(
    movies_cast_final_df.write
    .mode("overwrite")
    .parquet('/mnt/storymovie/silver/movies_casts')
)