In [0]:
# Expose the 2024-12-23 bronze extract of movie.csv as temp view "v_movie_2" for ad-hoc SQL.
(
    spark.read
    .option("header", True)
    .csv("/mnt/mymoviehistory/bronze/2024-12-23/movie.csv")
    .createOrReplaceTempView("v_movie_2")
)

In [0]:
%sql
-- Number of rows in the v_movie_2 temp view.
SELECT COUNT(1)
FROM v_movie_2;

In [0]:
# Expose the 2024-12-30 bronze extract of movie.csv as temp view "v_movie_3" for ad-hoc SQL.
(
    spark.read
    .option("header", True)
    .csv("/mnt/mymoviehistory/bronze/2024-12-30/movie.csv")
    .createOrReplaceTempView("v_movie_3")
)

In [0]:
%sql
-- Number of rows in the v_movie_3 temp view.
SELECT COUNT(1)
FROM v_movie_3;

### Ingesta del archivo "movie.csv"

In [0]:
# Show the built-in help for the widgets utility (exploratory; nothing below uses its output).
dbutils.widgets.help()

In [0]:
# Declare the "p_environment" text widget with an empty default and read its current value.
dbutils.widgets.text("p_environment","")
v_environment = dbutils.widgets.get("p_environment")

In [0]:
# Declare the "p_file_date" text widget (default "2024-12-30") and read its current value.
# v_file_date selects the dated bronze sub-folder to ingest and is stamped on every row as file_date.
dbutils.widgets.text("p_file_date","2024-12-30")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Show the environment value received from the widget.
display(v_environment)

Con esto recupero las variables definidas en el notebook `configuration` (las rutas de bronze, silver y gold), para no tener que escribirlas manualmente.

In [0]:
%run "../includes/configuration"


In [0]:
%run "../includes/commom_functions"

### Paso 1 - Leer el archivo CSV usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, DateType

In [0]:
# Explicit schema for movie.csv: one (column name, Spark type, nullable) entry per field,
# listed in column order. Only movieId is declared non-nullable.
movie_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("movieId", IntegerType(), False),
        ("title", StringType(), True),
        ("budget", DoubleType(), True),
        ("homePage", StringType(), True),
        ("overview", StringType(), True),
        ("popularity", DoubleType(), True),
        ("yearReleaseDate", IntegerType(), True),
        ("releaseDate", DateType(), True),
        ("revenue", DoubleType(), True),
        ("durationTime", IntegerType(), True),
        ("movieStatus", StringType(), True),
        ("tagline", StringType(), True),
        ("voteAverage", DoubleType(), True),
        ("voteCount", IntegerType(), True),
    )
])

In [0]:
# Read the dated bronze movie.csv with an explicit schema.
# .option("inferSchema", True) would let Spark assign column types from the data, but it is
# expensive because Spark then does two jobs: one to read and one to transform.
# The right approach is to assign the schema ourselves, since we know the accepted data.
# option("header", True) tells the reader that the first line is the header row.
movie_df = (
    spark.read
    .option("header", True)
    .schema(movie_schema)
    .csv(f"{bronze_folder_path}/{v_file_date}/movie.csv")
)
# Earlier fixed-location read, kept for reference: .csv("/mnt/mymoviehistory/bronze/movie.csv")

In [0]:
# Show the Python type of the object returned by the reader.
type(movie_df)

In [0]:
# Preview the rows read from the bronze CSV.
display(movie_df)

In [0]:
movie_df.printSchema() # show the format (type) of each field

In [0]:
display(movie_df.describe()) # show the column names with each column's count, mean, max and min

In [0]:
# List the storage mount points visible to this workspace.
display(dbutils.fs.mounts())

In [0]:
# List the contents of the bronze folder (same listing as `%fs ls`, via the dbutils API).
display(dbutils.fs.ls("/mnt/mymoviehistory/bronze"))

### Paso 2 - Seleccionar sólo las columnas "requeridas"

In [0]:
# Keep only the required columns, referenced by their names as plain strings.
movies_selected_df = movie_df.select(
    "movieId",
    "title",
    "budget",
    "popularity",
    "yearReleaseDate",
    "releaseDate",
    "revenue",
    "durationTime",
    "voteAverage",
    "voteCount",
)

In [0]:
# Same selection, referencing each column as an attribute of the DataFrame.
movies_selected_df = movie_df.select(
    movie_df.movieId,
    movie_df.title,
    movie_df.budget,
    movie_df.popularity,
    movie_df.yearReleaseDate,
    movie_df.releaseDate,
    movie_df.revenue,
    movie_df.durationTime,
    movie_df.voteAverage,
    movie_df.voteCount,
)

In [0]:
# Same selection, referencing each column with bracket (item) access on the DataFrame.
movies_selected_df = movie_df.select(
    movie_df["movieId"],
    movie_df["title"],
    movie_df["budget"],
    movie_df["popularity"],
    movie_df["yearReleaseDate"],
    movie_df["releaseDate"],
    movie_df["revenue"],
    movie_df["durationTime"],
    movie_df["voteAverage"],
    movie_df["voteCount"],
)

In [0]:
from pyspark.sql.functions import col

In [0]:
# Same selection using col(); this form also lets a column be renamed inline,
# so voteCount leaves this step already named "vote_count".
movies_selected_df = movie_df.select(
    col("movieId"),
    col("title"),
    col("budget"),
    col("popularity"),
    col("yearReleaseDate"),
    col("releaseDate"),
    col("revenue"),
    col("durationTime"),
    col("voteAverage"),
    col("voteCount").alias("vote_count"),
)

### Paso 3 - Cambiar el nombre de las columnas

In [0]:
# Rename the camelCase columns to snake_case, one withColumnRenamed call per column.
movies_renamed_df = (
    movies_selected_df
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("yearReleaseDate", "year_release_date")
    .withColumnRenamed("releaseDate", "release_date")
    .withColumnRenamed("durationTime", "duration_time")
    .withColumnRenamed("voteAverage", "vote_average")
    .withColumnRenamed("voteCount", "vote_count")
)

In [0]:
# Same renames expressed as a single old-name -> new-name mapping.
movies_renamed_df = movies_selected_df.withColumnsRenamed(
    {
        "movieId": "movie_id",
        "yearReleaseDate": "year_release_date",
        "releaseDate": "release_date",
        "durationTime": "duration_time",
        "voteAverage": "vote_average",
        "voteCount": "vote_count",
    }
)

### Paso 4 - Agregar columnas

In [0]:
from pyspark.sql.functions import current_timestamp, lit

Al crear la función, ya no hace falta esta línea:
`movies_final_df = movies_renamed_df.withColumn("ingestion_date", current_timestamp())`

In [0]:

# Add the audit columns: the shared helper is applied first (presumably adding the ingestion
# timestamp — confirm in the included functions notebook), then the environment and file date
# read from the notebook widgets are attached as literal columns.
# NOTE(review): `add_ingestion_dat` reads like a truncated `add_ingestion_date`; confirm the
# helper's actual name before renaming anything.
movies_final_df = (
    add_ingestion_dat(movies_renamed_df)
    .withColumn("environment", lit(v_environment))
    .withColumn("file_date", lit(v_file_date))
)


In [0]:
# Preview the final DataFrame before it is written to the silver layer.
display(movies_final_df)

### Paso 5 - Escribir datos en el datalake en formato "Delta"

In [0]:
# Earlier write strategy, kept for reference only (not executed):
#overwrite_partition(movies_final_df, "movie_silver", "movies", "file_date")

In [0]:
# Write this load into movie_silver.movies through the shared Delta merge helper.
# Target (tgt) and source (src) rows are matched on movie_id within the same file_date;
# "file_date" is also passed as the last argument (presumably the partition column — confirm
# against the helper's definition).
merge_condition = "tgt.movie_id = src.movie_id and tgt.file_date = src.file_date"
merge_delta_lake(
    movies_final_df,
    "movie_silver",
    "movies",
    silver_folder_path,
    merge_condition,
    "file_date",
)

In [0]:
# Earlier plain-Parquet append partitioned by file_date, kept for reference only (not executed):
#movies_final_df.write.mode("append").partitionBy("file_date").parquet(f"{silver_folder_path}/movies")

In [0]:
%sql
-- Rows stored in the silver movies table for each ingested file date.
SELECT file_date, COUNT(1)
FROM movie_silver.movies
GROUP BY file_date;

In [0]:
# Read the silver "movies" Delta data back and display it as a post-write check.
# The location is built from silver_folder_path (the same configured root passed to the merge
# step) instead of a hard-coded mount path, so this check follows the configuration.
# NOTE(review): assumes the merge helper stores the table under {silver_folder_path}/movies — confirm.
# Earlier Parquet version, kept for reference:
#display(spark.read.parquet("/mnt/mymoviehistory/silver/movies"))
display(spark.read.format("delta").load(f"{silver_folder_path}/movies"))

In [0]:
# List the files under the silver movies folder (same listing as `%fs ls`, via the dbutils API).
display(dbutils.fs.ls("/mnt/mymoviehistory/silver/movies"))

In [0]:
# Load the silver "movies" Delta data into a DataFrame for inspection.
# The location is built from silver_folder_path (the same configured root passed to the merge
# step) instead of a hard-coded mount path, so the read follows the configuration.
# NOTE(review): assumes the merge helper stores the table under {silver_folder_path}/movies — confirm.
df = spark.read.format("delta").load(f"{silver_folder_path}/movies")

In [0]:
# Show the data read back from the silver layer.
display(df)

In [0]:
# Show the DataFrame that was written, for comparison with what was read back.
display(movies_final_df)

In [0]:
# Show the original rows read from the bronze CSV, for comparison.
display(movie_df)

In [0]:
# End the notebook run, handing the string "Success" back as its exit value.
dbutils.notebook.exit("Success")