In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/commom_functions"

In [0]:
# Echo the silver-layer base path as a quick sanity check before running the ingestion.
# NOTE(review): presumably defined by the "../includes/configuration" notebook run above — confirm.
silver_folder_path

# Ingesta de archivo

## Paso 1: Lectura de archivo CSV

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, DoubleType, StringType
from pyspark.sql.functions import col, current_timestamp, lit

In [0]:
from pathlib import Path

In [0]:
# Load the raw movie CSV from the bronze layer.
# Reader settings: header row present, comma-separated, double-quote quoting with a
# backslash escape character, schema inferred from the data, multi-line records
# enabled, leading/trailing whitespace ignored, empty strings treated as null, and
# PERMISSIVE parsing mode.
movie_df = (
    spark.read
    .options(
        header="True",
        sep=",",
        quote='"',
        inferSchema="True",
        escape="\\",
        multiLine=True,
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=True,
        nullValue="",
        mode="PERMISSIVE",
    )
    .csv(f"{bronze_folder_path}/movie.csv")
)

## Paso 2: Seleccionar columnas requeridas

### Opción 1: nombres de columna como cadenas

In [0]:
# Option 1: select the required columns by passing their names as plain strings.
movies_selected_df = movie_df.select(
    "movieId",
    "title",
    "budget",
    "popularity",
    "yearReleaseDate",
    "releaseDate",
    "revenue",
    "durationTime",
    "voteAverage",
    "voteCount",
)

In [0]:
# Print the first rows of the selection to the cell output as a plain-text table.
movies_selected_df.show()

### Opción 2: acceso por atributo (`movie_df.columna`)

In [0]:
# Option 2: reference columns as DataFrame attributes (movie_df.<name>); attribute
# references and plain string names can be mixed in the same select call.
movies_selected_df = movie_df.select(
    movie_df.movieId,
    movie_df.title,
    movie_df.budget,
    "popularity",
    "yearReleaseDate",
    "releaseDate",
    "revenue",
    "durationTime",
    "voteAverage",
    "voteCount",
)

### Opción 3: acceso por índice (`movie_df["columna"]`)

In [0]:
# Option 3: reference columns with bracket indexing (movie_df["<name>"]); bracket
# references and plain string names can be mixed in the same select call.
movies_selected_df = movie_df.select(
    movie_df["movieId"],
    movie_df["title"],
    "budget",
    "popularity",
    "yearReleaseDate",
    "releaseDate",
    "revenue",
    "durationTime",
    "voteAverage",
    "voteCount",
)

In [0]:
# Evaluates (and echoes) the parent of the current working directory.
# NOTE(review): the result is not used by any other cell shown here — looks like
# leftover exploration; confirm before removing.
Path.cwd().parent

### Opción 4: función `col()`

In [0]:
# Option 4: build every column reference with pyspark.sql.functions.col().
movies_selected_df = movie_df.select(
    col("movieId"),
    col("title"),
    col("budget"),
    col("popularity"),
    col("yearReleaseDate"),
    col("releaseDate"),
    col("revenue"),
    col("durationTime"),
    col("voteAverage"),
    col("voteCount"),
)

In [0]:
# Render the selected columns with the notebook's display() helper.
display(movies_selected_df)

## Paso 3: Renombrar columnas

### Opción 1: `withColumnRenamed` encadenado

In [0]:
# Rename the camelCase source columns to snake_case, one rename per call.
# Columns not listed here keep their original names.
movies_renamed_df = (
    movies_selected_df
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("releaseDate", "release_date")
    .withColumnRenamed("yearReleaseDate", "year_release_date")
    .withColumnRenamed("durationTime", "duration_time")
    .withColumnRenamed("voteAverage", "vote_average")
    .withColumnRenamed("voteCount", "vote_count")
)

In [0]:
# Render the renamed DataFrame to verify the new column names.
display(movies_renamed_df)

## Paso 4: Agregar Columnas

### Opción 1: `withColumn` encadenado

In [0]:
# Alternative kept for reference (not executed): add both audit columns inline with
# chained withColumn calls instead of going through add_ingestion_date.
#movies_final_df = movies_renamed_df.withColumn("ingestion_date", current_timestamp()) \
#                .withColumn("env", lit("python"))

In [0]:
# Pass the renamed DataFrame through add_ingestion_date, then tag every row with a
# constant "env" column.
# NOTE(review): add_ingestion_date is not defined in this notebook; presumably it
# comes from the "../includes/commom_functions" notebook run at the top — confirm.
movies_final_df = (
    add_ingestion_date(movies_renamed_df)
    .withColumn("env", lit("python"))
)

### Opción 2: `withColumns` (varias columnas en una sola llamada)

In [0]:
# Alternative kept for reference (not executed): add both columns in a single
# withColumns call by passing a {column_name: expression} dict.
# NOTE(review): requires a Spark version that provides DataFrame.withColumns — confirm for the target cluster.
#movies_final_df = movies_renamed_df.withColumns({"ingestion_date":current_timestamp(), "env": lit("python")})

In [0]:
# Render the final DataFrame, including the added columns, before writing it out.
display(movies_final_df)

## Paso 5: Escribir datos en el datalake en formato Parquet

In [0]:
# Persist the final DataFrame as Parquet under the silver layer; "overwrite" mode
# replaces any existing output at that path.
movies_final_df.write.parquet(f"{silver_folder_path}/movies", mode="overwrite")

In [0]:
# List the files under the silver "movies" folder to confirm the write above.
# The path is built from silver_folder_path (the same expression used by the write)
# rather than a hard-coded mount point, so the listing always targets the directory
# that was actually written, whatever the configured storage location is.
display(dbutils.fs.ls(f"{silver_folder_path}/movies"))

## Paso 6: Consultar Datalake

In [0]:
# Read the Parquet output back from the silver layer to verify it.
# The path is built from silver_folder_path (the same expression used by the write)
# instead of a hard-coded mount point, so this always reads the dataset that was
# just written, whatever the configured storage location is.
df = spark.read.parquet(f"{silver_folder_path}/movies")

In [0]:
# Render the data read back from the datalake.
display(df)

In [0]:
# End the notebook run and hand the status message back as its exit value.
# NOTE(review): presumably consumed by a parent notebook or job that invokes this one — confirm.
dbutils.notebook.exit("Ejecucion exitosa!!!")