In [1]:
import os
import time
import calendar
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import col, sum
from datetime import datetime, timedelta
from dotenv import load_dotenv

In [2]:
spark = SparkSession.builder \
    .appName("Movies_70_26") \
    .master("spark://spark:7077") \
    .config("spark.executor.memory", "6g")  \
    .config("spark.executor.cores", "1") \
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("AWS_ACCESS_KEY_ID")) \
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("AWS_SECRET_ACCESS_KEY")) \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ee6907ce-0655-487b-bcd6-07e146071a49;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 249ms :: artifacts dl 5ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|d

In [3]:
# Lê todos os diretórios que começam com 'movie_' na camada bronze
df = spark.read.parquet("s3a://bronze/movie_*")

# Remove linhas com IDs duplicados mantendo a primeira ocorrência
df_deduplicado = df.dropDuplicates(["id"])

# Opcional: Verificar o resultado
print(f"Registros originais: {df.count()}")
print(f"Registros após remoção de duplicados: {df_deduplicado.count()}")

df.show(10)

25/02/28 03:15:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

Registros originais: 748896


                                                                                

Registros após remoção de duplicados: 748896
+-------+--------------------+--------------------+------------+------------+------------------+
|     id|               title|            overview|release_date|vote_average|         genre_ids|
+-------+--------------------+--------------------+------------+------------+------------------+
| 113727|      Dark Seduction|Laura is a succes...|  2010-01-01|         7.0|   [18, 53, 10749]|
|1339825|The Cast of 'Lost...|Retrospective on ...|  2010-01-01|         0.0| [18, 14, 99, 878]|
| 333926|        Online Crush|Love is just a mo...|  2010-01-01|         4.8|              [18]|
|  43615|Lula, the Son of ...|The true story of...|  2010-01-01|         5.5|          [18, 36]|
|  67250|                  12|12, is a comic pa...|  2010-01-01|       5.009|              [35]|
|  68712|  The Lazarus Papers|Lonny Smith, a yo...|  2010-01-01|         4.0| [18, 28, 53, 878]|
| 348521|           Love/Loss|A chance meeting ...|  2010-01-01|         0.0|[1074

In [4]:
df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()



+---+-----+--------+------------+------------+---------+
| id|title|overview|release_date|vote_average|genre_ids|
+---+-----+--------+------------+------------+---------+
|  0|    0|       0|           0|           0|        0|
+---+-----+--------+------------+------------+---------+



                                                                                

In [5]:
# Escreve no MinIO
df.write \
  .format("parquet") \
  .mode("overwrite") \
  .save("s3a://silver/movies_70_26/")

print("Escrita no MinIO concluída com sucesso!")

# Para verificação, lê os dados escritos
print("Lendo dados do MinIO para verificação...")
df_read = spark.read.parquet("s3a://silver/movies_70_26/")
df_read.show(5)

spark.stop()

                                                                                

Escrita no MinIO concluída com sucesso!
Lendo dados do MinIO para verificação...


                                                                                

+-------+--------------------+--------------------+------------+------------+-----------------+
|     id|               title|            overview|release_date|vote_average|        genre_ids|
+-------+--------------------+--------------------+------------+------------+-----------------+
| 113727|      Dark Seduction|Laura is a succes...|  2010-01-01|         7.0|  [18, 53, 10749]|
|1339825|The Cast of 'Lost...|Retrospective on ...|  2010-01-01|         0.0|[18, 14, 99, 878]|
| 333926|        Online Crush|Love is just a mo...|  2010-01-01|         4.8|             [18]|
|  43615|Lula, the Son of ...|The true story of...|  2010-01-01|         5.5|         [18, 36]|
|  67250|                  12|12, is a comic pa...|  2010-01-01|       5.009|             [35]|
+-------+--------------------+--------------------+------------+------------+-----------------+
only showing top 5 rows

