In [1]:
from pyspark.sql import SparkSession, functions as F
from datetime import datetime

In [2]:
# Create (or reuse) the Spark session for this notebook. The "local[*]" master
# runs Spark in-process, with one worker thread per logical core.
spark = (
    SparkSession.builder
    .appName("Processamento Trusted TMDB")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Location of the raw-layer TMDB JSON files for the 2024/03/27 partition.
input_path = 'assets/data/Layers/Raw/TMDB/JSON/2024/03/27'

# Read the JSON files as UTF-8 text and let Spark infer the schema.
df = (
    spark.read
    .format("json")
    .option("encoding", "UTF-8")
    .load(input_path)
)

# Preview the first rows of the raw data.
df.show()

+--------+--------------------+------+----------+----------+------------+---------+-------+--------------------+------------+----------+
|  budget|              genres|    id|   imdb_id|popularity|release_date|  revenue|runtime|               title|vote_average|vote_count|
+--------+--------------------+------+----------+----------+------------+---------+-------+--------------------+------------+----------+
|       0|[{27, Horror}, {1...|324787| tt3591944|     3.569|  2015-02-13|        0|     81|Little Red Riding...|         3.6|         9|
| 2310000|[{14, Fantasy}, {...|576026| tt9875852|     7.997|  2019-04-11|        0|     90|       The House Elf|         7.0|        68|
|       0|[{27, Horror}, {9...|614372| tt6472234|       0.6|  2019-07-09|        0|     98|        Landing Lake|         7.3|         3|
|       0|[{28, Action}, {3...|808516| tt0015053|     1.024|  1924-11-23|        0|     59|  Laughing at Danger|         6.5|         1|
|       0|[{27, Horror}, {9...| 58432| tt

In [4]:
# Inspect the raw DataFrame: print the inferred column types, then leave the
# row count as the last expression so the notebook displays it.
df.printSchema()
df.count()

root
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: long (nullable = true)
 |-- runtime: long (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: long (nullable = true)



12876

In [5]:
# Remove the nested `genres` array and convert `release_date` from text in
# yyyy-MM-dd form into a proper date column, in a single chained transformation.
df = (
    df
    .drop("genres")
    .withColumn("release_date", F.to_date(F.col("release_date"), "yyyy-MM-dd"))
)

# Check the result: sample rows, updated schema, and row count.
df.show()
df.printSchema()
df.count()

+--------+------+----------+----------+------------+---------+-------+--------------------+------------+----------+
|  budget|    id|   imdb_id|popularity|release_date|  revenue|runtime|               title|vote_average|vote_count|
+--------+------+----------+----------+------------+---------+-------+--------------------+------------+----------+
|       0|324787| tt3591944|     3.569|  2015-02-13|        0|     81|Little Red Riding...|         3.6|         9|
| 2310000|576026| tt9875852|     7.997|  2019-04-11|        0|     90|       The House Elf|         7.0|        68|
|       0|614372| tt6472234|       0.6|  2019-07-09|        0|     98|        Landing Lake|         7.3|         3|
|       0|808516| tt0015053|     1.024|  1924-11-23|        0|     59|  Laughing at Danger|         6.5|         1|
|       0| 58432| tt0484273|      5.12|  2006-10-06|        0|     92|       The Dark Hour|         5.6|        51|
|       0| 73624| tt2027178|     8.501|  2011-09-02|  5799606|    106|Th

12876

In [6]:
# Keep only rows whose revenue, budget and runtime are all different from zero.
# Each `where` narrows the previous result, so the three chained predicates are
# combined with a logical AND.
df_novo = (
    df
    .where(F.col("revenue") != 0)
    .where(F.col("budget") != 0)
    .where(F.col("runtime") != 0)
)

# Preview the surviving rows and display how many remain.
df_novo.show()
df_novo.count()

+---------+------+---------+----------+------------+----------+-------+--------------------+------------+----------+
|   budget|    id|  imdb_id|popularity|release_date|   revenue|runtime|               title|vote_average|vote_count|
+---------+------+---------+----------+------------+----------+-------+--------------------+------------+----------+
| 55000000|   795|tt0120632|    21.145|  1998-04-10| 198685114|    114|      City of Angels|         6.8|      2022|
|  8000000|  9507|tt0088024|      9.81|  1984-12-14|   6770587|    100|             Runaway|         5.5|       226|
|  2020000| 30145|tt0151572|      6.83|  1998-12-18|   4374994|    104|The Miracle of P....|         6.7|        98|
|300000000|299536|tt4154756|   193.962|  2018-04-25|2052415039|    149|Avengers: Infinit...|         8.2|     28661|
| 30000000|  1450|tt0806027|    15.018|  2009-04-02|   5874530|     91|Blood: The Last V...|         5.7|       327|
|200000000|   296|tt0181852|    84.467|  2003-07-02| 435000000| 

1407

In [7]:
# Keep only rows with at least 30 votes.
df_novo = df_novo.where(F.col("vote_count") >= 30)

# Show the rows with the fewest votes first to confirm the threshold, then
# display the remaining row count.
df_novo.sort("vote_count").show()
df_novo.count()

+--------+------+----------+----------+------------+--------+-------+--------------------+------------+----------+
|  budget|    id|   imdb_id|popularity|release_date| revenue|runtime|               title|vote_average|vote_count|
+--------+------+----------+----------+------------+--------+-------+--------------------+------------+----------+
|    3000| 65104| tt0466103|     1.113|  2005-06-17|   24000|    107|                Dust|         5.6|        31|
| 4000000| 36775| tt0062657|     5.773|  1967-12-22|10000000|    102|       The Ambushers|         5.0|        31|
|  620000|330418| tt4806232|     3.239|  2015-06-26| 1400000|    146|  Indru Netru Naalai|         7.7|        31|
| 4000000| 36271| tt0116770|     4.602|  1996-11-15|  219449|     98|       Killer Tongue|         5.4|        32|
| 5000000| 76516| tt0100129|     3.641|  1990-11-08|  485772|     90| Meet the Applegates|         5.5|        32|
| 3000000| 62732| tt1414840|     1.684|  2008-12-23|17850711|    100|       Love

1316

In [8]:
def parse_partition_parts(path):
    """Return the trailing ``(year, month, day)`` components of a partition path.

    The path is expected to end in ``.../YYYY/MM/DD``, with or without a
    trailing slash. Trailing slashes are removed before splitting, so the
    result does not depend on whether the path ends in ``/``.

    Args:
        path: Slash-separated path whose last three components are the
            year, month and day.

    Returns:
        A ``(year, month, day)`` tuple of strings, exactly as they appear in
        the path.

    Raises:
        ValueError: If the path has fewer than three components.
    """
    components = path.rstrip("/").split("/")
    if len(components) < 3:
        raise ValueError(f"Path does not end in YYYY/MM/DD: {path!r}")
    return tuple(components[-3:])


caminho = "s3://jvitor-desafio/Raw/TMDB/JSON/2024/03/27/"

# Path components without the empty entry that a trailing slash would produce.
partes = caminho.rstrip("/").split("/")

# The last three components of the path are the date, laid out as YYYY/MM/DD.
ano, mes, dia = parse_partition_parts(caminho)

data_str = f"{ano}-{mes}-{dia}"

# strptime validates the components and raises ValueError if they do not form
# a real calendar date.
data = datetime.strptime(data_str, "%Y-%m-%d").date()

print("Data extraída:", data)


Data extraída: 2024-03-27


In [9]:
# Add an "extraction_date" column holding the same literal value on every row:
# the `data` date parsed from the raw-layer path in the previous cell.
df_novo = df_novo.withColumn("extraction_date", F.lit(data))

In [10]:
# Check the filtered DataFrame after adding extraction_date: preview the rows
# and print the schema.
df_novo.show()
df_novo.printSchema()

+---------+------+---------+----------+------------+----------+-------+--------------------+------------+----------+---------------+
|   budget|    id|  imdb_id|popularity|release_date|   revenue|runtime|               title|vote_average|vote_count|extraction_date|
+---------+------+---------+----------+------------+----------+-------+--------------------+------------+----------+---------------+
| 55000000|   795|tt0120632|    21.145|  1998-04-10| 198685114|    114|      City of Angels|         6.8|      2022|     2024-03-27|
|  8000000|  9507|tt0088024|      9.81|  1984-12-14|   6770587|    100|             Runaway|         5.5|       226|     2024-03-27|
|  2020000| 30145|tt0151572|      6.83|  1998-12-18|   4374994|    104|The Miracle of P....|         6.7|        98|     2024-03-27|
|300000000|299536|tt4154756|   193.962|  2018-04-25|2052415039|    149|Avengers: Infinit...|         8.2|     28661|     2024-03-27|
| 30000000|  1450|tt0806027|    15.018|  2009-04-02|   5874530|     9

In [11]:
# Print summary statistics (count, mean, stddev, ...) for the filtered data as
# a quick sanity check on value ranges.
df_novo.describe().show()

+-------+-------------------+------------------+---------+------------------+--------------------+------------------+--------------------+------------------+------------------+
|summary|             budget|                id|  imdb_id|        popularity|             revenue|           runtime|               title|      vote_average|        vote_count|
+-------+-------------------+------------------+---------+------------------+--------------------+------------------+--------------------+------------------+------------------+
|  count|               1316|              1316|     1316|              1316|                1316|              1316|                1316|              1316|              1316|
|   mean|  4.5621839943769E7|105134.17021276595|     NULL| 30.12173100303951|1.3586896763677812E8| 107.8966565349544|              1495.6| 6.278130699088145|3289.6838905775076|
| stddev|5.851431836267711E7| 172668.5242147341|     NULL|33.069584051749466|2.6194405985221502E8|18.40598235714061