In [1]:
from pyspark.sql import SparkSession, functions as F
from datetime import datetime

In [2]:
# Create (or reuse, if one already exists) a local Spark session for this
# notebook. "local[*]" runs Spark in-process using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Exercicio Intro")
    .getOrCreate()
)

In [3]:
# Read the TMDB movie records from the relative "tmdb_movies" path as JSON,
# decoding the text as UTF-8, then preview the first rows.
df = (
    spark.read
    .option("encoding", "UTF-8")
    .json("tmdb_movies")
)
df.show()

+--------+--------------------+-------+----------+----------+------------+-------+-------+--------------------+------------+----------+
|  budget|              genres|     id|   imdb_id|popularity|release_date|revenue|runtime|               title|vote_average|vote_count|
+--------+--------------------+-------+----------+----------+------------+-------+-------+--------------------+------------+----------+
|       0|[{35, Comedy}, {1...|  82720| tt0268042|     1.068|  1994-04-24|      0|    150|           Yamaleela|         7.6|         5|
|       0|[{12, Adventure},...|  29813| tt0297149|     2.638|  1974-05-15|      0|     87|      Les gloutonnes|         2.7|         3|
|       0|[{18, Drama}, {87...| 112044| tt1202515|       0.6|  2008-09-19|      0|     90|            Atlantis|         6.0|         1|
|       0|       [{18, Drama}]| 696567| tt0875003|      0.87|  1993-01-01|      0|     82|    Day of the Angel|         6.0|         1|
|       0|[{14, Fantasy}, {...|  53004| tt092615

In [4]:
# Print the schema Spark inferred from the JSON input.
df.printSchema()
# Last expression of the cell: the notebook displays the total row count.
df.count()

root
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: long (nullable = true)
 |-- runtime: long (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: long (nullable = true)



12875

In [5]:
# Drop the nested `genres` array and parse `release_date` (inferred as a
# string) into a proper date column using the yyyy-MM-dd pattern.
df = (
    df.drop("genres")
    .withColumn("release_date", F.to_date(F.col("release_date"), "yyyy-MM-dd"))
)

# Re-inspect rows, schema and row count after the transformation.
df.show()
df.printSchema()
df.count()

+--------+-------+----------+----------+------------+-------+-------+--------------------+------------+----------+
|  budget|     id|   imdb_id|popularity|release_date|revenue|runtime|               title|vote_average|vote_count|
+--------+-------+----------+----------+------------+-------+-------+--------------------+------------+----------+
|       0|  82720| tt0268042|     1.068|  1994-04-24|      0|    150|           Yamaleela|         7.6|         5|
|       0|  29813| tt0297149|     2.638|  1974-05-15|      0|     87|      Les gloutonnes|         2.7|         3|
|       0| 112044| tt1202515|       0.6|  2008-09-19|      0|     90|            Atlantis|         6.0|         1|
|       0| 696567| tt0875003|      0.87|  1993-01-01|      0|     82|    Day of the Angel|         6.0|         1|
|       0|  53004| tt0926151|     2.512|  2006-11-30| 291243|     85| Legend of Sudsakorn|         3.7|         3|
|       0| 357917|tt10378618|     3.366|  1985-09-03|      0|     72|Orochi Stri

12875

In [6]:
# Keep only movies whose revenue, budget and runtime are all non-zero.
# Chained `where` calls are combined with AND, so this matches a single
# filter with the three conditions joined by `&`.
df_novo = (
    df
    .where(F.col("revenue") != 0)
    .where(F.col("budget") != 0)
    .where(F.col("runtime") != 0)
)
df_novo.show()
df_novo.count()

+---------+------+---------+----------+------------+----------+-------+--------------------+------------+----------+
|   budget|    id|  imdb_id|popularity|release_date|   revenue|runtime|               title|vote_average|vote_count|
+---------+------+---------+----------+------------+----------+-------+--------------------+------------+----------+
| 15500000| 37725|tt0482527|    15.396|  2006-07-15|   7385434|     95|It's a Boy Girl T...|         6.5|       997|
|   900000|  1694|tt0089885|    13.891|  1985-10-18|   2023414|     86|         Re-Animator|       7.051|      1371|
| 13749400|477489|tt5092380|    16.637|  2018-04-04|   2842040|     90|  Just a Breath Away|         6.3|       751|
| 35000000|  5549|tt0100502|    24.013|  1990-06-22|  45681173|    117|           RoboCop 2|         6.0|      1769|
| 36500000| 16911|tt0972558|    12.092|  2008-12-18|  21834845|    115|The Inhabited Island|         5.5|        95|
| 13000000| 10546|tt0115986|    30.181|  1996-08-29|  17917287| 

1407

In [7]:
# Keep only movies with at least 30 votes.
df_novo = df_novo.where(F.col("vote_count") >= 30)

# Preview the rows sorted by ascending vote count (lowest counts first),
# then display how many rows remain.
df_novo.sort("vote_count").show()
df_novo.count()

+--------+------+----------+----------+------------+--------+-------+--------------------+------------+----------+
|  budget|    id|   imdb_id|popularity|release_date| revenue|runtime|               title|vote_average|vote_count|
+--------+------+----------+----------+------------+--------+-------+--------------------+------------+----------+
|  620000|330418| tt4806232|     3.239|  2015-06-26| 1400000|    146|  Indru Netru Naalai|         7.7|        31|
| 4000000| 36775| tt0062657|     5.773|  1967-12-22|10000000|    102|       The Ambushers|         5.0|        31|
|    3000| 65104| tt0466103|     1.113|  2005-06-17|   24000|    107|                Dust|         5.6|        31|
| 4000000| 36271| tt0116770|     4.602|  1996-11-15|  219449|     98|       Killer Tongue|         5.4|        32|
| 5000000| 76516| tt0100129|     3.641|  1990-11-08|  485772|     90| Meet the Applegates|         5.5|        32|
| 1500000| 16427| tt0377744|     2.528|  2004-05-14|10057021|     98|A Day Witho

1316

In [8]:
def extrair_data_do_caminho(caminho):
    """Extract the date encoded in the last three components of a path.

    The path is expected to end with ``.../YYYY/MM/DD``, with or without
    trailing slashes (e.g. ``s3://bucket/Raw/TMDB/JSON/2024/03/27/``).

    Args:
        caminho: Path or URI string whose final three components are the
            year, month and day.

    Returns:
        A ``datetime.date`` built from those three components.

    Raises:
        ValueError: If the path has fewer than three components, or if the
            last three components do not form a valid ``YYYY-MM-DD`` date.
    """
    # Strip trailing slashes first so the date is always the last three
    # components, regardless of whether the path ends with "/".
    partes = caminho.rstrip("/").split("/")
    if len(partes) < 3:
        raise ValueError(
            f"Path does not end with a YYYY/MM/DD date: {caminho!r}"
        )

    ano, mes, dia = partes[-3:]
    return datetime.strptime(f"{ano}-{mes}-{dia}", "%Y-%m-%d").date()


# Location of the raw extraction; its last three components are the
# extraction date in "YYYY/MM/DD" form.
caminho = "s3://jvitor-desafio/Raw/TMDB/JSON/2024/03/27/"

data = extrair_data_do_caminho(caminho)

print("Data extraída:", data)


Data extraída: 2024-03-27


In [9]:
# Add an `extraction_date` column holding the same literal value (`data`,
# the date parsed from the path) on every row.
df_novo = df_novo.withColumn("extraction_date", F.lit(data))

In [10]:
# Preview the filtered data, now including the `extraction_date` column,
# and print its schema.
df_novo.show()
df_novo.printSchema()

+---------+------+---------+----------+------------+----------+-------+--------------------+------------+----------+---------------+
|   budget|    id|  imdb_id|popularity|release_date|   revenue|runtime|               title|vote_average|vote_count|extraction_date|
+---------+------+---------+----------+------------+----------+-------+--------------------+------------+----------+---------------+
| 15500000| 37725|tt0482527|    15.396|  2006-07-15|   7385434|     95|It's a Boy Girl T...|         6.5|       997|     2024-03-27|
|   900000|  1694|tt0089885|    13.891|  1985-10-18|   2023414|     86|         Re-Animator|       7.051|      1371|     2024-03-27|
| 13749400|477489|tt5092380|    16.637|  2018-04-04|   2842040|     90|  Just a Breath Away|         6.3|       751|     2024-03-27|
| 35000000|  5549|tt0100502|    24.013|  1990-06-22|  45681173|    117|           RoboCop 2|         6.0|      1769|     2024-03-27|
| 36500000| 16911|tt0972558|    12.092|  2008-12-18|  21834845|    11

In [11]:
# Show summary statistics (count, mean, stddev, ...) for the filtered data.
# The date columns do not appear in the resulting table.
df_novo.describe().show()

+-------+-------------------+------------------+---------+------------------+--------------------+------------------+--------------------+------------------+------------------+
|summary|             budget|                id|  imdb_id|        popularity|             revenue|           runtime|               title|      vote_average|        vote_count|
+-------+-------------------+------------------+---------+------------------+--------------------+------------------+--------------------+------------------+------------------+
|  count|               1316|              1316|     1316|              1316|                1316|              1316|                1316|              1316|              1316|
|   mean|  4.5621839943769E7|105134.17021276595|     NULL| 30.12173100303951|1.3586896763677812E8| 107.8966565349544|              1495.6| 6.277963525835866|3289.6436170212764|
| stddev|5.851431836267712E7|172668.52421473403|     NULL|33.069584051749466|2.6194405985221502E8|18.40598235714061