In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=bd42819e4f06a8b7e191eb6e1250e6e11707bc720af0df74de0483085c6d4afe
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Lendo JSON com colunas selecionadas e convertendo a coluna genres de array para string

In [9]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, explode, array_contains, concat_ws, to_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from datetime import datetime

# Spark configuration: application name "TMDB", running on a local master.
conf = SparkConf().setAppName("TMDB").setMaster("local")
sc = SparkContext.getOrCreate(conf)
# Obtain the session through the builder so that re-running this cell reuses
# the active session instead of constructing a new SparkSession object.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Input file on the mounted Google Drive (this prototype does not read from S3).
MOVIES_JSON_PATH = '/content/drive/MyDrive/Colab Notebooks/desafio_parte3/filmes_popular.json'

# Load and normalise the movies:
#   - multiLine=True lets records span several lines (the file is not JSON Lines);
#   - the schema is inferred by the reader (e.g. id -> long, popularity -> double);
#   - `genres` (array of names) is flattened to one comma-separated string;
#   - `release_date` is parsed from a "yyyy-MM-dd" string into a date.
df = (
    spark.read.json(MOVIES_JSON_PATH, multiLine=True)
    .withColumn("genres", concat_ws(", ", col("genres")))
    .withColumn("release_date", to_date(col("release_date"), "yyyy-MM-dd"))
)

# Keep only horror movies. `genres` is already a string at this point, so the
# filter is a substring match on the genre name ("Terror").
df_horror = (
    df.where(col("genres").contains("Terror"))
    .select("id", "title", "overview", "genres", "popularity", "release_date")
)

# Inspect the resulting schema and the first 20 rows ordered by id.
df_horror.printSchema()
df_horror.orderBy("id").show(20)


root
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- genres: string (nullable = false)
 |-- popularity: double (nullable = true)
 |-- release_date: date (nullable = true)

+-----+--------------------+--------------------+--------------------+----------+------------+
|   id|               title|            overview|              genres|popularity|release_date|
+-----+--------------------+--------------------+--------------------+----------+------------+
| 1450|Caçadores de Vamp...|No filme, Saya, é...|Ação, Aventura, T...|      8.81|  2009-04-02|
| 1977|           O Grito 3|Jake (Matthew Kni...|Mistério, Terror,...|    15.043|  2009-05-12|
| 3511|            Reflexos|Gina McVey, como ...|Thriller, Drama, ...|     8.136|  2008-11-15|
| 4627|Prisioneiros da M...|Um órfão de 14 an...|Ação, Fantasia, T...|     7.566|  2008-09-07|
| 5489|             Neowolf|                    |    Terror, Thriller|     1.865|  2010-04-19|
| 

In [10]:
# Number of rows in the horror-filtered DataFrame.
df_horror.count()

10000

## Montando o job do AWS Glue para o JSON

In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext, SparkConf
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import from_json, col, explode, array_contains, concat_ws, to_date
from datetime import datetime

## @params: [JOB_NAME, S3_RAW_PATH, S3_TRUSTED_PATH]
# JOB_NAME        - name used to initialise the Glue job
# S3_RAW_PATH     - location of the raw TMDB movies JSON
# S3_TRUSTED_PATH - destination for the filtered Parquet output
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'S3_RAW_PATH', 'S3_TRUSTED_PATH'])

# Glue supplies the Spark runtime, so no master/app configuration is set here.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read the raw movies JSON from the path passed to the job.
# multiLine=True lets records span several lines (the file is not JSON Lines).
df = spark.read.json(args['S3_RAW_PATH'], multiLine=True)

# Flatten the `genres` array into a single comma-separated string.
df = df.withColumn("genres", concat_ws(", ", col("genres")))

# Parse `release_date` from a "yyyy-MM-dd" string into a date.
df = df.withColumn("release_date", to_date(col("release_date"), "yyyy-MM-dd"))

# Keep only horror movies. `genre_ids` is still an array here, so the filter
# matches on the numeric genre id (27) rather than on the flattened names.
df_horror = (
    df.where(array_contains(col("genre_ids"), 27))
    .select("id", "title", "overview", "genres", "popularity", "release_date")
)

# Write the result as Parquet to the trusted path, replacing any previous output.
df_horror.write.mode("overwrite").parquet(args['S3_TRUSTED_PATH'])

job.commit()

In [None]:
!pip install tmdbv3api

Collecting tmdbv3api
  Downloading tmdbv3api-1.7.7-py2.py3-none-any.whl (18 kB)
Installing collected packages: tmdbv3api
Successfully installed tmdbv3api-1.7.7


In [None]:
from tmdbv3api import TMDb, Discover, Genre
from datetime import datetime, timedelta
import json
import logging
import os


# Date range: the last 15 calendar years, ending now.
end_date = datetime.now()
try:
    start_date = end_date.replace(year=end_date.year - 15)
except ValueError:
    # Today is Feb 29 and the target year is not a leap year.
    start_date = end_date.replace(year=end_date.year - 15, day=28)

# Read the API key from a file kept outside the notebook; strip the trailing
# newline/whitespace that text editors usually leave at the end of the file.
with open('/content/drive/MyDrive/Colab Notebooks/desafio_parte3/api_key_tmbd.txt') as arquivo:
    api_key = arquivo.read().strip()

tmdb = TMDb()
tmdb.api_key = api_key
tmdb.language = 'pt-BR'

# TMDB helpers used to find the movies and to resolve genre names.
discover = Discover()
genre = Genre()

# Fetch the list of movie genres.
genres = genre.movie_list()

# Map genre id -> genre name.
genre_dict = {g['id']: g['name'] for g in genres}

# Discover filters: release-date window and genre.
params = {
    'primary_release_date.gte': start_date.strftime('%Y-%m-%d'),
    'primary_release_date.lte': end_date.strftime('%Y-%m-%d'),
    'with_genres': 27 # horror genre id
}

# Number of result pages to request.
MAX_PAGES = 500

# `count` tallies how many movies were returned; `movies_data` collects them.
count = 0
movies_data = []

for page in range(1, MAX_PAGES + 1):
    params['page'] = page
    movies = discover.discover_movies(params)
    if len(movies) == 0:
        # An empty page means the results are exhausted; stop requesting more.
        break
    count += len(movies)

    for movie in movies:
        # Copy the movie's attributes and add the genre names for its ids.
        movie_data = movie.__dict__.copy()
        movie_data['genres'] = [genre_dict[g] for g in movie['genre_ids']]
        movies_data.append(movie_data)

# Persist all collected movies as a single UTF-8 JSON document.
with open('movies500.json', 'w', encoding='utf-8') as f:
    json.dump(movies_data, f, ensure_ascii=False, indent=4)



In [None]:
# Display the total number of movies tallied in `count` by the page loop.
count

10000