In [6]:
#%pip install requests

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, udf
from pyspark.sql.types import ArrayType, StringType


import time

# SparkSession 초기화
spark = SparkSession.builder \
    .appName("TMDb Movie") \
    .getOrCreate()

In [9]:
import requests, pandas as pd

url = "https://api.themoviedb.org/3/authentication"
headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIwMDE3OWM5ODdkNGMyZTc1NGJjNjA4ZDdhZjJmZWYyYiIsInN1YiI6IjY2NmQ1ODA1MGRjZDkwNTFmMmVjMTk3ZCIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.Rx4kaiPk0d7-wATgXeWX20Lc3jMdzH_h1ZXkqdYh5Rk"
}
response = requests.get(url, headers=headers)
print(response.text)

{"success":true,"status_code":1,"status_message":"Success."}


## 영화 장르 종류 확인

In [132]:
url_genre = "https://api.themoviedb.org/3/genre/movie/list?language=en"
response_genre = requests.get(url_genre, headers=headers)
genre = spark.createDataFrame(response_genre.json()['genres'])
gd = dict(zip([x.get("id") for x in response_genre.json()['genres']], [x.get("name") for x in response_genre.json()['genres']]))

In [135]:
genre.show()

+-----+---------------+
|   id|           name|
+-----+---------------+
|   28|         Action|
|   12|      Adventure|
|   16|      Animation|
|   35|         Comedy|
|   80|          Crime|
|   99|    Documentary|
|   18|          Drama|
|10751|         Family|
|   14|        Fantasy|
|   36|        History|
|   27|         Horror|
|10402|          Music|
| 9648|        Mystery|
|10749|        Romance|
|  878|Science Fiction|
|10770|       TV Movie|
|   53|       Thriller|
|10752|            War|
|   37|        Western|
+-----+---------------+



## 영화 리스트 확인
- 500페이지까지 존재하기 때문에 500페이지 분량의 내용을 데이터프레임화

In [215]:
url = "https://api.themoviedb.org/3/discover/movie?include_adult=True&include_video=false&language=en-US&page=1&sort_by=popularity.desc"
response = requests.get(url, headers=headers)
df = spark.createDataFrame(response.json()["results"])

In [217]:
start = time.time()
for i in range(2,501) :
    url = "https://api.themoviedb.org/3/discover/movie?include_adult=True&include_video=false&language=en-US&page={}&sort_by=popularity.desc".format(i)
    response = requests.get(url, headers=headers)
    df = df.union(spark.createDataFrame(response.json()["results"]))
end = time.time()
print(f"{end - start:.5f} sec")

74.01920 sec


## 데이터 전처리

In [218]:
# 사용할 컬럼만 설정

df = df.select(
    col("id"),
    col("title"),
    col("release_date"),
    col("genre_ids"),
    col("original_language"),
    col("adult"),
    col("popularity"),
    col("vote_average"),
    col("vote_count"),
)

In [130]:
start = time.time()
df.count()
end = time.time()
print(f"{end - start:.5f} sec")

131.84892 sec


### df의 성인 표기 변경

In [219]:
df = df.withColumn("adult", when(col("adult") == True, 1).otherwise(0))

### df의 장르 표기 변경

In [220]:
def map_list(lst) :
    return [gd.get(x, None) for x in lst]

map_list_udf = udf(map_list, ArrayType(StringType()))
df = df.withColumn("genre_ids", map_list_udf(col("genre_ids")))

In [221]:
df.show()

+-------+--------------------+------------+--------------------+-----------------+-----+----------+------------+----------+
|     id|               title|release_date|           genre_ids|original_language|adult|popularity|vote_average|vote_count|
+-------+--------------------+------------+--------------------+-----------------+-----+----------+------------+----------+
|1022789|        Inside Out 2|  2024-06-11|[Animation, Famil...|               en|    0|  9017.097|       7.897|       261|
| 653346|Kingdom of the Pl...|  2024-05-08|[Science Fiction,...|               en|    0|  3374.453|       6.839|      1097|
|1001311|         Under Paris|  2024-06-05|[Thriller, Horror...|               fr|    0|  2358.065|         5.9|       560|
| 150540|          Inside Out|  2015-06-17|[Animation, Famil...|               en|    0|  2556.501|       7.917|     20616|
| 573435|Bad Boys: Ride or...|  2024-06-05|[Action, Crime, T...|               en|    0|  2327.853|         7.1|       319|
| 823464