In [30]:
import requests, json, pandas as pd
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession

# Load key/value pairs from a local .env file into the process environment
# so the os.getenv() lookups below can see them.
load_dotenv()

TMDB_API_KEY = os.getenv('TMDB_API_KEY')
# This token used to be read only from the misspelled variable name
# 'TDMB_API_READ_ACCESS_TOKEN'. Prefer the correctly spelled name, but fall
# back to the legacy spelling so existing .env files keep working.
TMDB_API_READ_ACCESS_TOKEN = (
    os.getenv('TMDB_API_READ_ACCESS_TOKEN')
    or os.getenv('TDMB_API_READ_ACCESS_TOKEN')
)
OMDB_API_KEY = os.getenv('OMDB_API_KEY')

In [None]:
# test with TMDB API key
BASE_URL = "https://api.themoviedb.org/3/movie/popular"

def fetch_movies(page=1, timeout=10):
    """Fetch one page of TMDB's popular-movies listing.

    Args:
        page: Page number forwarded as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with status 200.
        For any other status, or when the request itself fails (timeout,
        connection error, ...), the problem is printed and an empty list
        is returned.
    """
    params = {"api_key": TMDB_API_KEY, "language": "en-US", "page": page}
    try:
        # requests waits indefinitely by default; an explicit timeout keeps a
        # stalled connection from hanging the whole notebook.
        response = requests.get(BASE_URL, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Route transport failures through the same "print + []" error path
        # that non-200 answers already use.
        print("Error fetching data:", exc)
        return []

    if response.status_code != 200:
        print("Error fetching data:", response.status_code)
        return []
    return response.json()
    
def get_all_genres(timeout=10):
    """Return the names of all TMDB movie and TV genres, without repeats.

    Both genre endpoints are queried with the module-level ``TMDB_API_KEY``.

    Args:
        timeout: Seconds to wait for each of the two requests.

    Returns:
        A list of genre-name strings, movie genres first, in first-seen
        order. If either request fails or answers with a status other than
        200, the problem is printed and an empty list is returned.
    """
    movies_genres_url = "https://api.themoviedb.org/3/genre/movie/list"
    tv_genres_url = "https://api.themoviedb.org/3/genre/tv/list"
    params = {"api_key": TMDB_API_KEY, "language": "en-US"}

    try:
        movies_response = requests.get(movies_genres_url, params=params, timeout=timeout)
        tv_response = requests.get(tv_genres_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print("Error fetching data:", exc)
        return []

    if movies_response.status_code != 200 or tv_response.status_code != 200:
        print("Error fetching data:", movies_response.status_code, tv_response.status_code)
        return []

    genre_list = movies_response.json()["genres"] + tv_response.json()["genres"]
    # The two endpoints can list the same genre name; dict.fromkeys drops the
    # repeats while preserving the order in which names were first seen.
    return list(dict.fromkeys(entry["name"] for entry in genre_list))

    
def get_recommendations(movie_id, timeout=10):
    """Fetch TMDB's recommendations for one movie.

    Args:
        movie_id: TMDB movie id, interpolated into the endpoint path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with status 200.
        For any other status, or when the request itself fails, the problem
        is printed and an empty list is returned.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/recommendations"
    params = {"api_key": TMDB_API_KEY, "language": "en-US"}
    try:
        # An explicit timeout prevents a stalled connection from blocking forever.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print("Error fetching data:", exc)
        return []

    if response.status_code != 200:
        print("Error fetching data:", response.status_code)
        return []
    return response.json()
    

# Grab the combined genre names and snapshot them to output.json so the
# result can be inspected outside the notebook.
movies = get_all_genres()

with open("output.json", "w") as f:
    f.write(json.dumps(movies, indent=4))


In [31]:
# test OMDB API key
def get_data_from_omdb_with_title(title, timeout=10):
    """Look up a title on OMDb via the ``t`` query parameter.

    Args:
        title: Title to search for. It is passed through ``params`` so that
            spaces and reserved characters such as ``&`` or ``#`` are
            URL-encoded instead of corrupting the query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.
    """
    # NOTE(review): plain HTTP sends the API key unencrypted; consider
    # switching to https if the endpoint supports it.
    params = {"apikey": OMDB_API_KEY, "t": title}
    response = requests.get("http://www.omdbapi.com/", params=params, timeout=timeout)
    return response.json()

def get_data_from_omdb_with_imdb_id(imdb_id, timeout=10):
    """Look up a single title on OMDb via the ``i`` (IMDb id) query parameter.

    Args:
        imdb_id: IMDb identifier, e.g. ``"tt0114103"``. It is passed through
            ``params`` so the value is URL-encoded rather than concatenated
            into the URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.
    """
    # NOTE(review): plain HTTP sends the API key unencrypted; consider
    # switching to https if the endpoint supports it.
    params = {"apikey": OMDB_API_KEY, "i": imdb_id}
    response = requests.get("http://www.omdbapi.com/", params=params, timeout=timeout)
    return response.json()

def get_data_from_omdb_based_on_yearOfRelease(year, timeout=10):
    """Query OMDb with only the ``y`` (year of release) parameter.

    Args:
        year: Release year, as a string or an int. It is passed through
            ``params``, so an int no longer fails on string concatenation.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.
    """
    # NOTE(review): OMDb appears to treat `y` as a filter used together with
    # a title/id/search term; confirm that a year-only query returns data.
    params = {"apikey": OMDB_API_KEY, "y": year}
    response = requests.get("http://www.omdbapi.com/", params=params, timeout=timeout)
    return response.json()

def get_data_from_omdb_based_on_type(type, timeout=10):
    """Query OMDb with only the ``type`` parameter.

    Args:
        type: Value for OMDb's ``type`` query parameter. The name shadows
            the ``type`` builtin inside this function; it is kept so that
            existing keyword callers continue to work.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.
    """
    # NOTE(review): OMDb appears to treat `type` as a filter used together
    # with a title/id/search term; confirm that a type-only query returns data.
    params = {"apikey": OMDB_API_KEY, "type": type}
    response = requests.get("http://www.omdbapi.com/", params=params, timeout=timeout)
    return response.json()


In [21]:
# Smoke-test the id-based OMDb lookup with one known IMDb id and keep the
# payload in output.json for inspection.
response = get_data_from_omdb_with_imdb_id("tt0114103")

with open("output.json", "w") as f:
    f.write(json.dumps(response, indent=4))

In [None]:
# Hit the movie-genre endpoint again, this time authenticating with the
# bearer token from TMDB_API_READ_ACCESS_TOKEN, and show the raw body.
url = "https://api.themoviedb.org/3/genre/movie/list"

headers = dict(
    accept="application/json",
    Authorization="Bearer " + TMDB_API_READ_ACCESS_TOKEN,
)

response = requests.get(url=url, headers=headers)

print(response.text)

{"genres":[{"id":28,"name":"Action"},{"id":12,"name":"Adventure"},{"id":16,"name":"Animation"},{"id":35,"name":"Comedy"},{"id":80,"name":"Crime"},{"id":99,"name":"Documentary"},{"id":18,"name":"Drama"},{"id":10751,"name":"Family"},{"id":14,"name":"Fantasy"},{"id":36,"name":"History"},{"id":27,"name":"Horror"},{"id":10402,"name":"Music"},{"id":9648,"name":"Mystery"},{"id":10749,"name":"Romance"},{"id":878,"name":"Science Fiction"},{"id":10770,"name":"TV Movie"},{"id":53,"name":"Thriller"},{"id":10752,"name":"War"},{"id":37,"name":"Western"}]}


In [5]:
# Persist the parsed body of the current `response` to output.json.
with open("output.json", "w") as f:
    f.write(json.dumps(response.json(), indent=4))

In [None]:
# Fetch the TV-genre list with bearer-token auth and store the parsed body
# in output.json.
url = "https://api.themoviedb.org/3/genre/tv/list?language=en"

headers = dict(
    accept="application/json",
    Authorization="Bearer " + TMDB_API_READ_ACCESS_TOKEN,
)

response = requests.get(url=url, headers=headers)

with open("output.json", "w") as f:
    f.write(json.dumps(response.json(), indent=4))

In [36]:
# Load the two CSV inputs; note that `movies` is rebound here from whatever
# it held before (an earlier cell stores the genre list under this name).
movies = pd.read_csv(filepath_or_buffer="dataset/movies.csv")
links = pd.read_csv(filepath_or_buffer="dataset/links.csv")

In [29]:
# Try the OMDb lookup against the imdbId stored in the first row of `movies`
# and keep the payload in output.json.
first_imdb_id = movies.loc[0, "imdbId"]
response = get_data_from_omdb_with_imdb_id(first_imdb_id)
with open("output.json", "w") as f:
    f.write(json.dumps(response, indent=4))

In [None]:
# One accumulator list per output column. Entry k of every list describes the
# k-th row visited by the loop below, so all lists must grow in lock-step.
rated, avg_rating, rating_count, runtime, release_date, budget, revenue, desc, status, poster, language, country, genre = [], [], [], [], [], [], [], [], [], [], [], [], []
director, writer, actor = [], [], []

# OMDb response key -> list that collects it. `budget` is absent on purpose:
# it is not read from the response (see the loop body).
omdb_field_targets = [
    ("Rated", rated),
    ("imdbRating", avg_rating),
    ("imdbVotes", rating_count),
    ("Runtime", runtime),
    ("Released", release_date),
    ("BoxOffice", revenue),
    ("Plot", desc),
    ("Response", status),
    ("Poster", poster),
    ("Language", language),
    ("Country", country),
    ("Genre", genre),
    ("Director", director),
    ("Writer", writer),
    ("Actors", actor),
]

# NOTE(review): 20112 looks like a hard-coded resume offset from an earlier,
# interrupted run; confirm it matches the number of rows already processed.
for i in range(20112, len(movies)):
    idtf = movies.loc[i, "imdbId"]
    if pd.isna(idtf):
        # Rows without an imdbId used to be skipped without appending, which
        # shifted every later entry away from its row. Record placeholders
        # instead so list positions keep matching the rows visited.
        budget.append(None)
        for key, target in omdb_field_targets:
            target.append(None)
        continue

    response = get_data_from_omdb_with_imdb_id(idtf)
    # The loop stores a constant 0 as the budget for every fetched row.
    budget.append(0)
    for key, target in omdb_field_targets:
        target.append(response[key] if key in response else None)

print(len(rated), len(avg_rating), len(rating_count), len(runtime), len(release_date), len(budget), len(revenue), len(desc), len(status), len(poster), len(language), len(country))

KeyboardInterrupt: 

In [46]:
# Sanity check: the accumulator lists should all report the same length.
print(*(len(column) for column in (
    rated, avg_rating, rating_count, runtime, release_date, budget,
    revenue, desc, status, poster, language, country,
)))

20111 20111 20111 20111 20111 20111 20111 20111 20111 20111 20111 20111


In [None]:
# Target column name in `movies` -> accumulator list that fills it, in the
# order the columns are added.
enriched_columns = {
    "Rated": rated,
    "avg_rating": avg_rating,
    "rating_count": rating_count,
    "runtime": runtime,
    "release_date": release_date,
    "budget": budget,
    "revenue": revenue,
    "desc": desc,
    "status": status,
    "poster": poster,
    "language": language,
    "country": country,
    "genre": genre,
    "director": director,
    "writer": writer,
    "actor": actor,
}

# Pad each list with None up to len(movies), based on its own current length.
# The previous version always appended len(movies) - 20111 entries, which is
# only right when every list holds exactly 20111 items and which over-pads
# when the cell is run a second time. A list that is already long enough is
# left untouched ([None] * non-positive is empty).
for values in enriched_columns.values():
    values.extend([None] * (len(movies) - len(values)))

# Column assignment requires len(values) == len(movies); pandas raises
# ValueError otherwise.
for column, values in enriched_columns.items():
    movies[column] = values

# 87585
# 87461

In [56]:
links = pd.read_csv("dataset/links.csv")
# Count rows whose tmdbId is missing. isna().sum() does this in one vectorized
# pass instead of a Python-level .loc lookup per row; int() keeps cntNA a
# plain Python int as before.
cntNA = int(links["tmdbId"].isna().sum())

print(cntNA)

124
