In [39]:
import os
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

import requests
from tqdm.notebook import tqdm

In [33]:
# Notebook configuration.
#
# The API key is read from the TMDB_API_KEY environment variable when it is set.
# NOTE(review): the literal fallback below is a credential committed to source;
# it is kept only so existing runs keep working and should be revoked and
# removed once every environment provides TMDB_API_KEY.
API_KEY = os.environ.get("TMDB_API_KEY", "4f7e050b4d9ee8f88f4b60cc46ec39b3")
HEB = 'he-IL'  # language code sent with the top-rated listing requests
ENG = 'en-US'  # not referenced in the visible cells
EARLIEST_YEAR = 1984  # listing entries released before this year are dropped
MAX_MOVIES = 30000  # the listing download stops once this many movies are kept
MOVIES_PER_PAGE = 20  # not referenced in the visible cells; presumably the API page size - confirm

## Download top-rated movies

In [3]:
# URL template for one page of the top-rated listing; filled in with
# str.format(api_key=..., lang=..., page=...).
top_rated_request = (
    'https://api.themoviedb.org/3/movie/top_rated'
    '?api_key={api_key}&language={lang}&page={page}'
)

In [None]:

# Collect top-rated movies released in or after EARLIEST_YEAR, one listing page
# at a time, until MAX_MOVIES are kept, the listing runs out of results, or the
# same page fails too many times in a row.
REQUEST_TIMEOUT_SECONDS = 30
MAX_CONSECUTIVE_FAILURES = 5


def _released_since(movie, earliest_year):
    """Return True when the entry's release year parses and is >= earliest_year.

    Entries with a missing, empty or malformed ``release_date`` yield False
    instead of raising, so one incomplete record cannot abort the download.
    """
    try:
        year = int((movie.get('release_date') or '').split('-')[0])
    except ValueError:
        return False
    return year >= earliest_year


top_rated_movies = []
with tqdm(total=MAX_MOVIES, desc="Downloading movies") as pbar:
    page = 1
    consecutive_failures = 0
    while len(top_rated_movies) < MAX_MOVIES:
        request_url = top_rated_request.format(api_key=API_KEY, lang=HEB, page=page)
        try:
            response = requests.get(request_url, timeout=REQUEST_TIMEOUT_SECONDS)
            failure = None if response.status_code == 200 else response.status_code
        except requests.RequestException as error:
            failure = type(error).__name__

        if failure is not None:
            print(f"{failure} for {request_url}")
            consecutive_failures += 1
            # Without this cap a permanently failing page (bad key, page past
            # the end of the listing) would be retried forever.
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                print(f"Giving up on page {page} after {consecutive_failures} consecutive failures")
                break
            continue

        consecutive_failures = 0
        json_data = response.json()
        results = json_data.get('results') or []
        if not results:
            # An empty page means there is nothing further to fetch.
            break
        buffered_movies = [m for m in results if _released_since(m, EARLIEST_YEAR)]
        top_rated_movies += buffered_movies
        page += 1
        pbar.update(len(buffered_movies))


## Downloading metadata

In [55]:

def download_movie_data(movie_id, timeout=30):
    """Download the details, credits and keywords documents for one movie.

    Each of the three endpoints is requested independently. A section is stored
    only when its request returns HTTP 200; a non-200 response or a network
    error simply leaves that section out, so one failing request does not
    abort the thread pool that calls this function.

    Args:
        movie_id: movie identifier interpolated into the endpoint URLs.
        timeout: seconds to wait on each request before treating it as failed.

    Returns:
        A ``(movie_id, movie_data)`` tuple, where ``movie_data`` maps the
        section names ``'details'``, ``'credits'`` and ``'keywords'`` to the
        decoded JSON of every request that succeeded.
    """
    details_url = f'https://api.themoviedb.org/3/movie/{movie_id}'
    section_urls = {
        'details': details_url,
        'credits': f'{details_url}/credits',
        'keywords': f'{details_url}/keywords',
    }
    params = {'api_key': API_KEY}

    movie_data = {}
    for section, url in section_urls.items():
        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException:
            # Treated like a non-200 answer: the section is left out.
            continue
        if response.status_code == 200:
            movie_data[section] = response.json()

    return movie_id, movie_data

# Download the metadata of every listed movie on a thread pool and index the
# results by movie id.
movie_ids = [entry['id'] for entry in top_rated_movies]
movie_details_map = defaultdict(dict)

with ThreadPoolExecutor() as executor:
    downloads = executor.map(download_movie_data, movie_ids)
    progress = tqdm(downloads, total=len(movie_ids), desc="Downloading details and cast")
    for movie_id, movie_data in progress:
        movie_details_map[movie_id] = movie_data


Downloading details and cast:   0%|          | 0/8672 [00:00<?, ?it/s]

## Compose the merged JSON records

In [71]:
def extract_keys(entry, keys):
    """Return a new dict with those of ``keys`` that are present in ``entry``.

    Keys missing from ``entry`` are skipped; the result follows the order of
    ``keys``.
    """
    subset = {}
    for key in keys:
        if key in entry:
            subset[key] = entry[key]
    return subset

def extract_list_be_key(entries, key):
    """Return ``entry[key]`` for every entry of ``entries``, in order.

    Raises:
        KeyError: if any entry lacks ``key``.
    """
    values = []
    for entry in entries:
        values.append(entry[key])
    return values

def extract_director(crew):
    """Return the ``name`` of the first crew member whose ``job`` is 'Director'.

    Returns None when no crew member has that job.
    """
    director_names = (member['name'] for member in crew if member['job'] == 'Director')
    return next(director_names, None)

def populate_movie(movie, details, credits, keywords, max_cast=8):
    """Merge details, credits and keywords into a listing entry, in place.

    The entry's own ``title`` and ``overview`` are saved under ``title_he`` and
    ``overview_he`` and then replaced by the values from ``details``. The call
    is safe to repeat on an already populated entry: the saved originals are
    not overwritten and the removed keys are not required to exist.

    Args:
        movie: listing entry to enrich; must contain ``title``, ``overview``
            and ``release_date``. Mutated in place.
        details: dict providing ``genres``, ``title``, ``imdb_id``,
            ``overview``, ``revenue``, ``production_companies`` and ``tagline``.
        credits: dict providing the ``cast`` and ``crew`` lists.
        keywords: list of dicts, each with a ``name``.
        max_cast: number of leading ``cast`` entries to keep.

    Returns:
        None. ``movie`` is modified in place.

    Raises:
        KeyError: if one of the required keys listed above is missing.
    """
    movie['genres'] = extract_list_be_key(details['genres'], 'name')
    # setdefault instead of assignment: on a second run movie['title'] already
    # holds the value taken from `details` and must not replace the saved one.
    movie.setdefault('title_he', movie['title'])
    movie['title'] = details['title']
    movie['year'] = movie['release_date'].split('-')[0]
    movie['imdb_id'] = details['imdb_id']
    movie.setdefault('overview_he', movie['overview'])
    movie['overview'] = details['overview']
    movie['revenue'] = details['revenue']
    movie['production_companies'] = extract_list_be_key(details['production_companies'], 'name')

    cast_fields = ['name', 'character', 'profile_path', 'order']
    movie['cast'] = [extract_keys(p, cast_fields) for p in credits['cast'][:max_cast]]
    movie['director'] = extract_director(credits['crew'])
    movie['tagline'] = details['tagline']

    movie['keywords'] = extract_list_be_key(keywords, 'name')

    # pop with a default so a repeated call does not fail on keys already removed.
    movie.pop('genre_ids', None)
    movie.pop('video', None)


In [72]:
# Merge the downloaded metadata into every listing entry (in place).
# Entries whose download is missing a section are left unchanged and reported,
# instead of stopping the whole cell with a KeyError.
required_sections = ('details', 'credits', 'keywords')
incomplete_movie_ids = []
for movie in top_rated_movies:
    movie_id = movie['id']  # not `id`, which would shadow the builtin
    # .get() so that looking up an unknown id does not add an empty entry
    # to the defaultdict.
    movie_data = movie_details_map.get(movie_id, {})
    if any(section not in movie_data for section in required_sections):
        incomplete_movie_ids.append(movie_id)
        continue
    populate_movie(movie, movie_data['details'], movie_data['credits'], movie_data['keywords']['keywords'])

if incomplete_movie_ids:
    print(f"Skipped {len(incomplete_movie_ids)} movies with incomplete metadata: {incomplete_movie_ids}")

In [74]:
# Spot-check one merged record. NOTE(review): index 285 looks like an arbitrary
# pick and assumes at least 286 movies were downloaded - confirm.
top_rated_movies[285]

{'adult': False,
 'backdrop_path': '/79bJL9ydAMYVltuNTt4VhxORqIz.jpg',
 'id': 329,
 'original_language': 'en',
 'original_title': 'Jurassic Park',
 'overview': 'A wealthy entrepreneur secretly creates a theme park featuring living dinosaurs drawn from prehistoric DNA. Before opening day, he invites a team of experts and his two eager grandchildren to experience the park and help calm anxious investors. However, the park is anything but amusing as the security systems go off-line and the dinosaurs escape.',
 'popularity': 25.38,
 'poster_path': '/ch72OO4bPIu0DOKniPySBlems2y.jpg',
 'release_date': '1993-06-11',
 'title': 'Jurassic Park',
 'vote_average': 7.9,
 'vote_count': 15005,
 'genres': ['Adventure', 'Science Fiction'],
 'title_he': 'פארק היורה',
 'year': '1993',
 'imdb_id': 'tt0107290',
 'overview_he': 'לגו\'ן האמונד, מיליונר מזדקן, יש רעיון מטורף, לפתוח פארק שעשועים שהאטרקציה המרכזית בו היא דינוזאורים. חוקריו של האמונד מצליחים, בעזרת תהליך חדשני, להפיק דנ"א של דינוזאורים שמתו לפני

## Export json and CSV

In [75]:
import json

# Write the records as a list; ensure_ascii=False keeps non-ASCII text as-is.
with open('top-voted-movies.json', 'w', encoding='utf-8') as list_file:
    json.dump(top_rated_movies, list_file, ensure_ascii=False)

# Write the same records again, keyed by each record's 'id'.
with open('movies-map.json', 'w', encoding='utf-8') as map_file:
    movies_map = dict((m['id'], m) for m in top_rated_movies)
    json.dump(movies_map, map_file, ensure_ascii=False)

In [80]:
import pandas as pd

# Columns left out of the CSV export.
csv_excluded_columns = ['title_he', 'overview_he', 'original_title']

df = pd.DataFrame.from_dict(top_rated_movies).drop(columns=csv_excluded_columns)
df.to_csv('top_voted_movies.csv', encoding='utf-8', index=False)