In [1]:
import requests
import urllib
from bs4 import BeautifulSoup

## helper functions

In [2]:
def get_headers():
    """Build the HTTP headers sent with every page request.

    Returns:
        dict: ``Accept-Language`` set to ``en-US`` and a desktop-browser
        ``User-Agent`` string.
    """
    # Adjacent string literals are concatenated at compile time. Unlike a
    # backslash continuation *inside* a literal, this does not leak the
    # source indentation of the following lines into the header value.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.79')
    headers = {'Accept-Language': 'en-US',
               'User-Agent': user_agent}
    return headers

In [3]:
def get_page_from_imdb(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled
            connection, which would hang the notebook kernel.

    Returns:
        str: The decoded response body (``response.text``).

    Raises:
        AssertionError: If the response status code is not 200.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(url, headers=get_headers(), timeout=timeout)
    # Same check as before, now with a message naming the URL and status.
    assert response.status_code == 200, (
        'GET {} returned HTTP {}'.format(url, response.status_code))
    return response.text

In [4]:
def get_page_soup(url, concat_string='', use_cache=False, force_update=False):
    """Fetch a page and return it parsed as a ``BeautifulSoup`` object.

    Args:
        url: Base address of the page.
        concat_string: Suffix appended verbatim to ``url`` before fetching
            (no slash handling is performed).
        use_cache: When true, raw page text is stored in, and reused from,
            the module-level ``cache`` dict, keyed by the full URL.
        force_update: When true together with ``use_cache``, re-download the
            page even if it is already cached.

    Returns:
        BeautifulSoup: The parsed page.
    """
    url += concat_string
    if use_cache:
        # Create the module-level `cache` dict on first use. An already
        # defined `cache` is reused unchanged; a fresh kernel that never
        # defined it no longer fails with NameError.
        page_cache = globals().setdefault('cache', {})
        if force_update or url not in page_cache:
            page_cache[url] = get_page_from_imdb(url)
        page_html = page_cache[url]
    else:
        page_html = get_page_from_imdb(url)
    # NOTE(review): no parser is named, so bs4 picks the best one installed
    # and emits a warning. The extractors in this notebook index into
    # `.contents` by position, which may depend on the parser chosen --
    # confirm before pinning one.
    page_soup = BeautifulSoup(page_html)
    return page_soup

In [5]:
def get_canonical_url(page_soup):
    """Return the ``href`` of the page's ``<link rel="canonical">`` element.

    Args:
        page_soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The value of the canonical link's ``href`` attribute.
    """
    canonical_link = page_soup.find('link', attrs={'rel': 'canonical'})
    return canonical_link.attrs['href']

## get_actors_by_movie_soup()

In [6]:
def get_actors_by_movie_soup(cast_page_soup, num_of_actors_limit=None):
    """Extract ``(actor name, actor URL)`` pairs from a parsed cast page.

    Args:
        cast_page_soup: Parsed full-credits page of a movie.
        num_of_actors_limit: Maximum number of pairs to return; ``None``
            (the default) returns all of them.

    Returns:
        list[tuple[str, str]]: One pair per ``<tr>`` whose class is ``odd``
        or ``even``, in page order, truncated to the limit.
    """
    cast_rows = cast_page_soup.find_all('tr', attrs={'class': ['odd', 'even']})
    base_url = get_canonical_url(cast_page_soup)

    movie_actors = []
    for cast_row in cast_rows:
        # The row's fourth child holds the actor name; that child's second
        # child carries the link to the actor's page.
        actor_name = cast_row.contents[3].text.strip()
        # Drop the query string so only the bare path is joined to the base.
        profile_path = cast_row.contents[3].contents[1]['href'].split('?')[0]
        movie_actors.append((actor_name, urllib.parse.urljoin(base_url, profile_path)))

    return movie_actors[:num_of_actors_limit]

In [7]:
# movie "Black Widow"
# Use the base title URL: get_page_soup() appends `concat_string` itself, so a
# URL that already ends in 'fullcredits/' would request '.../fullcredits/fullcredits/'.
url = 'https://www.imdb.com/title/tt3480822/'
cast_page_soup = get_page_soup(url, concat_string='fullcredits/')

In [8]:
# No limit: the default num_of_actors_limit=None keeps every parsed cast row.
test_nolimit = get_actors_by_movie_soup(cast_page_soup)
# Show the total count and the entries from index 100 onward.
len(test_nolimit), test_nolimit[100:]

(107,
 [('Iain Tingley', 'https://www.imdb.com/name/nm9974888/'),
  ('Andrew Tull', 'https://www.imdb.com/name/nm12241326/'),
  ('David Turner', 'https://www.imdb.com/name/nm10491024/'),
  ('Kalina Vanska', 'https://www.imdb.com/name/nm2338656/'),
  ('Chad J. Wagner', 'https://www.imdb.com/name/nm8242662/'),
  ('Ian Wilson', 'https://www.imdb.com/name/nm11227852/'),
  ('Daniel Joseph Woolf', 'https://www.imdb.com/name/nm9281009/')])

In [9]:
# Limit of 150: the function slices with [:limit], so a limit larger than the
# list length leaves the list unchanged.
test_150 = get_actors_by_movie_soup(cast_page_soup, num_of_actors_limit=150)
len(test_150), test_150[100:]

(107,
 [('Iain Tingley', 'https://www.imdb.com/name/nm9974888/'),
  ('Andrew Tull', 'https://www.imdb.com/name/nm12241326/'),
  ('David Turner', 'https://www.imdb.com/name/nm10491024/'),
  ('Kalina Vanska', 'https://www.imdb.com/name/nm2338656/'),
  ('Chad J. Wagner', 'https://www.imdb.com/name/nm8242662/'),
  ('Ian Wilson', 'https://www.imdb.com/name/nm11227852/'),
  ('Daniel Joseph Woolf', 'https://www.imdb.com/name/nm9281009/')])

In [10]:
# Limit of 5: only the first five pairs, in page order, are returned.
test_005 = get_actors_by_movie_soup(cast_page_soup, num_of_actors_limit=5)
len(test_005), test_005

(5,
 [('Scarlett Johansson', 'https://www.imdb.com/name/nm0424060/'),
  ('Florence Pugh', 'https://www.imdb.com/name/nm6073955/'),
  ('Rachel Weisz', 'https://www.imdb.com/name/nm0001838/'),
  ('David Harbour', 'https://www.imdb.com/name/nm1092086/'),
  ('Ray Winstone', 'https://www.imdb.com/name/nm0935653/')])

## get_movies_by_actor_soup()

In [11]:
def get_movies_by_actor_soup(actor_page_soup, num_of_movies_limit=None, include_voice=True):
    """Extract ``(movie title, movie URL)`` pairs from a parsed actor page.

    Args:
        actor_page_soup: Parsed actor page containing a filmography section
            headed by a ``div`` with id ``filmo-head-actor`` or
            ``filmo-head-actress``.
        num_of_movies_limit: Maximum number of pairs to return; ``None``
            (the default) returns all of them.
        include_voice: When false, records whose seventh child contains the
            text ``(voice)`` are skipped.

    Returns:
        list[tuple]: The pairs in page order, truncated to the limit. If the
        expected page structure cannot be read, the sentinel list
        ``[(None, None)]`` is returned instead (the limit is not applied).
    """
    actor_section = actor_page_soup.find('div', attrs={'id': ['filmo-head-actor', 'filmo-head-actress']})
    try:
        movie_section = actor_section.find_next_sibling('div', attrs={'class': 'filmo-category-section'})
        movie_records = movie_section.find_all('div', attrs={'class': ['filmo-row odd', 'filmo-row even']})
        canonical_url = get_canonical_url(actor_page_soup)
        actor_movies = [
            (record.contents[3].text,
             urllib.parse.urljoin(
                 canonical_url, record.contents[3].contents[0]['href'].split('?')[0]))
            for record in movie_records
            # Keep only records whose fifth child is blank -- presumably that
            # slot holds a type note such as "(TV Series)"; TODO confirm.
            if record.contents[4].text.strip() == ''
            # Plain membership test; no eval() of a constant expression string.
            and (include_voice or '(voice)' not in record.contents[6])
        ]
    except Exception:
        # Best-effort: any lookup failure on an unexpected page layout yields
        # the sentinel. KeyboardInterrupt/SystemExit are not Exception
        # subclasses, so they propagate and the kernel can still be interrupted.
        return [(None, None)]
    return actor_movies[:num_of_movies_limit]

In [12]:
# actor Dwayne Johnson
# get_page_soup() appends concat_string to url, so the page requested is
# 'https://www.imdb.com/name/nm0425005/fullcredits/'.
url = 'https://www.imdb.com/name/nm0425005/'
actor_page_soup = get_page_soup(url, concat_string='fullcredits/')

In [13]:
# No limit: the default num_of_movies_limit=None keeps every matching record.
# NOTE: rebinds `test_nolimit`, which previously held the cast-list result.
test_nolimit = get_movies_by_actor_soup(actor_page_soup)
# Show the total count and the last five entries.
len(test_nolimit), test_nolimit[-5:]

(46,
 [('Walking Tall', 'https://www.imdb.com/title/tt0351977/'),
  ('The Rundown', 'https://www.imdb.com/title/tt0327850/'),
  ('The Scorpion King', 'https://www.imdb.com/title/tt0277296/'),
  ('Longshot', 'https://www.imdb.com/title/tt0201694/'),
  ('The Mummy Returns', 'https://www.imdb.com/title/tt0209163/')])

In [14]:
# Limit of 100: the function slices with [:limit], so a limit larger than the
# list length leaves the list unchanged.
test_100 = get_movies_by_actor_soup(actor_page_soup, num_of_movies_limit=100)
len(test_100), test_100[-5:]

(46,
 [('Walking Tall', 'https://www.imdb.com/title/tt0351977/'),
  ('The Rundown', 'https://www.imdb.com/title/tt0327850/'),
  ('The Scorpion King', 'https://www.imdb.com/title/tt0277296/'),
  ('Longshot', 'https://www.imdb.com/title/tt0201694/'),
  ('The Mummy Returns', 'https://www.imdb.com/title/tt0209163/')])

In [15]:
# Limit of 5: only the first five pairs, in page order, are returned.
test_005 = get_movies_by_actor_soup(actor_page_soup, num_of_movies_limit=5)
len(test_005), test_005

(5,
 [('Fast X', 'https://www.imdb.com/title/tt5433140/'),
  ('Black Adam', 'https://www.imdb.com/title/tt6443346/'),
  ('DC League of Super-Pets', 'https://www.imdb.com/title/tt8912936/'),
  ('Red Notice', 'https://www.imdb.com/title/tt7991608/'),
  ('Free Guy', 'https://www.imdb.com/title/tt6264654/')])

In [16]:
# actress Scarlett Johansson
# NOTE: rebinds `url` and `actor_page_soup`, replacing the values set in the
# earlier actor cell; the cells below therefore depend on execution order.
url = 'https://www.imdb.com/name/nm0424060/'
actor_page_soup = get_page_soup(url, concat_string='fullcredits/')

In [17]:
# No limit, using the soup that currently sits in `actor_page_soup`.
test_nolimit = get_movies_by_actor_soup(actor_page_soup)
# Show the total count and the last five entries.
len(test_nolimit), test_nolimit[-5:]

(55,
 [('Fall', 'https://www.imdb.com/title/tt0119098/'),
  ('If Lucy Fell', 'https://www.imdb.com/title/tt0116606/'),
  ('Manny & Lo', 'https://www.imdb.com/title/tt0116985/'),
  ('Just Cause', 'https://www.imdb.com/title/tt0113501/'),
  ('North', 'https://www.imdb.com/title/tt0110687/')])

In [18]:
# Limit of 100: a limit above the list length leaves the list unchanged.
test_100 = get_movies_by_actor_soup(actor_page_soup, num_of_movies_limit=100)
len(test_100), test_100[-5:]

(55,
 [('Fall', 'https://www.imdb.com/title/tt0119098/'),
  ('If Lucy Fell', 'https://www.imdb.com/title/tt0116606/'),
  ('Manny & Lo', 'https://www.imdb.com/title/tt0116985/'),
  ('Just Cause', 'https://www.imdb.com/title/tt0113501/'),
  ('North', 'https://www.imdb.com/title/tt0110687/')])

In [19]:
# Limit of 5: only the first five pairs, in page order, are returned.
test_005 = get_movies_by_actor_soup(actor_page_soup, num_of_movies_limit=5)
len(test_005), test_005

(5,
 [('Asteroid City', 'https://www.imdb.com/title/tt14230388/'),
  ('Sing 2', 'https://www.imdb.com/title/tt6467266/'),
  ('Black Widow', 'https://www.imdb.com/title/tt3480822/'),
  ('Jojo Rabbit', 'https://www.imdb.com/title/tt2584384/'),
  ('Marriage Story', 'https://www.imdb.com/title/tt7653254/')])