In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [4]:
#I have to hard code the url for imdb's specific advanced search fields. This function will return the search results
#url for a page in the pagination of the search specified by count

PER_PAGE = 250

def get_search_url(count):
    date = '2010-01-01,2021-12-31'
    countries = 'us'
    runtime = '60'
    start = (count * PER_PAGE) + 1
    search_string = f'https://www.imdb.com/search/title/?title_type=feature&release_date={date}&countries={countries}&runtime={runtime},&count={PER_PAGE}&start={start}&ref=adv_nxt'
    
    return requests.get(search_string)

In [6]:
#get first 5000 in search

MAX_RESULTS = 5000
num_of_pages = MAX_RESULTS // PER_PAGE

results = []

for i in range(num_of_pages):
    r = get_search_url(i)
    soup = BeautifulSoup(r.text, 'html.parser')
    
    all_titles = soup.find_all('div', class_='lister-item-content')
    
    for title in all_titles:
        result = {
            'id' : title.a['href'][-10:-1],
            'url' : title.a['href'],
            'title' : title.a.text,
            'year' : title.find('span', class_="lister-item-year").text if title.find('span', class_="lister-item-year") != None else None,
            'genre' : title.find('span', class_='genre').text.strip().strip('\n').split(',') if title.find('span', class_='genre') != None else None,
            'certificate' : title.find('span', class_='certificate').text if title.find('span', class_='certificate') != None else None,
            'runtime' : title.find('span', class_='runtime').text if title.find('span', class_='runtime') != None else None,
            'imdb_rating' : title.find('div', class_='ratings-imdb-rating').find('strong').text if title.find('div', class_='ratings-imdb-rating') != None else None,
            'metascore' : title.find('span', class_="metascore mixed").text.strip() if title.find('span', class_="metascore mixed") != None else None,
        }
        
        results.append(result)

    time.sleep(2)
        
    print(f'collected {i+1} / {num_of_pages} pages', end="\r")



collected 20 / 20 pages

In [131]:
#Some information isn't available from the details in the imdb search page. We need to go to each page
#to get budget and box office info, as well as cast and director

for i, result in enumerate(results):
    r = requests.get('https://imdb.com' + result['url'])
    soup = BeautifulSoup(r.text, 'html.parser')
    
    time.sleep(1)
    
    metadata = soup.find('script')
    print(metadata.json())
    try:
        meta_json = metadata.json()
        meta_keys = meta_json.keys()
        result['principals'] = [{'name':x['name'], 'id':x['url'][-10:-1]} for x in meta_json['actor']] if 'actor' in meta_keys else None
        result['director'] = [{'name':x['name'], 'id':x['url'][-10:-1]} for x in meta_json['director']] if 'director' in meta_keys else None
        result['creator'] = [{'id':x['url'][-10:-1]} for x in meta_json['creator']] if 'creator' in meta_keys else None
    except:
        pass
    

    result['budget'] = get_data_from_dataid('li[data-testid="title-boxoffice-budget"]', 'span', soup, isNumber = True)
    result['domestic_box_office'] = get_data_from_dataid('li[data-testid="title-boxoffice-grossdomestic"]', 'span', soup, isNumber = True)
    result['worldwide_box_office'] = get_data_from_dataid('li[data-testid="title-boxoffice-cumulativeworldwidegross"]', 'span', soup, isNumber = True)
    result['origin'] = get_data_from_dataid('li[data-testid="title-details-origin"]', 'a', soup)
    
    
    print(f'getting more info for {i} / {len(results)} results', end="\r")

AttributeError: 'NoneType' object has no attribute 'json'

In [133]:
#test connection

r = requests.get('https://imdb.com' + results[0]['url'])
print(r)

In [7]:
imdb_df = pd.DataFrame(results)

In [8]:
imdb_df.head()

Unnamed: 0,id,url,title,year,genre,certificate,runtime,imdb_rating,metascore
0,tt1477834,/title/tt1477834/,Aquaman,(2018),"[Action, Adventure, Fantasy]",PG-13,143 min,6.8,55.0
1,tt1879016,/title/tt1879016/,Operation Mincemeat,(2021),"[Drama, War]",PG-13,128 min,6.7,
2,tt4513678,/title/tt4513678/,Ghostbusters: Afterlife,(2021),"[Adventure, Comedy, Fantasy]",PG-13,124 min,7.1,45.0
3,t10954652,/title/tt10954652/,Old,(2021),"[Drama, Horror, Mystery]",PG-13,108 min,5.8,55.0
4,t10872600,/title/tt10872600/,Spider-Man: No Way Home,(2021),"[Action, Adventure, Fantasy]",PG-13,148 min,8.3,


In [9]:
#clean numbers

def string_to_int(string):
    return int(''.join([x for x in string if x.isnumeric()])) if string != None else None

def string_to_float(string):
    return float(''.join([x for x in string if x.isnumeric()])) if string != None else None

imdb_df.year = imdb_df.year.map(string_to_int)
imdb_df.runtime = imdb_df.runtime.map(string_to_int)
imdb_df.imdb_rating = imdb_df.imdb_rating.map(string_to_float)
imdb_df.metascore = imdb_df.metascore.map(string_to_int)

In [10]:
imdb_df.head()

Unnamed: 0,id,url,title,year,genre,certificate,runtime,imdb_rating,metascore
0,tt1477834,/title/tt1477834/,Aquaman,2018,"[Action, Adventure, Fantasy]",PG-13,143,68.0,55.0
1,tt1879016,/title/tt1879016/,Operation Mincemeat,2021,"[Drama, War]",PG-13,128,67.0,
2,tt4513678,/title/tt4513678/,Ghostbusters: Afterlife,2021,"[Adventure, Comedy, Fantasy]",PG-13,124,71.0,45.0
3,t10954652,/title/tt10954652/,Old,2021,"[Drama, Horror, Mystery]",PG-13,108,58.0,55.0
4,t10872600,/title/tt10872600/,Spider-Man: No Way Home,2021,"[Action, Adventure, Fantasy]",PG-13,148,83.0,


In [11]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           5000 non-null   object 
 1   url          5000 non-null   object 
 2   title        5000 non-null   object 
 3   year         5000 non-null   int64  
 4   genre        5000 non-null   object 
 5   certificate  4508 non-null   object 
 6   runtime      5000 non-null   int64  
 7   imdb_rating  4999 non-null   float64
 8   metascore    1560 non-null   float64
dtypes: float64(2), int64(2), object(5)
memory usage: 351.7+ KB


In [12]:
imdb_df.to_csv('imdb_scrape.csv')