### 1. Installing all the necessary libraries

In [1]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install cinemagoer

Note: you may need to restart the kernel to use updated packages.


In [43]:
import pandas as pd   # To create a DataFrame
import requests       # To send requests to the URLs
from bs4 import BeautifulSoup # To get the content in the form of HTML
import re
import imdb # To extract data about films

### 2. Choosing a URL, scraping data using Beutiful Soup library, creating a preliminary DataFrame

In [6]:
# Assigning an URL to a variable
url = 'https://pro.imdb.com/name/nm0000424/credits'

# Sending a HTTP request
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [7]:
# Storing the data we are curious about in the variable
movie_data = soup.find('table', id="past_film_filmography")

# Storing IMDb ids of films in the variable
ids = [tag['data-filter-item-id'] for tag in movie_data.select('tr[data-filter-item-id]')]

# Storing a role kind (actor/thanks/actor soundtrack) in the variable
role  = [tag['data-filterable-name'] for tag in movie_data.select('tr[data-filterable-name]')]

# Creating a preliminary DataFrame
data = []
table = soup.find('table', id ="past_film_filmography")

rows = table.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])
    
df = pd.DataFrame(data, columns = ['raw_title','budget','opening_weekend','gross_us_and_canada','gross_worldwide'])

In [8]:
df.head()

Unnamed: 0,raw_title,budget,opening_weekend,gross_us_and_canada,gross_worldwide
0,,,,,
1,Operation Fortune: Ruse de guerre (2023)\n\n\n...,$50MM,$19MM,,
2,Glass Onion (2022)\n\n\nPhillip\n\n Phillip\n ...,$40MM,$9.4MM,$13MM,$13MM
3,The Gentlemen (2019)\n\n\nFletcher\n\n Fletche...,$22MM,$11MM,$36MM,$115MM
4,"Paddington 2 (2017)\n\n\nPhoenix Buchanan,\nPe...",$40MM,$11MM,$41MM,$228MM


In [10]:
# If we check the IDs' list we'll see that every ID in the list has 2 extra characters 'tt' at the beginning. 
ids[0]

'tt7985704'

In [11]:
# Creating a new clean IDs list.
film_id = []
for i in ids:
    film_id.append(i[2:])

film_id[0]

'7985704'

In [12]:
# Let's have a look at our preliminary DataFrame. We see that the first row is empty, let's delete it.
df.head()

Unnamed: 0,raw_title,budget,opening_weekend,gross_us_and_canada,gross_worldwide
1,Operation Fortune: Ruse de guerre (2023)\n\n\n...,$50MM,$19MM,,
2,Glass Onion (2022)\n\n\nPhillip\n\n Phillip\n ...,$40MM,$9.4MM,$13MM,$13MM
3,The Gentlemen (2019)\n\n\nFletcher\n\n Fletche...,$22MM,$11MM,$36MM,$115MM
4,"Paddington 2 (2017)\n\n\nPhoenix Buchanan,\nPe...",$40MM,$11MM,$41MM,$228MM
5,Bridget Jones's Baby (2016)\n\n\nSpecial Thank...,$35MM,$8.6MM,$24MM,$212MM


In [14]:
# Deleting an empty row
df = df.drop(index = 0)

# Enriching the DataFrame with IMDb's film ids and roles (kinds)
df['film_id'] = film_id
df['actor_role'] = role

df.head()

Unnamed: 0,raw_title,budget,opening_weekend,gross_us_and_canada,gross_worldwide,film_id,actor_role
1,Operation Fortune: Ruse de guerre (2023)\n\n\n...,$50MM,$19MM,,,7985704,actor
2,Glass Onion (2022)\n\n\nPhillip\n\n Phillip\n ...,$40MM,$9.4MM,$13MM,$13MM,11564570,actor
3,The Gentlemen (2019)\n\n\nFletcher\n\n Fletche...,$22MM,$11MM,$36MM,$115MM,8367814,actor
4,"Paddington 2 (2017)\n\n\nPhoenix Buchanan,\nPe...",$40MM,$11MM,$41MM,$228MM,4468740,actor soundtrack
5,Bridget Jones's Baby (2016)\n\n\nSpecial Thank...,$35MM,$8.6MM,$24MM,$212MM,1473832,thanks


### 3. Extracting data from IMDb website using Cinemagoer library

##### Note

At first I tried to extract all the data about all the films at once, but the Cinemagoer library shows an error if you try to extract the data that doesn't exist (for example, not all films have rating information). So we need to create a ID's list for every information type we want to fetch.

In [29]:
ia = imdb.Cinemagoer()

In [30]:
# Let's check what kind of information we could extract about films using the first film in the IDs' list.
ia.get_movie(film_id[0]).infoset2keys

{'main': ['localized title',
  'cast',
  'genres',
  'runtimes',
  'countries',
  'country codes',
  'language codes',
  'color info',
  'box office',
  'certificates',
  'original air date',
  'rating',
  'votes',
  'cover url',
  'imdbID',
  'videos',
  'languages',
  'title',
  'year',
  'kind',
  'original title',
  'director',
  'writer',
  'producer',
  'composer',
  'cinematographer',
  'editor',
  'editorial department',
  'casting director',
  'production design',
  'art direction',
  'set decoration',
  'costume designer',
  'make up',
  'production manager',
  'assistant director',
  'art department',
  'sound crew',
  'special effects',
  'visual effects',
  'stunt performer',
  'camera and electrical department',
  'casting department',
  'costume department',
  'location management',
  'music department',
  'script department',
  'transportation department',
  'miscellaneous crew',
  'thanks',
  'akas',
  'production companies',
  'distributors',
  'special effects compan

We are interested in:
title
year
genres
runtimes
cast
director
rating
production companies

In [31]:
# Using an 'infoset2keys' attribute to get to know what kind of data we can extract about every film
file = []
for i in film_id:
    file.append(ia.get_movie(i).infoset2keys)

file2 = []
for i in file:
    file2.append(str(i.values()))
    
title_ids = []
for i in file2:
        title_ids.append('title' in i)
    
year_ids = []
for i in file2:
        year_ids.append('year' in i)

genres_ids = []
for i in file2:
    genres_ids.append('genres' in i)

runtimes_ids = []
for i in file2:
    runtimes_ids.append('runtimes' in i)

cast_ids = []
for i in file2:
    cast_ids.append('cast' in i)

director_ids = []
for i in file2:
    director_ids.append('director' in i)

rating_ids = []
for i in file2:
    rating_ids.append('rating' in i)

imdb_ids = []
for i in file2:
    imdb_ids.append('imdbID' in i)
    
production_companies_ids = []
for i in file2:
    production_companies_ids.append('production companies' in i)

In [32]:
# Making a dictionary. The keys will become the column names of a DataFrame
dictionary = {'film_id':film_id, 'title': title_ids, 'year': year_ids, 'genres':genres_ids,'runtimes':runtimes_ids, 'rating':rating_ids,'cast': cast_ids, 'director': director_ids,'imdbID':imdb_ids,'production companies': production_companies_ids}

# Converting dictionary to a DataFrame
df_file = pd.DataFrame(dictionary)
df_file.head()

Unnamed: 0,film_id,title,year,genres,runtimes,rating,cast,director,imdbID,production companies
0,7985704,True,True,True,True,True,True,True,True,True
1,11564570,True,True,True,True,True,True,True,True,True
2,8367814,True,True,True,True,True,True,True,True,True
3,4468740,True,True,True,True,True,True,True,True,True
4,1473832,True,True,True,True,True,True,True,True,True


In [33]:
# Creating a columns list
columns_list = df_file.columns.to_list()
columns_list.remove('film_id')

# Assigning the total number of films to a variable
l = len(film_id)

# Counting True values for every column
for i in columns_list:
    if len(df_file.film_id[df_file[i] == True].to_list()) < l:
        print(i)

rating
production companies


In [34]:
# Creating two lists for columns which have less than 47 values — 1) for rating, 2) for production companies
rating_list = df_file.film_id[df_file.rating == True].to_list()
production_list = df_file.film_id[df_file['production companies'] == True].to_list()

In [35]:
# Extracting data about rating and production companies
rating = []
for i in rating_list:
      rating.append(ia.get_movie(i)['rating'])
rating_df = pd.DataFrame({'film_id':rating_list, 'rating':rating})
        
production_companies = []
for i in production_list:
    production_companies.append(ia.get_movie(i)['production companies'])

In [82]:
# Extracting the rest of the data 
# film_name = []
# year = []
genres = []

for i in film_id:
#     film_name.append(ia.get_movie(i)['title'])
#     year.append(ia.get_movie(i)['year'])
    genres.append(ia.get_movie(i)['genres'])

In [41]:
# Extracting the rest of the data 

runtimes = []
# rating = []
director = []
#production_companies = []
cast = []
for i in film_id:
    runtimes.append(ia.get_movie(i)['runtimes'])
    director.append(ia.get_movie(i)['director'])
    cast.append(ia.get_movie(i)['cast'])
#       rating.append(ia.get_movie(i)['rating'])
#       production_companies.append(ia.get_movie(i)['production companies'])    

In [50]:
# Clearing runtimes data
runtimes2 = []
for i in runtimes:
    runtimes2.append(str(i).strip('[]'))

runtimes3 = []
for i in runtimes2:
    runtimes3.append(i.strip("''"))
    

In [None]:
# # готовим два списка 1) для rating, 2) для production companies, где все значения True
# rating_list = df_file.film_id[df_file.rating == True].to_list()
# production_list = df_file.film_id[df_file['production companies'] == True].to_list()

Multiple values:
genres
director
cast
production_companies

In [44]:
# Cleaning directors data
director2 = []
for i in director:
    director2.append(str(i))

director4 = []
for a in director2:
    director4.append(re.split(r',|:_|_>|name', a))

dir_df = pd.DataFrame(director4)
directors_fin = dir_df[[2,6,10]]
directors_fin = directors_fin.rename(columns = {2:'dir1',6:'dir2',10:'dir3'})
directors_fin['film_id'] = film_id

In [45]:
# Cleaning cast data
cast2 = []
for i in cast:
    cast2.append(str(i))

cast3 = []
for a in cast2:
    cast3.append(re.split(r',|:_|_>|name', a))

cast_df = pd.DataFrame(cast3)
cast_20_actors = cast_df[[2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,66,70,74,78]]
cast_20_actors = cast_20_actors.rename(columns = {2:'act1',6:'act2',10:'act3',14:'act4',18:'act5',22:'act6',26:'act7',30:'act8',34:'act9',38:'act10',42:'act11',46:'act12',50:'act13',54:'act14',58:'act15',62:'act16',66:'act17',70:'act18',74:'act19',78:'act20'})
cast_20_actors['film_id'] = film_id

In [46]:
# Cleaning production companies data
production_companies2 = []
for i in production_companies:
    production_companies2.append(str(i))
    
production_companies3 = []
for a in production_companies2:
    production_companies3.append(re.split(r',|:_|_>|name', a))
    
prod_df = pd.DataFrame(production_companies3)
prod_df['film_id'] = production_list
production_fin = prod_df[['film_id',2,6,10,14,18,22,26,30,34,38,42]]
production_fin = production_fin.rename(columns = {2:'pc1', 6:'pc2', 10:'pc3', 14:'pc4',18:'pc5',22:'pc6',26:'pc7',30:'pc8',34:'pc9',38:'pc10',42:'pc11'})

In [90]:
genres2 = []
for i in genres:
    genres2.append(str(i).strip('[]'))

In [96]:
genres_df = pd.DataFrame(genres2)
genres_df['film_id'] = film_id

In [None]:
# runtimes2 = []
# for i in runtimes:
#     runtimes2.append(str(i).strip('[]'))

In [47]:
# # Cleaning genres data
# genres = pd.DataFrame(genres)
# genres['film_id'] = film_id
# genres = genres.rename(columns = {0: 'genre1',1: 'genre2', 2: 'genre3', 3: 'genre4', 4: 'genre5'})

### 4. Building a DataFrame

In [48]:
# Let's have a look at our preliminary DataFrame
df.head()

Unnamed: 0,raw_title,budget,opening_weekend,gross_us_and_canada,gross_worldwide,film_id,actor_role
1,Operation Fortune: Ruse de guerre (2023)\n\n\n...,$50MM,$19MM,,,7985704,actor
2,Glass Onion (2022)\n\n\nPhillip\n\n Phillip\n ...,$40MM,$9.4MM,$13MM,$13MM,11564570,actor
3,The Gentlemen (2019)\n\n\nFletcher\n\n Fletche...,$22MM,$11MM,$36MM,$115MM,8367814,actor
4,"Paddington 2 (2017)\n\n\nPhoenix Buchanan,\nPe...",$40MM,$11MM,$41MM,$228MM,4468740,actor soundtrack
5,Bridget Jones's Baby (2016)\n\n\nSpecial Thank...,$35MM,$8.6MM,$24MM,$212MM,1473832,thanks


In [68]:
# Deleting the column 'raw_title'
df = df.drop(['raw_title'], axis = 1)

In [97]:
# Enriching our preliminary Data Frame with columns
df['film_title'] = film_name
df['year'] = year
df['runtimes'] = runtimes3

df1 = df.merge(genres_df, on = 'film_id', how = 'left')
df2 = df1.merge(rating_df, on = 'film_id', how = 'left')
df3 = df2.merge(directors_fin, on = 'film_id', how = 'left')
df4 = df3.merge(production_fin, on = 'film_id', how = 'left')

# Creating a final Data Frame
hugh_grant_films_data = df4.merge(cast_20_actors, on = 'film_id', how = 'left')

In [98]:
hugh_grant_films_data.head(3)

Unnamed: 0,budget,opening_weekend,gross_us_and_canada,gross_worldwide,film_id,actor_role,film_title,year,runtimes,0,...,act11,act12,act13,act14,act15,act16,act17,act18,act19,act20
0,$50MM,$19MM,,,7985704,actor,Operation Fortune: Ruse de guerre,2023,114,"'Action', 'Comedy', 'Thriller'",...,Lourdes Faberes,Sam Douglas,Ergun Kuyucu,Oliver Maltman,Matthew Hawksley,Max Beesley,Aksel Ustun,Ozan Ayhan,Antonio Bustorff,Joshua Flickema
1,$40MM,$9.4MM,$13MM,$13MM,11564570,actor,Glass Onion,2022,139,"'Comedy', 'Crime', 'Drama', 'Mystery', 'Thriller'",...,Jackie Hoffman,Dallas Roberts,Ethan Hawke,Hugh Grant,Stephen Sondheim,Natasha Lyonne,Kareem Abdul-Jabbar,Serena Williams,Yo-Yo Ma,Joseph Gordon-Levitt
2,$22MM,$11MM,$36MM,$115MM,8367814,actor,The Gentlemen,2019,113,"'Action', 'Comedy', 'Crime'",...,Simon R. Barker,Eddie Marsan,Jason Wong,John Dagleish,Jordan Long,Lily Frazer,Gershwyn Eustache Jnr,Samuel West,Geraldine Somerville,Eliot Sumner


In [None]:
# Saving a final DataFrame to csv file
hugh_grant_films_data.to_csv(r'/Users/anastasia/Downloads/hugh_grant_films_data.csv')

###### The DataFrame with films is ready! 

### 5. Scraping data and building a dataset about awards

In [75]:
# IMDb has quite a limited data about salary, so we won't include it in a DataFrame
ia.get_person('0000424')['salary history']

['Four Weddings and a Funeral (1994)::£35,000',
 'Notting Hill (1999)::$7,500,000',
 'Mickey Blue Eyes (1999)::$7,500,000',
 'Two Weeks Notice (2002)::$12,500,000']

In [76]:
# Again assigning an URL to a variable
url_awards = 'https://www.imdb.com/name/nm0000424/awards/?ref_=nm_awd'

response = requests.get(url_awards)
soup2 = BeautifulSoup(response.content, 'html.parser')

In [77]:
# Again storing the data we are curious about in the variable
hugh_awards_data = soup2.find('div', id="main")

# Creating lists which will be used for our DataFrame

award_year = []
outcome = []
award_name = []
award_film = []
notes = []

rows = hugh_awards_data.find_all('tr')

for row in enumerate(rows):
    col1 = row[1].find(class_="award_year")
    if col1 is not None:
        award_year.append(col1.text.strip())
    else:
        award_year.append(rows[row[0]-1].find(class_="award_year").text.strip())
        
    col2 = row[1].find('b')
    if col2 is not None:
        outcome.append(col2.text.strip())
    else:
        outcome.append(rows[row[0]-1].find('b').text.strip())
        
    col3 = row[1].find(class_='award_category')
    if col3 is not None:
        award_name.append(col3.text.strip())
    else:
        award_name.append(rows[row[0]-1].find(class_='award_category').text.strip())
   
    col4 = row[1].find(class_ = 'award_description').find('a')
    if col4 is not None:
        award_film.append(col4.text.strip())
    else:
        award_film.append('')
    
    col5 = row[1].find(class_ = 'award_description')
#     col5 = [ele.text.strip() for ele in col5]
    notes.append(col5.text.strip()) # Get rid of empty values


In [78]:
# Creating the final DataFrame
df_hugh_awards = pd.DataFrame({'award_year':award_year,'outcome':outcome,'award_name':award_name, 'award_film':award_film, 'notes':notes})

In [79]:
df_hugh_awards.head()

Unnamed: 0,award_year,outcome,award_name,award_film,notes
0,2021,Nominee,Primetime Emmy,The Undoing,Outstanding Lead Actor in a Limited or Antholo...
1,2019,Nominee,Primetime Emmy,A Very English Scandal,Outstanding Lead Actor in a Limited Series or ...
2,2019,Nominee,BAFTA TV Award,A Very English Scandal,Best Leading Actor\nA Very English Scandal (2018)
3,2018,Nominee,BAFTA Film Award,Paddington 2,Best Supporting Actor\nPaddington 2 (2017)
4,2017,Nominee,BAFTA Film Award,Florence Foster Jenkins,Best Supporting Actor\nFlorence Foster Jenkins...


In [80]:
# Saving the final DataFrame to csv file
df_hugh_awards.to_csv(r'/Users/anastasia/Downloads/hugh_grant_awards8.csv')

###### The DataFrame with awards is ready!