### 1. Installing all the necessary libraries

In [1]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install cinemagoer

Note: you may need to restart the kernel to use updated packages.


In [3]:
import re # To work with strings
from io import StringIO # To hand HTML text to pandas.read_html as a file-like object

import imdb # To extract data about films
import pandas as pd   # To create a DataFrame
import requests       # To send requests to the URLs
from bs4 import BeautifulSoup # To get the content in the form of HTML

### 2. Choosing a URL, scraping data using the Beautiful Soup library, creating a preliminary DataFrame

In [4]:
# Assigning an URL to a variable
url = 'https://pro.imdb.com/name/nm0000147/credits'

# Sending an HTTP request.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops right here on a 4xx/5xx answer instead of letting an
# error page be parsed by the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parsing the downloaded HTML
soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
# Storing the data we are curious about in the variable
movie_data = soup.find('table', id="past_film_filmography")
if movie_data is None:
    # find() returns None when the tag is absent; failing here with a clear message
    # is easier to diagnose than an AttributeError on the .select() calls below.
    raise ValueError("Table 'past_film_filmography' was not found in the downloaded page")

# Storing IMDb ids of films in the variable
ids = [tag['data-filter-item-id'] for tag in movie_data.select('tr[data-filter-item-id]')]

# Storing a role kind (actor/thanks/actor soundtrack) in the variable
role = [tag['data-filterable-name'] for tag in movie_data.select('tr[data-filterable-name]')]

# Creating a preliminary DataFrame.
# `table` is the same element as `movie_data`, so it is reused instead of searched for again.
table = movie_data

# The HTML text is wrapped in StringIO because recent pandas versions deprecate
# passing literal HTML strings to read_html.
df = pd.read_html(StringIO(str(table)))[0]

In [6]:
# Lower-casing every column label of the DataFrame
df.columns = [column_name.lower() for column_name in df.columns]

In [7]:
df.head()

Unnamed: 0,past film & video (80 titles),budget,opening weekend,gross (us & canada),gross (worldwide)
0,Squaring the Circle (The Story of Hipgnosis) (...,,,,
1,Empire of Light (2022) Donald Ellis Donald El...,,$163K,$1.2MM,$4.4MM
2,Operation Mincemeat (2021) Ewen Montagu Ewen ...,,,,$16MM
3,Mothering Sunday (2021) Mr Godfrey Niven Mr G...,,$9K,$275K,$2.3MM
4,"Supernova (2020) Sam, Performer (""Salut d'Amou...",,$90K,$234K,$3.4MM


In [8]:
# If we check the IDs' list we'll see that every ID in the list has 2 extra characters 'tt' at the beginning. 
ids[0]

'tt10850264'

In [9]:
# Building the clean list of IDs: every raw ID loses its first two characters ('tt').
film_id = [raw_id[2:] for raw_id in ids]

film_id[0]

'10850264'

In [10]:
# Adding the IMDb film ids and the role kinds as two new columns of the DataFrame
df = df.assign(film_id=film_id, actor_role=role)

df.head()

Unnamed: 0,past film & video (80 titles),budget,opening weekend,gross (us & canada),gross (worldwide),film_id,actor_role
0,Squaring the Circle (The Story of Hipgnosis) (...,,,,,10850264,producer
1,Empire of Light (2022) Donald Ellis Donald El...,,$163K,$1.2MM,$4.4MM,14402146,actor
2,Operation Mincemeat (2021) Ewen Montagu Ewen ...,,,,$16MM,1879016,actor
3,Mothering Sunday (2021) Mr Godfrey Niven Mr G...,,$9K,$275K,$2.3MM,12229370,actor
4,"Supernova (2020) Sam, Performer (""Salut d'Amou...",,$90K,$234K,$3.4MM,11169050,actor soundtrack


### 3. Extracting data from IMDb website using Cinemagoer library

##### Note

At first I tried to extract all the data about all the films at once, but the Cinemagoer library raises an error if you try to read data that doesn't exist for a film (for example, not all films have rating information). So, for every type of information we want to fetch, we need to build a list of the IDs of the films that actually have it.

In [11]:
# Creating the Cinemagoer access object; the cells below call its get_movie() method with a film ID.
ia = imdb.Cinemagoer()

In [12]:
# Let's check what kind of information we could extract about films using the first film in the IDs' list.
ia.get_movie(film_id[0]).infoset2keys

{'main': ['localized title',
  'cast',
  'genres',
  'runtimes',
  'countries',
  'country codes',
  'language codes',
  'color info',
  'rating',
  'votes',
  'cover url',
  'imdbID',
  'plot outline',
  'languages',
  'title',
  'year',
  'kind',
  'original title',
  'director',
  'writer',
  'producer',
  'cinematographer',
  'editor',
  'editorial department',
  'art department',
  'sound crew',
  'camera and electrical department',
  'music department',
  'miscellaneous crew',
  'akas',
  'production companies',
  'other companies'],
 'plot': []}

In [13]:
# Using the 'infoset2keys' attribute to get to know what kind of data we can extract about every film.
# For each film it is a dict that maps an info-set name (e.g. 'main', 'plot') to the list of keys it offers.
file = [ia.get_movie(i).infoset2keys for i in film_id]

# Flattening every film's key lists into one set, so that the checks below are exact
# key lookups. Searching inside the text of str(dict.values()) would also answer True
# for 'title' when a film only offers 'localized title' or 'original title'.
available_keys = [
    {key for keys in infoset.values() for key in keys}
    for infoset in file
]

# One True/False flag per film and per kind of data, in the same order as film_id
title_ids = ['title' in keys for keys in available_keys]
year_ids = ['year' in keys for keys in available_keys]
genres_ids = ['genres' in keys for keys in available_keys]
runtimes_ids = ['runtimes' in keys for keys in available_keys]
cast_ids = ['cast' in keys for keys in available_keys]
director_ids = ['director' in keys for keys in available_keys]
rating_ids = ['rating' in keys for keys in available_keys]
production_companies_ids = ['production companies' in keys for keys in available_keys]

In [14]:
# Gathering the film IDs and the availability flags; each key turns into a column name of the DataFrame
dictionary = {
    'film_id': film_id,
    'title': title_ids,
    'year': year_ids,
    'genres': genres_ids,
    'runtimes': runtimes_ids,
    'rating': rating_ids,
    'cast': cast_ids,
    'director': director_ids,
    'production companies': production_companies_ids,
}

# Building the DataFrame from the dictionary
df_file = pd.DataFrame(data=dictionary)
df_file.head()

Unnamed: 0,film_id,title,year,genres,runtimes,rating,cast,director,production companies
0,10850264,True,True,True,True,True,True,True,True
1,14402146,True,True,True,True,True,True,True,True
2,1879016,True,True,True,True,True,True,True,True
3,12229370,True,True,True,True,True,True,True,True
4,11169050,True,True,True,True,True,True,True,True


##### Lists of films' IDs for different kinds of data

In [15]:
# For each kind of data: the IDs of the films whose flag in df_file is True
title_list = df_file.loc[df_file['title'].eq(True), 'film_id'].to_list()
year_list = df_file.loc[df_file['year'].eq(True), 'film_id'].to_list()
genres_list = df_file.loc[df_file['genres'].eq(True), 'film_id'].to_list()
runtimes_list = df_file.loc[df_file['runtimes'].eq(True), 'film_id'].to_list()
cast_list = df_file.loc[df_file['cast'].eq(True), 'film_id'].to_list()
director_list = df_file.loc[df_file['director'].eq(True), 'film_id'].to_list()
rating_list = df_file.loc[df_file['rating'].eq(True), 'film_id'].to_list()
production_list = df_file.loc[df_file['production companies'].eq(True), 'film_id'].to_list()

##### Title

In [16]:
# Downloading every film from title_list and reading its title
title = [ia.get_movie(movie_id)['title'] for movie_id in title_list]

# Pairing each film ID with its title in a DataFrame
title_df = pd.DataFrame({'film_id': title_list, 'title': title})

##### Year

In [29]:
# Downloading every film from year_list and reading its year
year = [ia.get_movie(movie_id)['year'] for movie_id in year_list]

# Pairing each film ID with its year in a DataFrame
year_df = pd.DataFrame({'film_id': year_list, 'year': year})

##### Genres

In [17]:
# Downloading every film from genres_list and reading its list of genres
genres = [ia.get_movie(movie_id)['genres'] for movie_id in genres_list]

# One row per film and one column per genre position; the first six positions
# get the names genre1 ... genre6, and the film IDs go into their own column.
genre_column_names = {0: 'genre1', 1: 'genre2', 2: 'genre3', 3: 'genre4', 4: 'genre5', 5: 'genre6'}
genres_df = (
    pd.DataFrame(genres)
    .assign(film_id=genres_list)
    .rename(columns=genre_column_names)
)

##### Runtimes

In [18]:
# Fetching data about runtimes (one list of runtime values per film)
runtimes = [ia.get_movie(i)['runtimes'] for i in runtimes_list]

# Cleaning data: turning every list into plain text.
# Joining the values gives the same text as before for a single runtime ('102'),
# and a readable '102, 95' when a film has several. Stripping brackets and quotes
# from str(list) only cleans the two ends and leaves "102', '95" in that case.
runtimes_clean = [
    ', '.join(str(value) for value in film_runtimes)
    for film_runtimes in runtimes
]

# Creating a Dataframe
runtimes_df = pd.DataFrame({'film_id': runtimes_list, 'runtimes': runtimes_clean})

##### Directors

In [21]:
# Fetching data about directors (one list of person objects per film)
director = [ia.get_movie(i)['director'] for i in director_list]

# Cleaning data: reading the names directly from the objects (first three directors per film).
# Splitting the text of str(list) on ',' / ':_' / '_>' / 'name' shifts the pieces whenever
# a name itself contains a comma or the letters 'name'.
# 'long imdb name' is requested because it should be the same text that the objects print
# between 'name:_' and '_>' - NOTE(review): confirm against the installed Cinemagoer
# version; use 'name' instead if the plain name is preferred.
director_names = [
    [person['long imdb name'] for person in people[:3]]
    for people in director
]

# Creating a Dataframe.
# reindex() guarantees that all three columns exist even when no film has three
# directors; selecting fixed column positions raised a KeyError in that case.
director_df = (
    pd.DataFrame(director_names)
    .reindex(columns=range(3))
    .rename(columns={0: 'dir1', 1: 'dir2', 2: 'dir3'})
)
director_df.insert(0, 'film_id', director_list)

##### Cast

In [22]:
# Fetching data about cast (one list of person objects per film)
cast = [ia.get_movie(i)['cast'] for i in cast_list]

# Cleaning data: reading the names directly from the objects (first 20 cast members per film).
# Splitting the text of str(list) on ',' / ':_' / '_>' / 'name' shifts the pieces whenever
# a name itself contains a comma or the letters 'name'.
# 'long imdb name' is requested because it should be the same text that the objects print
# between 'name:_' and '_>' - NOTE(review): confirm against the installed Cinemagoer
# version; use 'name' instead if the plain name is preferred.
cast_names = [
    [person['long imdb name'] for person in people[:20]]
    for people in cast
]

# Creating a Dataframe.
# reindex() guarantees that all 20 columns exist even when every film has a shorter
# cast; selecting fixed column positions raised a KeyError in that case.
cast_df = (
    pd.DataFrame(cast_names)
    .reindex(columns=range(20))
    .rename(columns={position: f'act{position + 1}' for position in range(20)})
)
cast_df['film_id'] = cast_list

##### Rating

In [23]:
# Downloading every film from rating_list and reading its rating
rating = [ia.get_movie(movie_id)['rating'] for movie_id in rating_list]

# Pairing each film ID with its rating in a DataFrame
rating_df = pd.DataFrame({'film_id': rating_list, 'rating': rating})

##### Production companies

In [24]:
# Fetching data about production companies (one list of company objects per film)
production_companies = [ia.get_movie(i)['production companies'] for i in production_list]

# Cleaning data: reading the names directly from the objects (first 11 companies per film).
# Splitting the text of str(list) on ',' / ':_' / '_>' / 'name' shifts the pieces whenever
# a company name itself contains a comma or the letters 'name'.
# 'long imdb name' is requested because it should be the same text that the objects print
# between 'name:_' and '_>' - NOTE(review): confirm against the installed Cinemagoer
# version; use 'name' instead if the plain name is preferred.
production_names = [
    [company['long imdb name'] for company in companies[:11]]
    for companies in production_companies
]

# Creating a Dataframe.
# reindex() guarantees that all 11 columns exist even when every film has fewer
# companies; selecting fixed column positions raised a KeyError in that case.
production_df = (
    pd.DataFrame(production_names)
    .reindex(columns=range(11))
    .rename(columns={position: f'pc{position + 1}' for position in range(11)})
)
production_df['film_id'] = production_list

### 4. Building a DataFrame

In [25]:
# Let's have a look at our preliminary DataFrame
df.head()

Unnamed: 0,past film & video (80 titles),budget,opening weekend,gross (us & canada),gross (worldwide),film_id,actor_role
0,Squaring the Circle (The Story of Hipgnosis) (...,,,,,10850264,producer
1,Empire of Light (2022) Donald Ellis Donald El...,,$163K,$1.2MM,$4.4MM,14402146,actor
2,Operation Mincemeat (2021) Ewen Montagu Ewen ...,,,,$16MM,1879016,actor
3,Mothering Sunday (2021) Mr Godfrey Niven Mr G...,,$9K,$275K,$2.3MM,12229370,actor
4,"Supernova (2020) Sam, Performer (""Salut d'Amou...",,$90K,$234K,$3.4MM,11169050,actor soundtrack


In [26]:
df.columns

Index(['past film & video  (80 titles)', 'budget', 'opening weekend',
       'gross (us & canada)', 'gross (worldwide)', 'film_id', 'actor_role'],
      dtype='object')

In [27]:
# Deleting the free-text column 'past film & video (N titles)'.
# The column is picked by the start of its name instead of by position, which keeps
# the cell safe to re-run: dropping df.columns[0] removes one more column on every
# extra run. If the column is already gone, nothing is dropped.
title_columns = [
    column_name for column_name in df.columns
    if str(column_name).lower().startswith('past film & video')
]
df = df.drop(columns=title_columns)

In [32]:
# Enriching our preliminary DataFrame with columns.
# Every per-topic DataFrame is left-joined on 'film_id', in the order listed below,
# so all rows of the preliminary DataFrame are kept even when a film has no data
# for a topic. One loop replaces the chain of numbered intermediates (df1 ... df8).
frames_to_merge = (
    title_df,
    year_df,
    runtimes_df,
    genres_df,
    rating_df,
    director_df,
    production_df,
    cast_df,
)

# Creating a final DataFrame
colin_firth_films_data = df
for extra_df in frames_to_merge:
    colin_firth_films_data = colin_firth_films_data.merge(extra_df, on='film_id', how='left')

In [33]:
# Saving a final DataFrame to csv file
# NOTE(review): the destination is an absolute path on one particular machine;
# adjust it before running the notebook anywhere else.
colin_firth_films_data.to_csv(r'/Users/anastasia/Downloads/colin_firth_films_data.csv')

###### The DataFrame with films is ready! 

### 5. Scraping data and building a dataset about awards

In [35]:
# Again assigning an URL to a variable
url_awards = 'https://www.imdb.com/name/nm0000147/awards/?ref_=nm_awd'

# Sending an HTTP request.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops right here on a 4xx/5xx answer instead of letting an
# error page be parsed by the cells below.
response = requests.get(url_awards, timeout=30)
response.raise_for_status()

# Parsing the downloaded HTML
soup2 = BeautifulSoup(response.content, 'html.parser')

In [36]:
# Again storing the data we are curious about in the variable.
# NOTE(review): the 'hugh_' prefix looks like a leftover from another notebook (the URL
# and the final DataFrame are named after Colin Firth); the name is kept unchanged so
# that anything referring to it keeps working.
hugh_awards_data = soup2.find('div', id="main")
if hugh_awards_data is None:
    # find() returns None when the tag is absent; failing here with a clear message
    # is easier to diagnose than an AttributeError on find_all() below.
    raise ValueError("The <div id='main'> block was not found in the downloaded awards page")

# Creating lists which will be used for our DataFrame

award_year = []
outcome = []
award_name = []
award_film = []
notes = []

rows = hugh_awards_data.find_all('tr')


def _text_or_previous(cell, collected):
    """Return the stripped text of `cell`, or repeat the last collected value.

    A row that has no cell of its own for a column takes over the value of the
    nearest earlier row. Looking only at the immediately preceding row fails when
    several rows in a row lack the cell, and wraps around to the last row of the
    table for the very first row. With nothing collected yet, '' is returned.
    """
    if cell is not None:
        return cell.text.strip()
    return collected[-1] if collected else ''


for row in rows:
    award_year.append(_text_or_previous(row.find(class_="award_year"), award_year))
    outcome.append(_text_or_previous(row.find('b'), outcome))
    award_name.append(_text_or_previous(row.find(class_='award_category'), award_name))

    # The description cell holds the notes and, when present, a link whose text is the film title.
    # A row without that cell gets empty strings, so that all five lists keep the same length.
    description = row.find(class_='award_description')
    film_link = description.find('a') if description is not None else None
    award_film.append(film_link.text.strip() if film_link is not None else '')
    notes.append(description.text.strip() if description is not None else '')


In [38]:
# Assembling the final DataFrame: one column per list collected from the awards table
awards_columns = {
    'award_year': award_year,
    'outcome': outcome,
    'award_name': award_name,
    'award_film': award_film,
    'notes': notes,
}
colin_firth_awards = pd.DataFrame(awards_columns)

In [39]:
colin_firth_awards.head()

Unnamed: 0,award_year,outcome,award_name,award_film,notes
0,2011,Winner,Oscar,The King's Speech,Best Performance by an Actor in a Leading Role...
1,2010,Nominee,Oscar,A Single Man,Best Performance by an Actor in a Leading Role...
2,2022,Nominee,Primetime Emmy,The Staircase,Outstanding Lead Actor in a Limited or Antholo...
3,2001,Nominee,Primetime Emmy,Conspiracy,Outstanding Supporting Actor in a Miniseries o...
4,2011,Winner,BAFTA Film Award,The King's Speech,Best Leading Actor\nThe King's Speech (2010)


In [71]:
# Saving the final DataFrame to csv file
# NOTE(review): the destination is an absolute path on one particular machine;
# adjust it before running the notebook anywhere else.
colin_firth_awards.to_csv(r'/Users/anastasia/Downloads/colin_firth_awards.csv')

###### The DataFrame with awards is ready!