# Set-up

In [0]:
# load packages
import requests
from bs4 import BeautifulSoup

In [0]:
# Define the URL of the site
base_site = "https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/2/"

In [0]:
# sending a request to the webpage
response = requests.get(base_site)
response.status_code

200

In [0]:
# get the HTML from the webpage
html = response.content

## Choosing a parser

### html.parser

In [0]:
# convert the HTML to a Beautiful Soup object
soup = BeautifulSoup(html, 'html.parser')

In [0]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_HTML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [0]:
# When inspecting the file we see that HTML element is closed at the begining -- it parsed incorrectly!
# Let's check another parser

### lxml

In [0]:
# convert the HTML to a BeatifulSoup object
soup = BeautifulSoup(html, 'lxml')

In [0]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_LXML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [0]:
# By first accounts of inspecting the file everything seems fine

### A word of caution

In [0]:
# Beautiful Soup ranks the lxml parser as the best one.

# If a parser is not explicitly stated in the Beautiful Soup constructor,
# the best one available on the current machine is chosen.

# This means that the same piece of code can give different results on different computers.

## Finding an element containing all the data

In [0]:
# Find all div tags on the webpage containing the information we want to scrape
divs = soup.find_all("div", {"class": "col-sm-18 col-full-xs countdown-item-content"})
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/13_assassins_2011/">13 Assassins</a> <span class="subtle start-year">(2011)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#70</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>98.424% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placeme

# Extracting the title, year and score of each movie

In [0]:
# The title, year and score of each movie are contained in the 'h2' tags

In [0]:
# for instance, let's explore the first div
divs[0].find("h2")

<h2><a href="https://www.rottentomatoes.com/m/13_assassins_2011/">13 Assassins</a> <span class="subtle start-year">(2011)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2>

In [0]:
# Extracting all 'h2' tags
headings = [div.find("h2") for div in divs]
headings

[<h2><a href="https://www.rottentomatoes.com/m/13_assassins_2011/">13 Assassins</a> <span class="subtle start-year">(2011)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/full_contact/">Full Contact</a> <span class="subtle start-year">(1992)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">88%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/indiana_jones_and_the_last_crusade/">Indiana Jones and the Last Crusade</a> <span class="subtle start-year">(1989)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">88%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/kung_fu_hustle/">Kung Fu Hustle</a> <span class="subtle start-year">(2005)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">90%</span></h2>,
 <h2><a href="https://www

In [0]:
# Inspecting the text inside the headings
[heading.text for heading in headings]

['13 Assassins (2011)  95%',
 'Full Contact (1992)  88%',
 'Indiana Jones and the Last Crusade (1989)  88%',
 'Kung Fu Hustle (2005)  90%',
 'A Better Tomorrow (2010)  93%',
 'Iron Man (2008)  94%',
 'The Night Comes For Us (2018)  90%',
 'Logan (2017)  93%',
 'Goldfinger (1964)  97%',
 'Assault on Precinct 13 (1976)  98%',
 'Wonder Woman (2017)  93%',
 'Fist of Fury (Jing wu men) (1972)  92%',
 'Captain America: The Winter Soldier (2014)  90%',
 'Oldboy (2005)  82%',
 'The French Connection (1971)  98%',
 'Furious 7 (2015)  81%',
 'La Femme Nikita (Nikita) (1990)  88%',
 'Supercop (1996)  96%',
 'Dirty Harry (1971)  91%',
 'Live Die Repeat: Edge of Tomorrow (2014)  90%',
 'X2: X-Men United (2003)  85%',
 'The Fugitive (1993)  96%',
 'Black Panther (2018)  97%',
 'Inception (2010)  87%',
 'Braveheart (1995)  77%',
 'Minority Report (2002)  90%',
 'Avengers: Endgame (2019)  94%',
 'Dredd (2012)  79%',
 'The Bourne Identity (2002)  83%',
 'Ip Man (2010)  85%',
 'Face/Off (1997)  92%',
 '

In [0]:
# It does contain the info we want to extract
# However, we need to obtain the title, year and score separately
# Let's inspect one heading to see if there is a way to distinguish between them
headings[0]

<h2><a href="https://www.rottentomatoes.com/m/13_assassins_2011/">13 Assassins</a> <span class="subtle start-year">(2011)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2>

In [0]:
# We notice that:

# The movie title is in the 'a' tag
# The year is in a 'span' with class 'start-year'
# The score is in a 'span' with class 'tMeterScore'

## Title

In [0]:
# Let's check all heading links
[heading.find('a') for heading in headings]

[<a href="https://www.rottentomatoes.com/m/13_assassins_2011/">13 Assassins</a>,
 <a href="https://www.rottentomatoes.com/m/full_contact/">Full Contact</a>,
 <a href="https://www.rottentomatoes.com/m/indiana_jones_and_the_last_crusade/">Indiana Jones and the Last Crusade</a>,
 <a href="https://www.rottentomatoes.com/m/kung_fu_hustle/">Kung Fu Hustle</a>,
 <a href="https://www.rottentomatoes.com/m/better_tomorrow/">A Better Tomorrow</a>,
 <a href="https://www.rottentomatoes.com/m/iron_man/">Iron Man</a>,
 <a href="https://www.rottentomatoes.com/m/the_night_comes_for_us/">The Night Comes For Us</a>,
 <a href="https://www.rottentomatoes.com/m/logan_2017/">Logan</a>,
 <a href="https://www.rottentomatoes.com/m/goldfinger/">Goldfinger</a>,
 <a href="https://www.rottentomatoes.com/m/1001280-assault_on_precinct_13/">Assault on Precinct 13</a>,
 <a href="https://www.rottentomatoes.com/m/wonder_woman_2017/">Wonder Woman</a>,
 <a href="https://www.rottentomatoes.com/m/chinese_connection/">Fist of

In [0]:
# Obtaining the movie titles from the links
movie_names = [heading.find('a').string for heading in headings]
movie_names

['13 Assassins',
 'Full Contact',
 'Indiana Jones and the Last Crusade',
 'Kung Fu Hustle',
 'A Better Tomorrow',
 'Iron Man',
 'The Night Comes For Us',
 'Logan',
 'Goldfinger',
 'Assault on Precinct 13',
 'Wonder Woman',
 'Fist of Fury (Jing wu men)',
 'Captain America: The Winter Soldier',
 'Oldboy',
 'The French Connection',
 'Furious 7',
 'La Femme Nikita (Nikita)',
 'Supercop',
 'Dirty Harry',
 'Live Die Repeat: Edge of Tomorrow',
 'X2: X-Men United',
 'The Fugitive',
 'Black Panther',
 'Inception',
 'Braveheart',
 'Minority Report',
 'Avengers: Endgame',
 'Dredd',
 'The Bourne Identity',
 'Ip Man',
 'Face/Off',
 'To Live and Die in L.A.',
 'The Dark Knight',
 'Mission: Impossible Ghost Protocol',
 'Fast Five',
 'Lethal Weapon',
 'The Rock',
 'RoboCop',
 'John Wick: Chapter 2',
 'Casino Royale',
 'Baby Driver',
 'Fist of Legend (Jing wu ying xiong)',
 'The Killer',
 'The Raid 2',
 'Enter the Dragon',
 'Commando',
 'First Blood',
 'Mission: Impossible Rogue Nation',
 'The Terminat

## Year

In [0]:
# Filtering only the spans containing the year
[heading.find("span", class_ = 'start-year') for heading in headings]

[<span class="subtle start-year">(2011)</span>,
 <span class="subtle start-year">(1992)</span>,
 <span class="subtle start-year">(1989)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(2010)</span>,
 <span class="subtle start-year">(2008)</span>,
 <span class="subtle start-year">(2018)</span>,
 <span class="subtle start-year">(2017)</span>,
 <span class="subtle start-year">(1964)</span>,
 <span class="subtle start-year">(1976)</span>,
 <span class="subtle start-year">(2017)</span>,
 <span class="subtle start-year">(1972)</span>,
 <span class="subtle start-year">(2014)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(1971)</span>,
 <span class="subtle start-year">(2015)</span>,
 <span class="subtle start-year">(1990)</span>,
 <span class="subtle start-year">(1996)</span>,
 <span class="subtle start-year">(1971)</span>,
 <span class="subtle start-year">(2014)</span>,
 <span class="subtle start-year">(2003)<

In [0]:
# Extracting the year string
years = [heading.find("span", class_ = 'start-year').string for heading in headings]
years

['(2011)',
 '(1992)',
 '(1989)',
 '(2005)',
 '(2010)',
 '(2008)',
 '(2018)',
 '(2017)',
 '(1964)',
 '(1976)',
 '(2017)',
 '(1972)',
 '(2014)',
 '(2005)',
 '(1971)',
 '(2015)',
 '(1990)',
 '(1996)',
 '(1971)',
 '(2014)',
 '(2003)',
 '(1993)',
 '(2018)',
 '(2010)',
 '(1995)',
 '(2002)',
 '(2019)',
 '(2012)',
 '(2002)',
 '(2010)',
 '(1997)',
 '(1985)',
 '(2008)',
 '(2011)',
 '(2011)',
 '(1987)',
 '(1996)',
 '(1987)',
 '(2017)',
 '(2006)',
 '(2017)',
 '(1994)',
 '(1989)',
 '(2014)',
 '(1973)',
 '(1985)',
 '(1982)',
 '(2015)',
 '(1984)',
 '(2000)',
 '(2003)',
 '(1994)',
 '(1994)',
 '(1994)',
 '(2014)',
 '(2001)',
 '(1987)',
 '(2007)',
 '(1990)',
 '(1982)',
 '(1995)',
 '(2012)',
 '(2018)',
 '(1981)',
 '(1986)',
 '(1992)',
 '(1999)',
 '(1991)',
 '(1988)',
 '(2015)']

In [0]:
years[0]

'(2011)'

### Removing the brackets

In [0]:
# One way to remove the brackets is to drop the first and last symbol of the string
years[0][1:-1]

'2011'

In [0]:
# However, this will break, if the format of the year is changed

In [0]:
# Alternativelly, we can do it with the help of the strip() method (this is robust)

# It removes leading and trailing symbols from a string
# By default, it removes whitespace, but we can specify other symbols to strip

In [0]:
# Removing '('
years[0].strip('(')

'2011)'

In [0]:
# Removing ')'
years[0].strip(')')

'(2011'

In [0]:
# Combining both
years[0].strip('()')

'2011'

In [0]:
# Updating years with stripped values
years = [year.strip('()') for year in years]
years

['2011',
 '1992',
 '1989',
 '2005',
 '2010',
 '2008',
 '2018',
 '2017',
 '1964',
 '1976',
 '2017',
 '1972',
 '2014',
 '2005',
 '1971',
 '2015',
 '1990',
 '1996',
 '1971',
 '2014',
 '2003',
 '1993',
 '2018',
 '2010',
 '1995',
 '2002',
 '2019',
 '2012',
 '2002',
 '2010',
 '1997',
 '1985',
 '2008',
 '2011',
 '2011',
 '1987',
 '1996',
 '1987',
 '2017',
 '2006',
 '2017',
 '1994',
 '1989',
 '2014',
 '1973',
 '1985',
 '1982',
 '2015',
 '1984',
 '2000',
 '2003',
 '1994',
 '1994',
 '1994',
 '2014',
 '2001',
 '1987',
 '2007',
 '1990',
 '1982',
 '1995',
 '2012',
 '2018',
 '1981',
 '1986',
 '1992',
 '1999',
 '1991',
 '1988',
 '2015']

In [0]:
# Converting all the strings to integers
years = [int(year) for year in years]
years

[2011,
 1992,
 1989,
 2005,
 2010,
 2008,
 2018,
 2017,
 1964,
 1976,
 2017,
 1972,
 2014,
 2005,
 1971,
 2015,
 1990,
 1996,
 1971,
 2014,
 2003,
 1993,
 2018,
 2010,
 1995,
 2002,
 2019,
 2012,
 2002,
 2010,
 1997,
 1985,
 2008,
 2011,
 2011,
 1987,
 1996,
 1987,
 2017,
 2006,
 2017,
 1994,
 1989,
 2014,
 1973,
 1985,
 1982,
 2015,
 1984,
 2000,
 2003,
 1994,
 1994,
 1994,
 2014,
 2001,
 1987,
 2007,
 1990,
 1982,
 1995,
 2012,
 2018,
 1981,
 1986,
 1992,
 1999,
 1991,
 1988,
 2015]

## Score

In [0]:
# HOMEWORK

# Filtering only the spans containing the score
[heading.find("span", class_ = 'tMeterScore') for heading in headings]

[<span class="tMeterScore">95%</span>,
 <span class="tMeterScore">88%</span>,
 <span class="tMeterScore">88%</span>,
 <span class="tMeterScore">90%</span>,
 <span class="tMeterScore">93%</span>,
 <span class="tMeterScore">94%</span>,
 <span class="tMeterScore">90%</span>,
 <span class="tMeterScore">93%</span>,
 <span class="tMeterScore">97%</span>,
 <span class="tMeterScore">98%</span>,
 <span class="tMeterScore">93%</span>,
 <span class="tMeterScore">92%</span>,
 <span class="tMeterScore">90%</span>,
 <span class="tMeterScore">82%</span>,
 <span class="tMeterScore">98%</span>,
 <span class="tMeterScore">81%</span>,
 <span class="tMeterScore">88%</span>,
 <span class="tMeterScore">96%</span>,
 <span class="tMeterScore">91%</span>,
 <span class="tMeterScore">90%</span>,
 <span class="tMeterScore">85%</span>,
 <span class="tMeterScore">96%</span>,
 <span class="tMeterScore">97%</span>,
 <span class="tMeterScore">87%</span>,
 <span class="tMeterScore">77%</span>,
 <span class="tMeterScore

In [0]:
# Extracting the score string
scores = [heading.find("span", class_ = 'tMeterScore').string for heading in headings]
scores

['95%',
 '88%',
 '88%',
 '90%',
 '93%',
 '94%',
 '90%',
 '93%',
 '97%',
 '98%',
 '93%',
 '92%',
 '90%',
 '82%',
 '98%',
 '81%',
 '88%',
 '96%',
 '91%',
 '90%',
 '85%',
 '96%',
 '97%',
 '87%',
 '77%',
 '90%',
 '94%',
 '79%',
 '83%',
 '85%',
 '92%',
 '91%',
 '94%',
 '93%',
 '77%',
 '82%',
 '66%',
 '89%',
 '89%',
 '95%',
 '93%',
 '100%',
 '98%',
 '80%',
 '94%',
 '70%',
 '87%',
 '93%',
 '100%',
 '76%',
 '85%',
 '73%',
 '94%',
 '83%',
 '86%',
 '97%',
 '81%',
 '92%',
 '82%',
 '95%',
 '86%',
 '86%',
 '97%',
 '95%',
 '99%',
 '94%',
 '88%',
 '93%',
 '93%',
 '97%']

In [0]:
# Removing the '%' sign
scores = [s.strip('%') for s in scores]
scores

['95',
 '88',
 '88',
 '90',
 '93',
 '94',
 '90',
 '93',
 '97',
 '98',
 '93',
 '92',
 '90',
 '82',
 '98',
 '81',
 '88',
 '96',
 '91',
 '90',
 '85',
 '96',
 '97',
 '87',
 '77',
 '90',
 '94',
 '79',
 '83',
 '85',
 '92',
 '91',
 '94',
 '93',
 '77',
 '82',
 '66',
 '89',
 '89',
 '95',
 '93',
 '100',
 '98',
 '80',
 '94',
 '70',
 '87',
 '93',
 '100',
 '76',
 '85',
 '73',
 '94',
 '83',
 '86',
 '97',
 '81',
 '92',
 '82',
 '95',
 '86',
 '86',
 '97',
 '95',
 '99',
 '94',
 '88',
 '93',
 '93',
 '97']

In [0]:
# Converting each score to an integer
scores = [int(s) for s in scores]
scores

[95,
 88,
 88,
 90,
 93,
 94,
 90,
 93,
 97,
 98,
 93,
 92,
 90,
 82,
 98,
 81,
 88,
 96,
 91,
 90,
 85,
 96,
 97,
 87,
 77,
 90,
 94,
 79,
 83,
 85,
 92,
 91,
 94,
 93,
 77,
 82,
 66,
 89,
 89,
 95,
 93,
 100,
 98,
 80,
 94,
 70,
 87,
 93,
 100,
 76,
 85,
 73,
 94,
 83,
 86,
 97,
 81,
 92,
 82,
 95,
 86,
 86,
 97,
 95,
 99,
 94,
 88,
 93,
 93,
 97]

# Extracting the rest of the information

In [0]:
# Data left to scrape:
# - The Critics Consensus (inside a div with class 'critics-consensus')
# - Adjusted score (inside a div with class 'countdown-adjusted-score')  --> homework 
# - Synopsis (inside a div with class 'synopsis')  --> homework
# - Cast (inside a div with class 'cast')
# - Director (inside a div with class 'director')

# All of the above are inside the original divs we scraped

## Critics Consensus

In [0]:
# The critics consensus is located inside a 'div' tag with the class 'info critics-consensus'
# This can be found inside the original 'div's we scraped
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/13_assassins_2011/">13 Assassins</a> <span class="subtle start-year">(2011)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#70</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>98.424% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placeme

In [0]:
# Getting the 'div' tags containing the critics consensus
consensus = [div.find("div", {"class": "info critics-consensus"}) for div in divs]
consensus

[<div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> No consensus yet.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Lighter and more comedic than its predecessor, Indiana Jones and the Last Crusade returns the series to the brisk serial adventure of Raiders, while adding a dynamite double act between Harrison Ford and Sean Connery.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Kung Fu Hustle blends special effects, martial arts, and the Looney Toons to hilarious effect.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> No consensus yet.</div>,
 <div class="info critics-consensus"><span 

In [0]:
# Inspecting the text inside these tags
[con.text for con in consensus]

["Critics Consensus: Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache.",
 'Critics Consensus: No consensus yet.',
 'Critics Consensus: Lighter and more comedic than its predecessor, Indiana Jones and the Last Crusade returns the series to the brisk serial adventure of Raiders, while adding a dynamite double act between Harrison Ford and Sean Connery.',
 'Critics Consensus: Kung Fu Hustle blends special effects, martial arts, and the Looney Toons to hilarious effect.',
 'Critics Consensus: No consensus yet.',
 "Critics Consensus: Powered by Robert Downey Jr.'s vibrant charm, Iron Man turbo-charges the superhero genre with a deft intelligence and infectious sense of fun.",
 'Critics Consensus: A bloody thrill ride designed to test the limits of more squeamish viewers, The Night Comes for Us wields a stylishly violent, action-packed punch.',
 'Critics Consensus: Hugh Jackman makes the most of his final outi

In [0]:
# Every consensus starts with the string 'Critics Consensus: '
# There are a couple of ways to remove it from the final text

### Way #1: Text processing

In [0]:
# The simplest (but not necessarily the best) way of achieving it is by taking the substring after the common phrase

In [0]:
# Defining the phrase to be removed (note the space at the end)
common_phrase = 'Critics Consensus: '

In [0]:
# Finding how long is the common phrase
len(common_phrase)

19

In [0]:
consensus[0].text

"Critics Consensus: Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache."

In [0]:
# Taking only the part of the text after the common phrase
consensus[0].text[19:]

"Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache."

In [0]:
# Define a variable to store the length
common_len = len(common_phrase)

In [0]:
# Cleaning the list of the common phrase
consensus_text = [con.text[common_len:] for con in consensus]
consensus_text

["Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache.",
 'No consensus yet.',
 'Lighter and more comedic than its predecessor, Indiana Jones and the Last Crusade returns the series to the brisk serial adventure of Raiders, while adding a dynamite double act between Harrison Ford and Sean Connery.',
 'Kung Fu Hustle blends special effects, martial arts, and the Looney Toons to hilarious effect.',
 'No consensus yet.',
 "Powered by Robert Downey Jr.'s vibrant charm, Iron Man turbo-charges the superhero genre with a deft intelligence and infectious sense of fun.",
 'A bloody thrill ride designed to test the limits of more squeamish viewers, The Night Comes for Us wields a stylishly violent, action-packed punch.',
 'Hugh Jackman makes the most of his final outing as Wolverine with a gritty, nuanced performance in a violent but surprisingly thoughtful superhero action film that defies genre conventions.',
 'Gol

In [0]:
# We can add if-else logic to only truncate the string in case it starts with the common phrase
consensus_text = [con.text[common_len:] if con.text.startswith(common_phrase) else con.text for con in consensus ]
consensus_text

["Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache.",
 'No consensus yet.',
 'Lighter and more comedic than its predecessor, Indiana Jones and the Last Crusade returns the series to the brisk serial adventure of Raiders, while adding a dynamite double act between Harrison Ford and Sean Connery.',
 'Kung Fu Hustle blends special effects, martial arts, and the Looney Toons to hilarious effect.',
 'No consensus yet.',
 "Powered by Robert Downey Jr.'s vibrant charm, Iron Man turbo-charges the superhero genre with a deft intelligence and infectious sense of fun.",
 'A bloody thrill ride designed to test the limits of more squeamish viewers, The Night Comes for Us wields a stylishly violent, action-packed punch.',
 'Hugh Jackman makes the most of his final outing as Wolverine with a gritty, nuanced performance in a violent but surprisingly thoughtful superhero action film that defies genre conventions.',
 'Gol

### Way #2: Inspecting the HTML

In [0]:
consensus[0]

<div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache.</div>

In [0]:
# When inspecting the HTML we see that the common phrase ("Critics Consensus: ")
# is located inside a span element
# The string we want to obtain follows that

In [0]:
# We can use .contents to obtain a list of all children of the tag
consensus[0].contents

[<span class="descriptor">Critics Consensus:</span>,
 " Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache."]

In [0]:
# The second element of that list is the text we want
consensus[0].contents[1]

" Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache."

In [0]:
# We can remove the extra whitespace (space at the beginning) with the .strip() method
consensus[0].contents[1].strip()

"Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache."

In [0]:
# Processing all texts
consensus_text = [con.contents[1].strip() for con in consensus]
consensus_text

["Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache.",
 'No consensus yet.',
 'Lighter and more comedic than its predecessor, Indiana Jones and the Last Crusade returns the series to the brisk serial adventure of Raiders, while adding a dynamite double act between Harrison Ford and Sean Connery.',
 'Kung Fu Hustle blends special effects, martial arts, and the Looney Toons to hilarious effect.',
 'No consensus yet.',
 "Powered by Robert Downey Jr.'s vibrant charm, Iron Man turbo-charges the superhero genre with a deft intelligence and infectious sense of fun.",
 'A bloody thrill ride designed to test the limits of more squeamish viewers, The Night Comes for Us wields a stylishly violent, action-packed punch.',
 'Hugh Jackman makes the most of his final outing as Wolverine with a gritty, nuanced performance in a violent but surprisingly thoughtful superhero action film that defies genre conventions.',
 'Gol

In [0]:
# In my opinion, this method is closer to the BeautifulSoup approach

## Directors

In [0]:
# Extracting all director divs
directors = [div.find("div", class_ = 'director') for div in divs]
directors

[<div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/takashi_miike/">Takashi Miike</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/ringo_lam/">Ringo Lam</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/steven_spielberg/">Steven Spielberg</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/stephen_chow/">Stephen Chow</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/haesung_song/">Hae-sung Song</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> </div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/timo_tjahjanto/">Timo Tjahjanto</a></div>,
 <div class="info director">
 <span class="descrip

In [0]:
# Inspecting a div
directors[0]

<div class="info director">
<span class="descriptor">Directed By:</span> <a class="" href="/celebrity/takashi_miike/">Takashi Miike</a></div>

In [0]:
# The director's name can be found as the string of a link

# Obtaining all director links
[director.find("a") for director in directors]

[<a class="" href="/celebrity/takashi_miike/">Takashi Miike</a>,
 <a class="" href="/celebrity/ringo_lam/">Ringo Lam</a>,
 <a class="" href="/celebrity/steven_spielberg/">Steven Spielberg</a>,
 <a class="" href="/celebrity/stephen_chow/">Stephen Chow</a>,
 <a class="" href="/celebrity/haesung_song/">Hae-sung Song</a>,
 None,
 <a class="" href="/celebrity/timo_tjahjanto/">Timo Tjahjanto</a>,
 <a class="" href="/celebrity/james_mangold/">James Mangold</a>,
 <a class="" href="/celebrity/guy_hamilton/">Guy Hamilton</a>,
 <a class="" href="/celebrity/1040928-john_carpenter/">John Carpenter</a>,
 <a class="" href="/celebrity/patty_jenkins/">Patty Jenkins</a>,
 <a class="" href="/celebrity/lo_wei/">Lo Wei</a>,
 <a class="" href="/celebrity/joe_russo/">Joe Russo</a>,
 <a class="" href="/celebrity/chan_wook_park/">Chan-wook Park</a>,
 <a class="" href="/celebrity/william_friedkin/">William Friedkin</a>,
 <a class="" href="/celebrity/james_wan/">James Wan</a>,
 <a class="" href="/celebrity/luc_b

In [0]:
# Notice that one link is None - the director of Iron Man is missing!

# This means we can't simply use .string,
# because None has no string attribute

In [0]:
# Running the line below will raise an error if uncommented

#[director.find("a").string for director in directors]

In [0]:
# We can use if-else to deal with the None value

final_directors = [None if director.find("a") is None else director.find("a").string for director in directors]
final_directors

['Takashi Miike',
 'Ringo Lam',
 'Steven Spielberg',
 'Stephen Chow',
 'Hae-sung Song',
 None,
 'Timo Tjahjanto',
 'James Mangold',
 'Guy Hamilton',
 'John Carpenter',
 'Patty Jenkins',
 'Lo Wei',
 'Joe Russo',
 'Chan-wook Park',
 'William Friedkin',
 'James Wan',
 'Luc Besson',
 'Stanley Tong',
 'Don Siegel',
 'Doug Liman',
 'Bryan Singer',
 'Andrew Davis',
 'Ryan Coogler',
 'Christopher Nolan',
 'Mel Gibson',
 'Steven Spielberg',
 'Joe Russo',
 'Pete Travis',
 'Doug Liman',
 'Wilson Yip',
 'John Woo',
 'William Friedkin',
 'Christopher Nolan',
 'Brad Bird',
 'Justin Lin',
 'Richard Donner',
 'Michael Bay',
 'Paul Verhoeven',
 'Chad Stahelski',
 'Martin Campbell',
 'Edgar Wright',
 'Gordon Chan',
 'John Woo',
 'Gareth Evans',
 'Robert Clouse',
 'Mark L. Lester',
 'Ted Kotcheff',
 'Christopher McQuarrie',
 'James Cameron',
 'Ridley Scott',
 'Quentin Tarantino',
 'Luc Besson',
 'Jan de Bont',
 'Lau Kar-Leung',
 'David Leitch',
 'Ang Lee',
 'John McTiernan',
 'Paul Greengrass',
 'Paul Ve

## Cast info

In [0]:
cast_info = [div.find("div", class_ = 'cast') for div in divs]
cast_info

[<div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="/celebrity/koji_yakusho/">Koji Yakusho</a>, <a class="" href="/celebrity/1164582-takayuki_yamada/">Takayuki Yamada</a>, <a class="" href="/celebrity/yusuke-iseya/">Yusuke Iseya</a>, <a class="" href="/celebrity/goro_inagaki/">Goro Inagaki</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="/celebrity/chow_yun_fat/">Yun-Fat Chow</a>, <a class="" href="/celebrity/simon_yam_da_wah_/">Simon Yam</a>, <a class="" href="/celebrity/ann_bridgewater/">Ann Bridgewater</a>, <a class="" href="/celebrity/10001156-anthony_wong/">Anthony Wong</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="/celebrity/harrison_ford/">Harrison Ford</a>, <a class="" href="/celebrity/sean_connery/">Sean Connery</a>, <a class="" href="/celebrity/alison_doody/">Alison Doody</a>, <a class="" href="/celebrity/denholm_elliott/">Denholm Elliott</a></d

In [0]:
cast_info[0]

<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="/celebrity/koji_yakusho/">Koji Yakusho</a>, <a class="" href="/celebrity/1164582-takayuki_yamada/">Takayuki Yamada</a>, <a class="" href="/celebrity/yusuke-iseya/">Yusuke Iseya</a>, <a class="" href="/celebrity/goro_inagaki/">Goro Inagaki</a></div>

In [0]:
# Each cast member's name is the string of a link
# There are multiple cast members for a movie

In [0]:
# Let's first practice with a single movie

# Obtain all the links to different cast members
cast_links = cast_info[0].find_all('a')
cast_links

[<a class="" href="/celebrity/koji_yakusho/">Koji Yakusho</a>,
 <a class="" href="/celebrity/1164582-takayuki_yamada/">Takayuki Yamada</a>,
 <a class="" href="/celebrity/yusuke-iseya/">Yusuke Iseya</a>,
 <a class="" href="/celebrity/goro_inagaki/">Goro Inagaki</a>]

In [0]:
# Extract the names from the links
cast_names = [link.string for link in cast_links]
cast_names

['Koji Yakusho', 'Takayuki Yamada', 'Yusuke Iseya', 'Goro Inagaki']

In [0]:
# OPTIONALLY: We can stitch all names together as one string

# This can be done using the join method
# To use join, pick a string to use as a separator (in our case a comma, followed with a space) and
# pass the list of strings you want to merge to the join method

cast = ", ".join(cast_names)
cast

'Koji Yakusho, Takayuki Yamada, Yusuke Iseya, Goro Inagaki'

In [0]:
# Now we need to do the above operations for every movie

# We can either use a for loop (clearer), or
# use a nested list compehension (more concise)

### Using a for loop

In [0]:
# Initialize the list of all cast memners
cast = []

# Just put all previous operations inside a for loop
for c in cast_info:
    cast_links = c.find_all('a')
    cast_names = [link.string for link in cast_links]
    
    cast.append(", ".join(cast_names)) # Joining is optional

cast

['Koji Yakusho, Takayuki Yamada, Yusuke Iseya, Goro Inagaki',
 'Yun-Fat Chow, Simon Yam, Ann Bridgewater, Anthony Wong',
 'Harrison Ford, Sean Connery, Alison Doody, Denholm Elliott',
 'Stephen Chow, Wah Yuen, Shengyi Huang, Kwok-Kwan Chan',
 'Yun-Fat Chow, Leslie Cheung, Lung Ti, Young Pao I',
 'Robert Downey Jr., Terrence Howard, Jeff Bridges, Gwyneth Paltrow',
 'Joe Taslim, Iko Uwais, Julie Estelle, Sunny Pang',
 'Hugh Jackman, Patrick Stewart, Boyd Holbrook, Stephen Merchant',
 'Sean Connery, Gert Fröbe, Honor Blackman, Shirley Eaton',
 'Austin Stoker, Darwin Joston, Gilbert De la Pena, Laurie Zimmer',
 'Gal Gadot, Chris Pine, Connie Nielsen, Robin Wright',
 'Bruce Lee, Miao Ker Hsiu, James Tien, Robert Baker',
 'Chris Evans, Samuel L. Jackson, Scarlett Johansson, Anthony Mackie',
 'Choi Min-sik, Gang Hye-jeong, Yoo Ji-tae, Gang Hye-jung',
 'Gene Hackman, Fernando Rey, Roy Scheider, Marcel Bozzuffi',
 'Vin Diesel, Paul Walker, Jason Statham, Dwayne Johnson',
 'Tcheky Karyo, Anne Pa

### Nested list comprehension

In [0]:
# As you can see this can be done in just one line using nested list comprehension
# However, the code is harded to understand

cast = [", ".join([link.string for link in c.find_all("a")]) for c in cast_info]
cast

['Koji Yakusho, Takayuki Yamada, Yusuke Iseya, Goro Inagaki',
 'Yun-Fat Chow, Simon Yam, Ann Bridgewater, Anthony Wong',
 'Harrison Ford, Sean Connery, Alison Doody, Denholm Elliott',
 'Stephen Chow, Wah Yuen, Shengyi Huang, Kwok-Kwan Chan',
 'Yun-Fat Chow, Leslie Cheung, Lung Ti, Young Pao I',
 'Robert Downey Jr., Terrence Howard, Jeff Bridges, Gwyneth Paltrow',
 'Joe Taslim, Iko Uwais, Julie Estelle, Sunny Pang',
 'Hugh Jackman, Patrick Stewart, Boyd Holbrook, Stephen Merchant',
 'Sean Connery, Gert Fröbe, Honor Blackman, Shirley Eaton',
 'Austin Stoker, Darwin Joston, Gilbert De la Pena, Laurie Zimmer',
 'Gal Gadot, Chris Pine, Connie Nielsen, Robin Wright',
 'Bruce Lee, Miao Ker Hsiu, James Tien, Robert Baker',
 'Chris Evans, Samuel L. Jackson, Scarlett Johansson, Anthony Mackie',
 'Choi Min-sik, Gang Hye-jeong, Yoo Ji-tae, Gang Hye-jung',
 'Gene Hackman, Fernando Rey, Roy Scheider, Marcel Bozzuffi',
 'Vin Diesel, Paul Walker, Jason Statham, Dwayne Johnson',
 'Tcheky Karyo, Anne Pa

## Adjusted score

In [0]:
# Homework

# The adjusted scores can be found in a div with class 'info countdown-adjusted-score'
adj_scores = [div.find("div", {"class": "info countdown-adjusted-score"}) for div in divs]
adj_scores

[<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>98.424% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>,
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>61.566% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>,
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>93.242% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Ad

In [0]:
# Inspecting an element
adj_scores[0]

<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>98.424% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>

In [0]:
# By inspection we see that the string we are looking for is the second child of the 'div' tag
adj_scores[0].contents[1]  # Note the extra whitespace at the end

'98.424% '

In [0]:
# Extracting the string (without '%' sign and extra space)
adj_scores_clean = [score.contents[1].strip('% ') for score in adj_scores]
adj_scores_clean

['98.424',
 '61.566',
 '93.242',
 '96.081',
 '43.856',
 '104.639',
 '90.367',
 '110.355',
 '103.101',
 '102.299',
 '112.287',
 '92.779',
 '101.637',
 '86.566',
 '104.153',
 '91.175',
 '90.562',
 '98.253',
 '95.62',
 '102.453',
 '92.17',
 '101.95',
 '119.101',
 '97.823',
 '81.83',
 '97.304',
 '117.873',
 '84.548',
 '88.278',
 '74.441',
 '97.234',
 '93.011',
 '107.067',
 '100.434',
 '82.844',
 '86.338',
 '69.338',
 '94.309',
 '100.237',
 '103.837',
 '108.929',
 '77.178',
 '100.619',
 '86.264',
 '98.687',
 '72.011',
 '90.925',
 '105.058',
 '104.919',
 '83.141',
 '91.869',
 '76.862',
 '97.862',
 '87.788',
 '94.155',
 '101.879',
 '84.562',
 '100.908',
 '86.768',
 '98.816',
 '91.219',
 '91.589',
 '115.565',
 '101.091',
 '105.846',
 '96.061',
 '94.662',
 '99.241',
 '98.793',
 '113.244']

In [0]:
# Converting the strings to numbers
final_adj = [float(score) for score in adj_scores_clean] # Note that this time the scores are float, not int!
final_adj

[98.424,
 61.566,
 93.242,
 96.081,
 43.856,
 104.639,
 90.367,
 110.355,
 103.101,
 102.299,
 112.287,
 92.779,
 101.637,
 86.566,
 104.153,
 91.175,
 90.562,
 98.253,
 95.62,
 102.453,
 92.17,
 101.95,
 119.101,
 97.823,
 81.83,
 97.304,
 117.873,
 84.548,
 88.278,
 74.441,
 97.234,
 93.011,
 107.067,
 100.434,
 82.844,
 86.338,
 69.338,
 94.309,
 100.237,
 103.837,
 108.929,
 77.178,
 100.619,
 86.264,
 98.687,
 72.011,
 90.925,
 105.058,
 104.919,
 83.141,
 91.869,
 76.862,
 97.862,
 87.788,
 94.155,
 101.879,
 84.562,
 100.908,
 86.768,
 98.816,
 91.219,
 91.589,
 115.565,
 101.091,
 105.846,
 96.061,
 94.662,
 99.241,
 98.793,
 113.244]

## Synopsis

In [0]:
# Homework

# The synopsis is located inside a 'div' tag with the class 'info synopsis'
synopsis = [div.find('div', class_='synopsis') for div in divs]
synopsis

[<div class="info synopsis"><span class="descriptor">Synopsis:</span> Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/13_assassins_2011/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> Another entry into the "cheer for the most likeable bad guy" series of Hong Kong action flicks, Full Contact tells...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/full_contact/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> The third installment in the Spielberg/Lucas Indiana Jones saga, Indiana Jones and the Last Crusade evokes many of the thrills...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/indiana_jones_and_the_last_crusade/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="

In [0]:
# Inspecting the element
synopsis[0]

<div class="info synopsis"><span class="descriptor">Synopsis:</span> Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/13_assassins_2011/" target="_top"> [More]</a></div>

In [0]:
# The text is the second child
synopsis[0].contents[1]

" Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's..."

In [0]:
# Extracting the text
synopsis_text = [syn.contents[1] for syn in synopsis]
synopsis_text

[" Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's...",
 ' Another entry into the "cheer for the most likeable bad guy" series of Hong Kong action flicks, Full Contact tells...',
 ' The third installment in the Spielberg/Lucas Indiana Jones saga, Indiana Jones and the Last Crusade evokes many of the thrills...',
 ' The work of international superstar Stephen Chow, Kung Fu Hustle is a humorous, special-effects-filled, action-packed martial arts epic set in...',
 " John Woo established himself as one of Hong Kong's premiere action directors with this ultra-hip, ultra-violent action classic. The film...",
 ' Billionaire industrialist and genius inventor Tony Stark is kidnapped and forced to build a devastating weapon. Instead, using his intelligence...',
 ' The Night Comes For Us follows Ito (played by Joe Taslim), a former triad enforcer, and his mission to protect...',
 " It's 2029. Mutants are gone--or very nearly 

# Representing the data in structured form

In [0]:
# We will take advantage of pandas and its dataframe for data storage

In [0]:
# load the pandas package
import pandas as pd

## Creating a Data Frame

In [0]:
# A dataframe is a tabular data type, frequently used in data science

movies_info = pd.DataFrame()
movies_info  # The dataframe is still empty, we need to fill it with the info we gathered

## Populating the dataframe

In [0]:
# Populating the dataframe

movies_info["Movie Title"] = movie_names
movies_info["Year"] = years
movies_info["Score"] = scores
movies_info["Adjusted Score"] = final_adj  # Homework
movies_info["Director"] = final_directors
movies_info["Synopsis"] = synopsis_text    # Homework
movies_info["Cast"] = cast
movies_info["Consensus"] = consensus_text

# Let's see how it looks
movies_info

Unnamed: 0,Movie Title,Year,Score,Adjusted Score,Director,Synopsis,Cast,Consensus
0,13 Assassins,2011,95,98.424,Takashi Miike,"Cult director Takeshi Miike (Ichi the Killer,...","Koji Yakusho, Takayuki Yamada, Yusuke Iseya, G...",Takashi Miike's electric remake of Eiichi Kudo...
1,Full Contact,1992,88,61.566,Ringo Lam,"Another entry into the ""cheer for the most li...","Yun-Fat Chow, Simon Yam, Ann Bridgewater, Anth...",No consensus yet.
2,Indiana Jones and the Last Crusade,1989,88,93.242,Steven Spielberg,The third installment in the Spielberg/Lucas ...,"Harrison Ford, Sean Connery, Alison Doody, Den...","Lighter and more comedic than its predecessor,..."
3,Kung Fu Hustle,2005,90,96.081,Stephen Chow,The work of international superstar Stephen C...,"Stephen Chow, Wah Yuen, Shengyi Huang, Kwok-Kw...","Kung Fu Hustle blends special effects, martial..."
4,A Better Tomorrow,2010,93,43.856,Hae-sung Song,John Woo established himself as one of Hong K...,"Yun-Fat Chow, Leslie Cheung, Lung Ti, Young Pao I",No consensus yet.
5,Iron Man,2008,94,104.639,,Billionaire industrialist and genius inventor...,"Robert Downey Jr., Terrence Howard, Jeff Bridg...","Powered by Robert Downey Jr.'s vibrant charm, ..."
6,The Night Comes For Us,2018,90,90.367,Timo Tjahjanto,The Night Comes For Us follows Ito (played by...,"Joe Taslim, Iko Uwais, Julie Estelle, Sunny Pang",A bloody thrill ride designed to test the limi...
7,Logan,2017,93,110.355,James Mangold,It's 2029. Mutants are gone--or very nearly s...,"Hugh Jackman, Patrick Stewart, Boyd Holbrook, ...",Hugh Jackman makes the most of his final outin...
8,Goldfinger,1964,97,103.101,Guy Hamilton,"To many, the quintessential Bond film and a b...","Sean Connery, Gert Fröbe, Honor Blackman, Shir...",Goldfinger is where James Bond as we know him ...
9,Assault on Precinct 13,1976,98,102.299,John Carpenter,"Cops, secretaries, and prisoners stuck in a s...","Austin Stoker, Darwin Joston, Gilbert De la Pe...","Lean, taut and compellingly gritty, John Carpe..."


In [0]:
# By default pandas abbreviates any text beyond a certain length (as seen in the Cast and Consensus columns)

# We can change that by setting the maximum column width to -1,
# which means the column would be as wide as to display the whole text
pd.set_option('display.max_colwidth', -1)
movies_info

Unnamed: 0,Movie Title,Year,Score,Adjusted Score,Director,Synopsis,Cast,Consensus
0,13 Assassins,2011,95,98.424,Takashi Miike,"Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's...","Koji Yakusho, Takayuki Yamada, Yusuke Iseya, Goro Inagaki","Takashi Miike's electric remake of Eiichi Kudo's 1963 period action film is a wild spectacle executed with killer, dizzying panache."
1,Full Contact,1992,88,61.566,Ringo Lam,"Another entry into the ""cheer for the most likeable bad guy"" series of Hong Kong action flicks, Full Contact tells...","Yun-Fat Chow, Simon Yam, Ann Bridgewater, Anthony Wong",No consensus yet.
2,Indiana Jones and the Last Crusade,1989,88,93.242,Steven Spielberg,"The third installment in the Spielberg/Lucas Indiana Jones saga, Indiana Jones and the Last Crusade evokes many of the thrills...","Harrison Ford, Sean Connery, Alison Doody, Denholm Elliott","Lighter and more comedic than its predecessor, Indiana Jones and the Last Crusade returns the series to the brisk serial adventure of Raiders, while adding a dynamite double act between Harrison Ford and Sean Connery."
3,Kung Fu Hustle,2005,90,96.081,Stephen Chow,"The work of international superstar Stephen Chow, Kung Fu Hustle is a humorous, special-effects-filled, action-packed martial arts epic set in...","Stephen Chow, Wah Yuen, Shengyi Huang, Kwok-Kwan Chan","Kung Fu Hustle blends special effects, martial arts, and the Looney Toons to hilarious effect."
4,A Better Tomorrow,2010,93,43.856,Hae-sung Song,"John Woo established himself as one of Hong Kong's premiere action directors with this ultra-hip, ultra-violent action classic. The film...","Yun-Fat Chow, Leslie Cheung, Lung Ti, Young Pao I",No consensus yet.
5,Iron Man,2008,94,104.639,,"Billionaire industrialist and genius inventor Tony Stark is kidnapped and forced to build a devastating weapon. Instead, using his intelligence...","Robert Downey Jr., Terrence Howard, Jeff Bridges, Gwyneth Paltrow","Powered by Robert Downey Jr.'s vibrant charm, Iron Man turbo-charges the superhero genre with a deft intelligence and infectious sense of fun."
6,The Night Comes For Us,2018,90,90.367,Timo Tjahjanto,"The Night Comes For Us follows Ito (played by Joe Taslim), a former triad enforcer, and his mission to protect...","Joe Taslim, Iko Uwais, Julie Estelle, Sunny Pang","A bloody thrill ride designed to test the limits of more squeamish viewers, The Night Comes for Us wields a stylishly violent, action-packed punch."
7,Logan,2017,93,110.355,James Mangold,"It's 2029. Mutants are gone--or very nearly so. An isolated, despondent Logan is drinking his days away in a hideout...","Hugh Jackman, Patrick Stewart, Boyd Holbrook, Stephen Merchant","Hugh Jackman makes the most of his final outing as Wolverine with a gritty, nuanced performance in a violent but surprisingly thoughtful superhero action film that defies genre conventions."
8,Goldfinger,1964,97,103.101,Guy Hamilton,"To many, the quintessential Bond film and a brilliant third entry in the series. Here Bond gets his Aston Martin,...","Sean Connery, Gert Fröbe, Honor Blackman, Shirley Eaton","Goldfinger is where James Bond as we know him comes into focus - it features one of 007's most famous lines (""A martini. Shaken, not stirred"") and a wide range of gadgets that would become the series' trademark."
9,Assault on Precinct 13,1976,98,102.299,John Carpenter,"Cops, secretaries, and prisoners stuck in a soon-to-be-shuttered L.A. police station fight off a horde of murderous gang members in...","Austin Stoker, Darwin Joston, Gilbert De la Pena, Laurie Zimmer","Lean, taut and compellingly gritty, John Carpenter's loose update of Rio Bravo ranks as a cult action classic and one of the filmmaker's best."


## Exporting the data to CSV (comma-separated values) and excel files

In [0]:
# Write data to excel file
movies_info.to_excel("movies_info.xlsx", index = False, header = True)

In [0]:
# or write data to CSV file
movies_info.to_csv("movies_info.csv", index = False, header = True)

In [0]:
# Index is set to False so that the index (0,1,2...) of each movie is not saved to the file (the index is purely internal)
# The header is set to True, so that the names of the columns are saved

## Extracting tables with Beautiful Soup

## Setup

In [0]:
import requests
from bs4 import BeautifulSoup

In [0]:
base_site = "https://en.wikipedia.org/wiki/list_of_national_capitals_by_population"

In [0]:
r = requests.get(base_site)

In [0]:
html = r.content
html

b'\n<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of national capitals by population - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XpT9AwpAMOIAAjB-Eo4AAABD","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_national_capitals_by_population","wgTitle":"List of national capitals by population","wgCurRevisionId":950026590,"wgRevisionId":950026590,"wgArticleId":21224339,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 Indonesian-language sources (id)","CS1 Spanish-language sources (es)","Webarchive templat

In [0]:
soup = BeautifulSoup(html, 'lxml')

In [0]:
soup.find_all('table')

[<table class="navbox vertical-navbox" style="clear:right; float:right; margin:0 0 0.5em 1em; width:16.0em; text-align:left; font-size:95%; line-height:1.1em; border-collapse:collapse;">
 <tbody><tr>
 <th style="padding:4px;"><a href="/wiki/Lists_of_capitals" title="Lists of capitals">Lists of capitals</a>
 </th></tr>
 <tr>
 <td style="padding:4px;"><b>Of countries</b>
 <ul><li><a href="/wiki/List_of_national_capitals" title="List of national capitals">in alphabetical order</a></li>
 <li><a href="/wiki/List_of_national_capitals_by_latitude" title="List of national capitals by latitude">by latitude</a></li>
 <li><a class="mw-selflink selflink">by population</a></li>
 <li><a href="/wiki/List_of_former_national_capitals" title="List of former national capitals">Former</a></li>
 <li><a href="/wiki/List_of_purpose-built_national_capitals" title="List of purpose-built national capitals">Purpose-built</a></li>
 <li><a href="/wiki/List_of_national_capitals_situated_on_an_international_border" 

In [0]:
table = soup.find_all('table')[1]

In [0]:
table.find_all('tr')[0].contents

['\n',
 <th>Rank</th>,
 '\n',
 <th style="width:110pt;">Country/Territory</th>,
 '\n',
 <th>Capital</th>,
 '\n',
 <th>Population</th>,
 '\n',
 <th>Year
 </th>,
 '\n',
 <th>% of<br/>country's<br/>population
 </th>]

In [0]:
# to extract the sixth element  wich is capital
capitals = [row.contents[5].text for row in table.find_all('tr')]

capitals

['Capital',
 'Beijing',
 'Tokyo',
 'Moscow',
 'Kinshasa',
 'Jakarta',
 'Seoul',
 'Cairo',
 'Mexico City',
 'London',
 'Dhaka',
 'Lima',
 'Tehran',
 'Bangkok',
 'Hanoi',
 'Riyadh',
 'Hong Kong',
 'Bogotá',
 'Baghdad',
 'Santiago',
 'Singapore',
 'Ankara',
 'Berlin',
 'Damascus',
 'Algiers',
 'Madrid',
 'Pyongyang',
 'Kabul',
 'Nairobi',
 'Addis Ababa',
 'Buenos Aires',
 'Rome',
 'Kiev',
 'Yaoundé',
 'Taipei',
 'Brasília',
 'Amman',
 'Luanda',
 'Guatemala City',
 'Pretoria',
 'Paris',
 'Tashkent',
 'Baku',
 'Havana',
 'Phnom Penh',
 'Bucharest',
 'Caracas',
 'Brazzaville',
 'Rabat',
 'Manila',
 'Vienna',
 'Khartoum',
 'Budapest',
 'Warsaw',
 'Minsk',
 'Kampala',
 'Accra',
 'Antananarivo',
 'Beirut',
 'Quito (de iure)  Guayaquil (de facto) seat-of-government',
 'Harare',
 'Doha',
 "Sana'a",
 'Conakry',
 'Kuala Lumpur',
 'Montevideo',
 'Lusaka',
 'Bamako',
 'Sofia',
 'Prague',
 'Port-au-Prince',
 'Tripoli',
 'Dublin',
 'Kuwait City',
 'Belgrade',
 'Santo Domingo',
 'Mogadishu',
 'Yerevan',

## Using Pandas to extract tables

In [0]:
import pandas as pd

In [0]:
tables = pd.read_html(base_site)

In [0]:
type(tables[0])

pandas.core.frame.DataFrame

In [0]:
tables

[                                   Lists of capitals
 0  Of countries in alphabetical order by latitude...
 1  Of country subdivisions Capitals outside the t...
 2                                                vte,
      Rank  ... % ofcountry'spopulation
 0       1  ...                    1.5%
 1       2  ...                  11.03%
 2       3  ...                   8.52%
 3       4  ...                   12.9%
 4       5  ...                   3.76%
 ..    ...  ...                     ...
 239   240  ...                     ---
 240   241  ...                   1.87%
 241   242  ...                  22.06%
 242   243  ...                    100%
 243   244  ...                     69%
 
 [244 rows x 6 columns],
   vteWorld's largest cities                        vteWorld's largest cities.1
 0               City proper  Capitals Americas (North Latin Central South) ...
 1         Metropolitan area  Americas (North South West Indies) Europe (Eur...
 2  Urban area/agglomeration  Asia A

In [0]:
tables[1]

Unnamed: 0,Rank,Country/Territory,Capital,Population,Year,% ofcountry'spopulation
0,1,China PR,Beijing,"21,542,000[1]",2010,1.5%
1,2,Japan,Tokyo,"13,929,286[2]",2017,11.03%
2,3,Russia,Moscow,"12,506,468[3]",2011,8.52%
3,4,DR Congo,Kinshasa,"11,855,000[4]",2012,12.9%
4,5,Indonesia,Jakarta,"10,075,310[5]",2011,3.76%
...,...,...,...,...,...,...
239,240,Montserrat (UK),Brades (de facto),391,,---
240,241,Palau,Ngerulmud,391,,1.87%
241,242,Cocos (Keeling) Islands (Australia),West Island,120,,22.06%
242,243,Pitcairn Islands (UK),Adamstown,56,,100%


In [0]:
tables[2]

Unnamed: 0,vteWorld's largest cities,vteWorld's largest cities.1
0,City proper,Capitals Americas (North Latin Central South) ...
1,Metropolitan area,Americas (North South West Indies) Europe (Eur...
2,Urban area/agglomeration,Asia Africa Europe European Union Nordic North...
3,Historical,World Europe
4,Related articles,Arcology Ecumenopolis Global city Megacity Meg...


In [0]:
tables[1].columns

Index(['Rank', 'Country/Territory', 'Capital', 'Population', 'Year',
       '% ofcountry'spopulation'],
      dtype='object')

In [0]:
pd.read_html(base_site, attrs={"class": "navbox"})

[                                   Lists of capitals
 0  Of countries in alphabetical order by latitude...
 1  Of country subdivisions Capitals outside the t...
 2                                                vte]

In [0]:
pd.read_html(html, attrs={"class": "wikitable sortable"})

[     Rank  ... % ofcountry'spopulation
 0       1  ...                    1.5%
 1       2  ...                  11.03%
 2       3  ...                   8.52%
 3       4  ...                   12.9%
 4       5  ...                   3.76%
 ..    ...  ...                     ...
 239   240  ...                     ---
 240   241  ...                   1.87%
 241   242  ...                  22.06%
 242   243  ...                    100%
 243   244  ...                     69%
 
 [244 rows x 6 columns]]

##  Introduction to the requests-html package 

### Exploring the capabilities of requests-html for Web Scraping 

In [0]:
from requests_html import HTMLSession

In [0]:
session = HTMLSession()

In [7]:
r = session.get("https://en.wikipedia.org/wiki/Assocaition_football")
r.status_code

404

In [8]:
r.html

<HTML url='https://en.wikipedia.org/wiki/Assocaition_football'>

##LINKs--we extract all links from the page

In [9]:
urls = r.html.links
urls

{'//en.m.wikipedia.org/w/index.php?title=Assocaition_football&mobileaction=toggle_view_mobile',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 '//shop.wikimedia.org',
 '/w/index.php?title=Assocaition_football&action=edit&redlink=1',
 '/w/index.php?title=Assocaition_football&action=info',
 '/w/index.php?title=Special:CreateAccount&returnto=Assocaition+football',
 '/w/index.php?title=Special:UserLogin&returnto=Assocaition+football',
 '/w/index.php?title=Talk:Assocaition_football&action=edit&redlink=1',
 '/wiki/Case_sensitivity',
 '/wiki/File:Commons-logo.svg',
 '/wiki/File:Wikibooks-logo.svg',
 '/wiki/File:Wikidata-logo.svg',
 '/wiki/File:Wikinews-logo.svg',
 '/wiki/File:Wikiquote-logo.svg',
 '/wiki/File:Wikisource-logo.svg',
 '/wiki/File:Wikiversity-logo.svg',
 '/wiki/File:Wikivoyage-Logo-v3-icon.svg',
 '/wiki/File:Wiktionary-logo-v2.svg',
 '/wiki/Help:Contents',
 '/wiki/Main_Page',
 '/wiki/Portal:Current_events',
 '/wiki/Special:MyContributions',
 '/wiki/Special:MyTalk',
 '/wiki/Spe

In [10]:
full_path_urls = r.html.absolute_links
full_path_urls

{'https://commons.wikimedia.org/wiki/Special:Search/Assocaition_football',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 'https://en.m.wikipedia.org/w/index.php?title=Assocaition_football&mobileaction=toggle_view_mobile',
 'https://en.wikibooks.org/wiki/Special:Search/Assocaition_football',
 'https://en.wikinews.org/wiki/Special:Search/Assocaition_football',
 'https://en.wikipedia.org/w/index.php?search=Assocaition+football&title=Special%3ASearch&fulltext=1',
 'https://en.wikipedia.org/w/index.php?search=Assocaition+football&title=Special%3ASearch&fulltext=1&ns0=1',
 'https://en.wikipedia.org/w/index.php?title=Assocaition_football&action=edit&redlink=1',
 'https://en.wikipedia.org/w/index.php?title=Assocaition_football&action=info',
 'https://en.wikipedia.org/w/index.php?title=Assocaition_football&action=purge',
 'https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&retur

In [11]:
type(urls)

set

## Searching fro elements

In [12]:
links = r.html.find("a")
links

[<Element 'a' id='top'>,
 <Element 'a' class=('mw-jump-link',) href='#mw-head'>,
 <Element 'a' class=('mw-jump-link',) href='#p-search'>,
 <Element 'a' href='/wiki/Special:SiteMatrix' title='Special:SiteMatrix'>,
 <Element 'a' class=('image',) href='/wiki/File:Wiktionary-logo-v2.svg'>,
 <Element 'a' class=('extiw',) href='https://en.wiktionary.org/wiki/Special:Search/Assocaition_football' title='wiktionary:Special:Search/Assocaition football'>,
 <Element 'a' class=('image',) href='/wiki/File:Wikibooks-logo.svg'>,
 <Element 'a' class=('extiw',) href='https://en.wikibooks.org/wiki/Special:Search/Assocaition_football' title='wikibooks:Special:Search/Assocaition football'>,
 <Element 'a' class=('image',) href='/wiki/File:Wikiquote-logo.svg'>,
 <Element 'a' class=('extiw',) href='https://en.wikiquote.org/wiki/Special:Search/Assocaition_football' title='wikiquote:Special:Search/Assocaition football'>,
 <Element 'a' class=('image',) href='/wiki/File:Wikisource-logo.svg'>,
 <Element 'a' class=

In [13]:
links[4]

<Element 'a' class=('image',) href='/wiki/File:Wiktionary-logo-v2.svg'>

In [14]:
links[4].html

'<a class="image" href="/wiki/File:Wiktionary-logo-v2.svg"><img alt="Wiktionary-logo-v2.svg" data-file-height="391" data-file-width="391" decoding="async" height="30" src="//upload.wikimedia.org/wikipedia/en/thumb/0/06/Wiktionary-logo-v2.svg/30px-Wiktionary-logo-v2.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/0/06/Wiktionary-logo-v2.svg/45px-Wiktionary-logo-v2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/0/06/Wiktionary-logo-v2.svg/60px-Wiktionary-logo-v2.svg.png 2x" width="30"/></a>'

In [15]:
links[4].attrs

{'class': ('image',), 'href': '/wiki/File:Wiktionary-logo-v2.svg'}

In [16]:
r.html.find("a", containing="wikipedia")

[<Element 'a' href='//en.wikipedia.org/wiki/Wikipedia:Contact_us'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Wikipedia:About'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Find out about Wikipedia'>,
 <Element 'a' href='//shop.wikimedia.org' title='Visit the Wikipedia store'>,
 <Element 'a' href='https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en' title='Support us'>,
 <Element 'a' dir='ltr' href='https://en.wikipedia.org/wiki/Assocaition_football'>,
 <Element 'a' href='/wiki/Special:WhatLinksHere/Assocaition_football' title='Special:WhatLinksHere/Assocaition football'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?search=Assocaition+football&title=Special%3ASearch&fulltext=1'>]

In [17]:
[tag.text for tag in r.html.find("a", containing="wikipedia")]

['Contact Wikipedia',
 'About Wikipedia',
 'About Wikipedia',
 'Wikipedia store',
 'Donate to Wikipedia',
 'https://en.wikipedia.org/wiki/Assocaition_football',
 'Look for pages within Wikipedia that link to this title',
 'search for Assocaition football in Wikipedia']

In [18]:
r.html.find("p", first=True)

<Element 'p' >

## Searching for text

In [19]:
r.html.search_all("known{}soccer")

[]

## CSS selectors

In [21]:
r.html.find('span')

[<Element 'span' class=('plainlinks',)>,
 <Element 'span' class=('plainlinks',)>,
 <Element 'span' class=('plainlinks',)>,
 <Element 'span' >,
 <Element 'span' >]

## Select elelments based upon ID

NOte that the ID attributes are case sensitive and unique

In [23]:
r.html.find('#Name')

[]

In [0]:
r.html.find("#Duration_and_tie-breaking_methods", first=True)