# Set-up

In [1]:
# Import Libraries
import requests
from bs4 import BeautifulSoup

In [2]:
# Define the URL of the site we will scrape
base_site = "https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/"

In [3]:
# Sending a request to the webpage
response = requests.get(base_site)
response

<Response [200]>

In [4]:
# Get the html from the webpage
html = response.content
html

b'<!DOCTYPE html>\n<html lang="en-US" class="hitim">\n<head prefix="og: http://ogp.me/ns# flixstertomatoes: http://ogp.me/ns/apps/flixstertomatoes#">\n    <meta http-equiv="content-type" content="text/html; charset=UTF-8" />\n    \n    <!-- OneTrust Cookies Consent Notice start for rottentomatoes.com -->\n    <script src="https://cdn.cookielaw.org/consent/7e979733-6841-4fce-9182-515fac69187f/otSDKStub.js"\n        type="text/javascript"\n        charset="UTF-8"\n        data-domain-script="7e979733-6841-4fce-9182-515fac69187f"\n        integrity="sha384-TKdmlzVmoD70HzftTw4WtOzIBL5mNx8mXSRzEvwrWjpIJ7FZ/EuX758yMDWXtRUN"\n        crossorigin="anonymous" >\n    </script>\n    <script type="text/javascript">\n        function OptanonWrapper() { }\n    </script>\n    <!-- OneTrust Cookies Consent Notice end for rottentomatoes.com -->\n    <!-- OneTrust IAB US Privacy (USP) -->\n    <script src="https://cdn.cookielaw.org/opt-out/otCCPAiab.js"\n        type="text/javascript"\n        charset="

# Choosing a parser: lxml

In [5]:
soup = BeautifulSoup(html, 'lxml')

In [6]:
with open('Rotten_tomatoes_LXML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

# Finding an element containing all the data 

In [7]:
# Find all div tags on the webpage containing the information we want to scrape
# Use inspect to see the div tag attribute we are targetting
divs = soup.find_all("div", {"class": "col-sm-18 col-full-xs countdown-item-content"})
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">63%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#140</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>64770% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top"

# Extracting these informations:
1. Title
1. Year
1. Score
1. The Critics Consensus
1. Director
1. Cast
1. Synopsis

In [8]:
# The title, year and score of each movie are contained in the 'h2' tags
# Lets check the html for the first div
divs[0]

<div class="col-sm-18 col-full-xs countdown-item-content">
<div class="row countdown-item-title-bar">
<div class="col-sm-20 col-full-xs" style="height: 100%;">
<div class="article_movie_title" style="float: left;">
<div><h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">63%</span></h2></div>
</div>
</div>
<div class="col-sm-4 col-full-xs" style="height: 100%;">
<div class="countdown-index">#140</div>
</div>
</div>
<div class="row countdown-item-details">
<div class="col-sm-24">
<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>64770% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="

In [9]:
# Get the headings by getting the h2 tag
divs[0].find("h2")

<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">63%</span></h2>

In [10]:
# Extracting all 'h2' tags
headings = [div.find("h2") for div in divs]
headings

[<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">63%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/equilibrium">Equilibrium</a> <span class="subtle start-year">(2002)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">40%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/hero">Hero</a> <span class="subtle start-year">(2002)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">94%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/1017666-road_house">Road House</a> <span class="subtle start-year">(1989)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">44%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/unstoppable-2010">Unstoppable</a> <span class="subtle start-y

In [11]:
# Inspecting the text inside the headings
[heading.text for heading in headings]

['Running Scared (1986)  63%',
 'Equilibrium (2002)  40%',
 'Hero (2002)  94%',
 'Road House (1989)  44%',
 'Unstoppable (2010)  87%',
 'Shaft (1971)  88%',
 'The Villainess (2017)  85%',
 'Highlander (1986)  71%',
 'Die Hard 2 (1990)  69%',
 'National Treasure (2004)  46%',
 'The Protector (2005)  53%',
 'Revenge (2017)  93%',
 'El Mariachi (1992)  91%',
 'A Touch of Zen (1971)  97%',
 'Top Gun (1986)  57%',
 'Con Air (1997)  58%',
 'The Expendables 2 (2012)  67%',
 'The Mummy (1999)  62%',
 'Mr. & Mrs. Smith (2005)  60%',
 'Rush Hour (1998)  62%',
 'The Equalizer (2014)  61%',
 'Captain America: Civil War (2016)  90%',
 'Air Force One (1997)  79%',
 'Bloodsport (1988)  46%',
 'Blade (1998)  59%',
 'Bad Boys (1995)  44%',
 'Die Hard With a Vengeance (1995)  59%',
 'The Running Man (1987)  67%',
 'Code of Silence (1985)  68%',
 "Shoot 'Em Up (2007)  67%",
 'Crank (2006)  62%',
 'Machete (2010)  70%',
 'Drive (2011)  93%',
 'Batman (1989)  77%',
 'Under Siege (1992)  80%',
 'Independenc

In [12]:
# We got the information we need
# However, we want the title, year, and sport seperately
# Let's see the first heading
headings[0]

<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">63%</span></h2>

In [13]:
# We notice that:

# The movie title is in the 'a' tag
# The year is in a 'span' with class 'start-year'
# The score is in a 'span' with class 'tMeterScore'

# Title

In [14]:
# Let's check all anchor tags
[heading.find("a") for heading in headings]

[<a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a>,
 <a href="https://www.rottentomatoes.com/m/equilibrium">Equilibrium</a>,
 <a href="https://www.rottentomatoes.com/m/hero">Hero</a>,
 <a href="https://www.rottentomatoes.com/m/1017666-road_house">Road House</a>,
 <a href="https://www.rottentomatoes.com/m/unstoppable-2010">Unstoppable</a>,
 <a href="https://www.rottentomatoes.com/m/1018699-shaft">Shaft</a>,
 <a href="https://www.rottentomatoes.com/m/the_villainess">The Villainess</a>,
 <a href="https://www.rottentomatoes.com/m/highlander">Highlander</a>,
 <a href="https://www.rottentomatoes.com/m/die_hard_2_1990">Die Hard 2</a>,
 <a href="https://www.rottentomatoes.com/m/national_treasure">National Treasure</a>,
 <a href="https://www.rottentomatoes.com/m/protector">The Protector</a>,
 <a href="https://www.rottentomatoes.com/m/revenge_2018">Revenge</a>,
 <a href="https://www.rottentomatoes.com/m/el_mariachi">El Mariachi</a>,
 <a href="https://www.rotten

In [15]:
# Obtaining the movie titles from the links
movie_names = [heading.find("a").string for heading in headings]
movie_names

['Running Scared',
 'Equilibrium',
 'Hero',
 'Road House',
 'Unstoppable',
 'Shaft',
 'The Villainess',
 'Highlander',
 'Die Hard 2',
 'National Treasure',
 'The Protector',
 'Revenge',
 'El Mariachi',
 'A Touch of Zen',
 'Top Gun',
 'Con Air',
 'The Expendables 2',
 'The Mummy',
 'Mr. & Mrs. Smith',
 'Rush Hour',
 'The Equalizer',
 'Captain America: Civil War',
 'Air Force One',
 'Bloodsport',
 'Blade',
 'Bad Boys',
 'Die Hard With a Vengeance',
 'The Running Man',
 'Code of Silence',
 "Shoot 'Em Up",
 'Crank',
 'Machete',
 'Drive',
 'Batman',
 'Under Siege',
 'Independence Day',
 'Bullitt',
 'Wanted',
 'Superman: The Movie',
 'Ronin',
 'They Live',
 'Cliffhanger',
 "Marvel's the Avengers",
 'Hot Fuzz',
 'The Warriors',
 'Starship Troopers',
 'Elite Squad 2',
 'Point Break',
 'The Long Kiss Goodnight',
 'The Guest',
 'Taken',
 '300',
 'True Lies',
 'Demolition Man',
 'Hardcore Henry',
 'Police Story',
 'Brotherhood of the Wolf',
 'Kingsman: The Secret Service',
 'The Fifth Element',
 

# Year

In [16]:
# Filtering only the spans containing the year
[heading.find("span", {"class": "start-year"}) for heading in headings]

[<span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(1989)</span>,
 <span class="subtle start-year">(2010)</span>,
 <span class="subtle start-year">(1971)</span>,
 <span class="subtle start-year">(2017)</span>,
 <span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(1990)</span>,
 <span class="subtle start-year">(2004)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(2017)</span>,
 <span class="subtle start-year">(1992)</span>,
 <span class="subtle start-year">(1971)</span>,
 <span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(1997)</span>,
 <span class="subtle start-year">(2012)</span>,
 <span class="subtle start-year">(1999)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(1998)</span>,
 <span class="subtle start-year">(2014)<

In [17]:
# Extracting the year string
years = [heading.find("span", {"class": "start-year"}).string for heading in headings]
years

['(1986)',
 '(2002)',
 '(2002)',
 '(1989)',
 '(2010)',
 '(1971)',
 '(2017)',
 '(1986)',
 '(1990)',
 '(2004)',
 '(2005)',
 '(2017)',
 '(1992)',
 '(1971)',
 '(1986)',
 '(1997)',
 '(2012)',
 '(1999)',
 '(2005)',
 '(1998)',
 '(2014)',
 '(2016)',
 '(1997)',
 '(1988)',
 '(1998)',
 '(1995)',
 '(1995)',
 '(1987)',
 '(1985)',
 '(2007)',
 '(2006)',
 '(2010)',
 '(2011)',
 '(1989)',
 '(1992)',
 '(1996)',
 '(1968)',
 '(2008)',
 '(1978)',
 '(1998)',
 '(1988)',
 '(1993)',
 '(2012)',
 '(2007)',
 '(1979)',
 '(1997)',
 '(2010)',
 '(1991)',
 '(1996)',
 '(2014)',
 '(2008)',
 '(2006)',
 '(1994)',
 '(1993)',
 '(2015)',
 '(1985)',
 '(2001)',
 '(2014)',
 '(1997)',
 '(1986)',
 '(2017)',
 '(1995)',
 '(2004)',
 '(1984)',
 '(2003)',
 '(2004)',
 '(1993)',
 '(1981)',
 '(2000)',
 '(2004)',
 '(2010)',
 '(1992)',
 '(1989)',
 '(2004)',
 '(1986)',
 '(2008)',
 '(2018)',
 '(2017)',
 '(1964)',
 '(1976)',
 '(2017)',
 '(1972)',
 '(2014)',
 '(2003)',
 '(1971)',
 '(2015)',
 '(1990)',
 '(1992)',
 '(1971)',
 '(2014)',
 '(2003)',

We don't want the brackets, so we have to remove it

## Removing the brackets

In [18]:
# One way to remove the brackets is to drop the first and last symbol of the string
years[0][1:-1]

'1986'

However, this will break, if the format of the year is changed

For example: If the original data doesn't have brackets anymore

In [19]:
# Alternativelly, we can do it with the help of the strip() method (this is robust)

# It removes leading and trailing symbols from a string
# By default, it removes whitespace, but we can specify other symbols to strip

In [20]:
# Removing '('
years[0].strip('(')

'1986)'

In [21]:
# Removing ')'
years[0].strip(')')

'(1986'

In [22]:
# Combining both
years[0].strip('()')

'1986'

In [23]:
# Updating years with stripped values
years = [year.strip('()') for year in years]
years

['1986',
 '2002',
 '2002',
 '1989',
 '2010',
 '1971',
 '2017',
 '1986',
 '1990',
 '2004',
 '2005',
 '2017',
 '1992',
 '1971',
 '1986',
 '1997',
 '2012',
 '1999',
 '2005',
 '1998',
 '2014',
 '2016',
 '1997',
 '1988',
 '1998',
 '1995',
 '1995',
 '1987',
 '1985',
 '2007',
 '2006',
 '2010',
 '2011',
 '1989',
 '1992',
 '1996',
 '1968',
 '2008',
 '1978',
 '1998',
 '1988',
 '1993',
 '2012',
 '2007',
 '1979',
 '1997',
 '2010',
 '1991',
 '1996',
 '2014',
 '2008',
 '2006',
 '1994',
 '1993',
 '2015',
 '1985',
 '2001',
 '2014',
 '1997',
 '1986',
 '2017',
 '1995',
 '2004',
 '1984',
 '2003',
 '2004',
 '1993',
 '1981',
 '2000',
 '2004',
 '2010',
 '1992',
 '1989',
 '2004',
 '1986',
 '2008',
 '2018',
 '2017',
 '1964',
 '1976',
 '2017',
 '1972',
 '2014',
 '2003',
 '1971',
 '2015',
 '1990',
 '1992',
 '1971',
 '2014',
 '2003',
 '1993',
 '2018',
 '2010',
 '1995',
 '2002',
 '2019',
 '2012',
 '2002',
 '2008',
 '1997',
 '1985',
 '2008',
 '2011',
 '2011',
 '1987',
 '1996',
 '1987',
 '2017',
 '2006',
 '2017',
 

In [24]:
# Converting all the strings to integers
years = [int(year) for year in years]
years

[1986,
 2002,
 2002,
 1989,
 2010,
 1971,
 2017,
 1986,
 1990,
 2004,
 2005,
 2017,
 1992,
 1971,
 1986,
 1997,
 2012,
 1999,
 2005,
 1998,
 2014,
 2016,
 1997,
 1988,
 1998,
 1995,
 1995,
 1987,
 1985,
 2007,
 2006,
 2010,
 2011,
 1989,
 1992,
 1996,
 1968,
 2008,
 1978,
 1998,
 1988,
 1993,
 2012,
 2007,
 1979,
 1997,
 2010,
 1991,
 1996,
 2014,
 2008,
 2006,
 1994,
 1993,
 2015,
 1985,
 2001,
 2014,
 1997,
 1986,
 2017,
 1995,
 2004,
 1984,
 2003,
 2004,
 1993,
 1981,
 2000,
 2004,
 2010,
 1992,
 1989,
 2004,
 1986,
 2008,
 2018,
 2017,
 1964,
 1976,
 2017,
 1972,
 2014,
 2003,
 1971,
 2015,
 1990,
 1992,
 1971,
 2014,
 2003,
 1993,
 2018,
 2010,
 1995,
 2002,
 2019,
 2012,
 2002,
 2008,
 1997,
 1985,
 2008,
 2011,
 2011,
 1987,
 1996,
 1987,
 2017,
 2006,
 2017,
 1994,
 1989,
 2014,
 1973,
 1985,
 1982,
 2015,
 1984,
 2000,
 2003,
 1994,
 1994,
 1994,
 2014,
 2000,
 1987,
 2007,
 1990,
 1981,
 1995,
 2011,
 2018,
 1981,
 1986,
 1992,
 1999,
 1991,
 1988,
 2015]

# Score

In [25]:
# Filtering only the spans containing the score
[heading.find("span", {"class": "tMeterScore"}) for heading in headings]

[<span class="tMeterScore">63%</span>,
 <span class="tMeterScore">40%</span>,
 <span class="tMeterScore">94%</span>,
 <span class="tMeterScore">44%</span>,
 <span class="tMeterScore">87%</span>,
 <span class="tMeterScore">88%</span>,
 <span class="tMeterScore">85%</span>,
 <span class="tMeterScore">71%</span>,
 <span class="tMeterScore">69%</span>,
 <span class="tMeterScore">46%</span>,
 <span class="tMeterScore">53%</span>,
 <span class="tMeterScore">93%</span>,
 <span class="tMeterScore">91%</span>,
 <span class="tMeterScore">97%</span>,
 <span class="tMeterScore">57%</span>,
 <span class="tMeterScore">58%</span>,
 <span class="tMeterScore">67%</span>,
 <span class="tMeterScore">62%</span>,
 <span class="tMeterScore">60%</span>,
 <span class="tMeterScore">62%</span>,
 <span class="tMeterScore">61%</span>,
 <span class="tMeterScore">90%</span>,
 <span class="tMeterScore">79%</span>,
 <span class="tMeterScore">46%</span>,
 <span class="tMeterScore">59%</span>,
 <span class="tMeterScore

In [26]:
# Extracting the score string
scores = [heading.find("span", {"class": "tMeterScore"}).string for heading in headings]
scores

['63%',
 '40%',
 '94%',
 '44%',
 '87%',
 '88%',
 '85%',
 '71%',
 '69%',
 '46%',
 '53%',
 '93%',
 '91%',
 '97%',
 '57%',
 '58%',
 '67%',
 '62%',
 '60%',
 '62%',
 '61%',
 '90%',
 '79%',
 '46%',
 '59%',
 '44%',
 '59%',
 '67%',
 '68%',
 '67%',
 '62%',
 '70%',
 '93%',
 '77%',
 '80%',
 '68%',
 '98%',
 '71%',
 '94%',
 '70%',
 '86%',
 '68%',
 '91%',
 '91%',
 '88%',
 '72%',
 '91%',
 '69%',
 '68%',
 '92%',
 '60%',
 '61%',
 '70%',
 '63%',
 '51%',
 '93%',
 '73%',
 '75%',
 '71%',
 '74%',
 '79%',
 '80%',
 '80%',
 '83%',
 '85%',
 '86%',
 '91%',
 '88%',
 '90%',
 '93%',
 '95%',
 '89%',
 '84%',
 '91%',
 '93%',
 '94%',
 '91%',
 '93%',
 '99%',
 '96%',
 '93%',
 '83%',
 '90%',
 '83%',
 '96%',
 '81%',
 '90%',
 '93%',
 '89%',
 '91%',
 '85%',
 '96%',
 '96%',
 '87%',
 '76%',
 '89%',
 '94%',
 '80%',
 '84%',
 '86%',
 '93%',
 '88%',
 '94%',
 '93%',
 '78%',
 '80%',
 '67%',
 '92%',
 '89%',
 '94%',
 '92%',
 '100%',
 '95%',
 '82%',
 '88%',
 '67%',
 '86%',
 '94%',
 '100%',
 '80%',
 '85%',
 '74%',
 '95%',
 '84%',
 '86%'

In [27]:
# Removing the '%' sign
scores = [score.strip('%') for score in scores]
scores

['63',
 '40',
 '94',
 '44',
 '87',
 '88',
 '85',
 '71',
 '69',
 '46',
 '53',
 '93',
 '91',
 '97',
 '57',
 '58',
 '67',
 '62',
 '60',
 '62',
 '61',
 '90',
 '79',
 '46',
 '59',
 '44',
 '59',
 '67',
 '68',
 '67',
 '62',
 '70',
 '93',
 '77',
 '80',
 '68',
 '98',
 '71',
 '94',
 '70',
 '86',
 '68',
 '91',
 '91',
 '88',
 '72',
 '91',
 '69',
 '68',
 '92',
 '60',
 '61',
 '70',
 '63',
 '51',
 '93',
 '73',
 '75',
 '71',
 '74',
 '79',
 '80',
 '80',
 '83',
 '85',
 '86',
 '91',
 '88',
 '90',
 '93',
 '95',
 '89',
 '84',
 '91',
 '93',
 '94',
 '91',
 '93',
 '99',
 '96',
 '93',
 '83',
 '90',
 '83',
 '96',
 '81',
 '90',
 '93',
 '89',
 '91',
 '85',
 '96',
 '96',
 '87',
 '76',
 '89',
 '94',
 '80',
 '84',
 '86',
 '93',
 '88',
 '94',
 '93',
 '78',
 '80',
 '67',
 '92',
 '89',
 '94',
 '92',
 '100',
 '95',
 '82',
 '88',
 '67',
 '86',
 '94',
 '100',
 '80',
 '85',
 '74',
 '95',
 '84',
 '86',
 '98',
 '80',
 '92',
 '82',
 '94',
 '83',
 '87',
 '97',
 '93',
 '98',
 '92',
 '83',
 '91',
 '94',
 '97']

In [28]:
# Converting each score to an integer
scores = [int(score) for score in scores]
scores

[63,
 40,
 94,
 44,
 87,
 88,
 85,
 71,
 69,
 46,
 53,
 93,
 91,
 97,
 57,
 58,
 67,
 62,
 60,
 62,
 61,
 90,
 79,
 46,
 59,
 44,
 59,
 67,
 68,
 67,
 62,
 70,
 93,
 77,
 80,
 68,
 98,
 71,
 94,
 70,
 86,
 68,
 91,
 91,
 88,
 72,
 91,
 69,
 68,
 92,
 60,
 61,
 70,
 63,
 51,
 93,
 73,
 75,
 71,
 74,
 79,
 80,
 80,
 83,
 85,
 86,
 91,
 88,
 90,
 93,
 95,
 89,
 84,
 91,
 93,
 94,
 91,
 93,
 99,
 96,
 93,
 83,
 90,
 83,
 96,
 81,
 90,
 93,
 89,
 91,
 85,
 96,
 96,
 87,
 76,
 89,
 94,
 80,
 84,
 86,
 93,
 88,
 94,
 93,
 78,
 80,
 67,
 92,
 89,
 94,
 92,
 100,
 95,
 82,
 88,
 67,
 86,
 94,
 100,
 80,
 85,
 74,
 95,
 84,
 86,
 98,
 80,
 92,
 82,
 94,
 83,
 87,
 97,
 93,
 98,
 92,
 83,
 91,
 94,
 97]

## Critics Consensus

In [29]:
# The critics consensus is located inside a 'div' tag with the class 'info critics-consensus'
# This can be found inside the original 'div's we scraped
divs[0]

<div class="col-sm-18 col-full-xs countdown-item-content">
<div class="row countdown-item-title-bar">
<div class="col-sm-20 col-full-xs" style="height: 100%;">
<div class="article_movie_title" style="float: left;">
<div><h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">63%</span></h2></div>
</div>
</div>
<div class="col-sm-4 col-full-xs" style="height: 100%;">
<div class="countdown-index">#140</div>
</div>
</div>
<div class="row countdown-item-details">
<div class="col-sm-24">
<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>64770% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="

In [30]:
# Getting the "div" tags containing the critics concensus
consensus = [div.find("div", {"class": "info critics-consensus"}) for div in divs]
consensus

[<div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> <em>Running Scared</em> struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Equilibrium is a reheated mishmash of other sci-fi movies.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> With death-defying action sequences and epic historic sweep, <em>Hero</em> offers everything a martial arts fan could ask for.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Whether <em>Road House</em> is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> As fast, 

In [31]:
# Inspecting the text inside these tags
[con.text for con in consensus]

['Critics Consensus: Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.',
 'Critics Consensus: Equilibrium is a reheated mishmash of other sci-fi movies.',
 'Critics Consensus: With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.',
 "Critics Consensus: Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.",
 "Critics Consensus: As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years.",
 'Critics Consensus: This is the man that would risk his neck for his brother, man. Can you dig it?',
 'Critics Consensus: The Villainess offers enough pure kinetic thrills to satisfy genre enthusiasts -- and carve out a bl

In [32]:
# Every consensus starts with the string 'Critics Consensus: '
# We want to remove it

In [33]:
# Defining the phrase to be removed (note the space at the end)
common_phrase = 'Critics Consensus: '

In [34]:
# Finding how long is the common phrase
common_phrase_len = len(common_phrase)

In [35]:
consensus[0].text

'Critics Consensus: Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [36]:
# Taking only the part of the text after the common phrase
consensus[0].text[common_phrase_len:]

'Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [37]:
# Remove common phrase from the whole list
consensus_text = [con.text[common_phrase_len:] for con in consensus]
consensus_text

['Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.',
 'Equilibrium is a reheated mishmash of other sci-fi movies.',
 'With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.',
 "Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.",
 "As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years.",
 'This is the man that would risk his neck for his brother, man. Can you dig it?',
 'The Villainess offers enough pure kinetic thrills to satisfy genre enthusiasts -- and carve out a bloody niche for itself in modern Korean action cinema.',
 "People hate Highlander because it's cheesy, bombastic, and absurd. And peop

# Directors

In [38]:
# Extracting all director divs
directors = [div.find("div", class_ = 'director') for div in divs]
directors

[<div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/peter_hyams">Peter Hyams</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/kurt_wimmer">Kurt Wimmer</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/zhang_yimou">Yimou Zhang</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/rowdy_herrington">Rowdy Herrington</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/tony_scott">Tony Scott</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/gordon_parks">Gordon Parks</a></div>,
 <div 

In [39]:
# The director's name can be found as the string of an anchor tag

# Obtaining all director anchor tags
directors = [director.find("a").string for director in directors]
directors

['Peter Hyams',
 'Kurt Wimmer',
 'Yimou Zhang',
 'Rowdy Herrington',
 'Tony Scott',
 'Gordon Parks',
 'Jeong Byeong-gil',
 'Russell Mulcahy',
 'Renny Harlin',
 'Jon Turteltaub',
 'Prachya Pinkaew',
 'Coralie Fargeat',
 'Robert Rodriguez',
 'King Hu',
 'Tony Scott',
 'Simon West',
 'Simon West',
 'Stephen Sommers',
 'Doug Liman',
 'Brett Ratner',
 'Antoine Fuqua',
 'Anthony Russo',
 'Wolfgang Petersen',
 'Newt Arnold',
 'Stephen Norrington',
 'Michael Bay',
 'John McTiernan',
 'Paul Michael Glaser',
 'Andrew Davis',
 'Michael Davis',
 'Mark Neveldine',
 'Robert Rodriguez',
 'Nicolas Winding Refn',
 'Tim Burton',
 'Andrew Davis',
 'Roland Emmerich',
 'Peter Yates',
 'Timur Bekmambetov',
 'Richard Donner',
 'John Frankenheimer',
 'John Carpenter',
 'Renny Harlin',
 'Joss Whedon',
 'Edgar Wright',
 'Walter Hill',
 'Paul Verhoeven',
 'José Padilha',
 'Kathryn Bigelow',
 'Renny Harlin',
 'Adam Wingard',
 'Pierre Morel',
 'Zack Snyder',
 'James Cameron',
 'Marco Brambilla',
 'Ilya Naishuller'

# Cast Info

In [40]:
cast_info = [div.find("div", {"class": "cast"}) for div in divs]
cast_info

[<div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/gregory_hines">Gregory Hines</a>, <a class="" href="//www.rottentomatoes.com/celebrity/billy_crystal">Billy Crystal</a>, <a class="" href="//www.rottentomatoes.com/celebrity/jimmy_smits">Jimmy Smits</a>, <a class="" href="//www.rottentomatoes.com/celebrity/steven_bauer">Steven Bauer</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/christian_bale">Christian Bale</a>, <a class="" href="//www.rottentomatoes.com/celebrity/emily_watson">Emily Watson</a>, <a class="" href="//www.rottentomatoes.com/celebrity/taye_diggs">Taye Diggs</a>, <a class="" href="//www.rottentomatoes.com/celebrity/angus_macfadyen">Angus Macfadyen</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/jet_li">Jet Li</a>, <a class="" href="//www

In [41]:
cast_info[0]

<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/gregory_hines">Gregory Hines</a>, <a class="" href="//www.rottentomatoes.com/celebrity/billy_crystal">Billy Crystal</a>, <a class="" href="//www.rottentomatoes.com/celebrity/jimmy_smits">Jimmy Smits</a>, <a class="" href="//www.rottentomatoes.com/celebrity/steven_bauer">Steven Bauer</a></div>

Each cast member's name is the string of a link

There are multiple cast members for a movie

In [42]:
# Get the anchors for all cast members of a single movie
cast_links = cast_info[0].find_all("a")
cast_links

[<a class="" href="//www.rottentomatoes.com/celebrity/gregory_hines">Gregory Hines</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/billy_crystal">Billy Crystal</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/jimmy_smits">Jimmy Smits</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/steven_bauer">Steven Bauer</a>]

In [43]:
# Extract the names
cast_names = [link.string for link in cast_links]
cast_names

['Gregory Hines', 'Billy Crystal', 'Jimmy Smits', 'Steven Bauer']

In [44]:
# We can concatenate this list into a single string with a separator
cast = ", ".join(cast_names)
cast

'Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer'

In [45]:
# Extract the names for all movies using a for loop
cast = []

for c in cast_info:
    cast_links = c.find_all("a")
    cast_names = [link.string for link in cast_links]
    
    cast.append(", ".join(cast_names))
    
cast

['Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 'Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 'Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Donnie Yen',
 'Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 'Denzel Washington, Chris Pine, Rosario Dawson, Kevin Dunn',
 'Richard Roundtree, Moses Gunn, Christopher St. John, Charles Cioffi',
 'Kim Ok-bin, Shin Ha-kyun, Sung-joon, Kim Seo-hyung',
 'Christopher Lambert, Sean Connery, Roxanne Hart, Clancy Brown',
 'Bruce Willis, Bonnie Bedelia, William Atherton, Reginald VelJohnson',
 'Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean',
 'Tachakorn Yeerum, Johnny Nguyen, Nathan Jones, Phetthai Vongkumlao',
 'Matilda Anna Ingrid Lutz, Kevin Janssens, Vincent Colombe, Guillaume Bouchède',
 'Carlos Gallardo, Consuelo Gómez, Reinol Martinez, Peter Marquardt',
 'Feng Hsu, Chun Shih, Pai Ying, Roy Chiao',
 'Tom Cruise, Kelly McGillis, Anthony Edwards, Val Kilmer',
 'Nicolas Cage, John Cusack, John Malkovich, Stev

# Synopsis

In [46]:
# The synopsis is located inside a 'div' tag with the class 'info synopsis'
synopsis = [div.find("div", {"class": "synopsis"}) for div in divs]
synopsis

[<div class="info synopsis"><span class="descriptor">Synopsis:</span> Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/1018009-running_scared" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> In a futuristic world, a regime has eliminated war by suppressing emotions: books, art and music are strictly forbidden and...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/equilibrium" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> In this visually arresting martial arts epic set in ancient China, an unnamed fighter (Jet Li) is being honored for...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/hero" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> The 

In [47]:
# Check the first element
synopsis[0]

<div class="info synopsis"><span class="descriptor">Synopsis:</span> Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/1018009-running_scared" target="_top"> [More]</a></div>

Every synopsis starts with the string 'Synopsis: '

We want to remove it

We should use different method than before as in Critics Consensus because we also want to remove the text of the last tag "[More]"

In [48]:
# The text is the second child
synopsis[0].contents[1].strip()

'Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...'

In [49]:
# Extracting the text
synopsis_text = [syn.contents[1].strip() for syn in synopsis]
synopsis_text

['Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...',
 'In a futuristic world, a regime has eliminated war by suppressing emotions: books, art and music are strictly forbidden and...',
 'In this visually arresting martial arts epic set in ancient China, an unnamed fighter (Jet Li) is being honored for...',
 'The Double Deuce is the meanest, loudest and rowdiest bar south of the Mason-Dixon Line, and Dalton (Patrick Swayze) has...',
 'When a massive, unmanned locomotive roars out of control, the threat is more ominous than just a derailment. The train...',
 'John Shaft (Richard Roundtree) is the ultimate in suave black detectives. He first finds himself up against Bumpy (Moses Gunn),...',
 'Honed from childhood to be an elite assassin, Sook-hee embarks on a rampage of violence and revenge to finally earn...',
 'When the mystical Russell Nash (Christopher Lambert) kills a man in a sword fight in a New York City parkin

# Represent the data in a pandas dataframe

In [50]:
# Import pandas library
import pandas as pd

In [51]:
# Create a dataframe
movies_info = pd.DataFrame()

In [52]:
# Populating the dataframe
movies_info["Movie Title"] = movie_names
movies_info["Year"] = years
movies_info["Score"] = scores
movies_info["Director"] = directors
movies_info["Synopsis"] = synopsis_text
movies_info["Cast"] = cast
movies_info["Consensus"] = consensus_text

In [53]:
movies_info

Unnamed: 0,Movie Title,Year,Score,Director,Synopsis,Cast,Consensus
0,Running Scared,1986,63,Peter Hyams,"Ray and Danny (Gregory Hines, Billy Crystal) a...","Gregory Hines, Billy Crystal, Jimmy Smits, Ste...",Running Scared struggles to strike a consisten...
1,Equilibrium,2002,40,Kurt Wimmer,"In a futuristic world, a regime has eliminated...","Christian Bale, Emily Watson, Taye Diggs, Angu...",Equilibrium is a reheated mishmash of other sc...
2,Hero,2002,94,Yimou Zhang,In this visually arresting martial arts epic s...,"Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Do...",With death-defying action sequences and epic h...
3,Road House,1989,44,Rowdy Herrington,"The Double Deuce is the meanest, loudest and r...","Patrick Swayze, Kelly Lynch, Sam Elliott, Ben ...",Whether Road House is simply bad or so bad it'...
4,Unstoppable,2010,87,Tony Scott,"When a massive, unmanned locomotive roars out ...","Denzel Washington, Chris Pine, Rosario Dawson,...","As fast, loud, and relentless as the train at ..."
...,...,...,...,...,...,...,...
135,Hard-Boiled,1992,92,John Woo,A cop who loses his partner in a shoot-out wit...,"Chow Yun-Fat, Bowie Lam, Philip Chan, Tony Leu...",Boasting impactful action as well as surprisin...
136,The Matrix,1999,83,Lilly Wachowski,Neo (Keanu Reeves) believes that Morpheus (Lau...,"Keanu Reeves, Laurence Fishburne, Carrie-Anne ...","Thanks to the Wachowskis' imaginative vision, ..."
137,Terminator 2: Judgment Day,1991,91,James Cameron,"In this sequel set eleven years after ""The Ter...","Arnold Schwarzenegger, Linda Hamilton, Edward ...",T2 features thrilling action sequences and eye...
138,Die Hard,1988,94,John McTiernan,New York City policeman John McClane (Bruce Wi...,"Bruce Willis, Alan Rickman, Bonnie Bedelia, Re...",Its many imitators (and sequels) have never co...


In [54]:
# Check for any missing data, as the web header state "140 ESSENTIAL ACTION MOVIES TO WATCH NOW", it should have 140 data
movies_info.count()

Movie Title    140
Year           140
Score          140
Director       140
Synopsis       140
Cast           140
Consensus      140
dtype: int64

# Exporting the data to CSV file

In [55]:
movies_info.to_excel("movies_info.xlsx", index = False, header = True)

# Exporting the data to excel file

In [56]:
movies_info.to_csv("movies_info.csv", index = False, header = True)