# Web Scraping Test
## IMDB Actor Filmography

## Objectives
* To learn more about web scraping
* Example used will be to pull actor filmography data from IMDB

In [2]:
# Install packages
# pip install requests
# pip install beautifulsoup4

In [1]:
# Load libraries and URL
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
# import numpy as np
# import seaborn as sns

# Example url is for Tom Cruise
url = 'https://www.imdb.com/name/nm0000129/'

# Load URL and confirm success
r = requests.get(url)
print(r.content[:100])

# Parse HTML with BeautifulSoup
soup = BeautifulSoup(r.content, 'html.parser')

b'\n\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/200'


In [2]:
# Prettify HTML for manual review
print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="app-id=342792525, app-argument=imdb:///name/nm0000129?src=mdot" name="apple-itunes-app"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Tom Cruise - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }

In [3]:
# Within 'soup', find all <div> tag with 'id' attribute starting with 'actor-'
films = soup.find_all('div', id = re.compile('^actor-'))

# # Within 'soup', find <div> tag with class "filmo-category-section" and find all <a> tags without class attribute
# films = soup.find('div', class_ = "filmo-category-section").find_all('a', class_ = False)

# Proof of parsing
# Print title and href
for i in films:
    print(i.text)
    print(i.get('href'))



 2021

Mission: Impossible 7
(announced)

Ethan Hunt

None


 

Luna Park
(announced)


None


 

Untitled Tom Cruise/SpaceX Project
(announced)


None


 2022

Mission: Impossible 8
(announced)

Ethan Hunt

None


 

Live Die Repeat and Repeat
(pre-production)

Cage (rumored)

None


 2020

Top Gun: Maverick
(post-production)

Maverick

None


 2018

Mission: Impossible - Fallout

Ethan Hunt

None


 2017

American Made

Barry Seal

None


 2017

The Mummy

Nick Morton

None


 2016

Jack Reacher: Never Go Back

Jack Reacher

None


 2015

Mission: Impossible - Rogue Nation

Ethan Hunt

None


 2014

Edge of Tomorrow

Cage

None


 2013/I

Oblivion

Jack

None


 2012

Jack Reacher

Reacher

None


 2012

Rock of Ages

Stacee Jaxx

None


 2011

Mission: Impossible - Ghost Protocol

Ethan Hunt

None


 2010

Knight and Day

Roy Miller

None


 2008

Valkyrie

Colonel Claus von Stauffenberg

None


 2008

Tropic Thunder

Les Grossman - Grossman's Office

None


 2007

Lions for Lambs

In [4]:
# Create new array, and append movie title and href
filmsarray = []
for film in films:
    filmsarray.append([film.a.text, film.a.get('href')])
filmsarray

[['Mission: Impossible 7', '/title/tt9603212/'],
 ['Luna Park', '/title/tt1123441/'],
 ['Untitled Tom Cruise/SpaceX Project', '/title/tt12273460/'],
 ['Mission: Impossible 8', '/title/tt9603208/'],
 ['Live Die Repeat and Repeat', '/title/tt5617712/'],
 ['Top Gun: Maverick', '/title/tt1745960/'],
 ['Mission: Impossible - Fallout', '/title/tt4912910/'],
 ['American Made', '/title/tt3532216/'],
 ['The Mummy', '/title/tt2345759/'],
 ['Jack Reacher: Never Go Back', '/title/tt3393786/'],
 ['Mission: Impossible - Rogue Nation', '/title/tt2381249/'],
 ['Edge of Tomorrow', '/title/tt1631867/'],
 ['Oblivion', '/title/tt1483013/'],
 ['Jack Reacher', '/title/tt0790724/'],
 ['Rock of Ages', '/title/tt1336608/'],
 ['Mission: Impossible - Ghost Protocol', '/title/tt1229238/'],
 ['Knight and Day', '/title/tt1013743/'],
 ['Valkyrie', '/title/tt0985699/'],
 ['Tropic Thunder', '/title/tt0942385/'],
 ['Lions for Lambs', '/title/tt0891527/'],
 ['Mission: Impossible III', '/title/tt0317919/'],
 ['War of the

## Repeat to Check
Repeat the same exercise but with another actor; in this case, Tom Hanks.

Note: Tom Hanks' filmography section begins with Producer, instead of Actor.  This originally caused an issue and forced me to review how to search for Actor credits, only.

In [5]:
# Repeat for another actor
# Example url is for Tom Hanks
url2 = 'https://www.imdb.com/name/nm0000158/'

# Load URL and confirm success
r2 = requests.get(url2)
# print(r2.content[:100])

# Parse HTML with BeautifulSoup
soup2 = BeautifulSoup(r2.content, 'html.parser')

In [6]:
# Prettify HTML for manual review
# print(soup2.prettify())

# Within 'soup', find all <div> tag with 'id' attribute starting with 'actor-'
films2 = soup2.find_all('div', id = re.compile('^actor-'))

filmsarray2 = []
for film in films2:
    filmsarray2.append([film.a.text, film.a.get('href')])
filmsarray2

[['Untitled Elvis Presley Project', '/title/tt3704428/'],
 ['A Man Called Ove', '/title/tt7405458/'],
 ['In the Garden of Beasts', '/title/tt2123969/'],
 ['BIOS', '/title/tt3420504/'],
 ['News of the World', '/title/tt6878306/'],
 ['Greyhound', '/title/tt6048922/'],
 ['A Beautiful Day in the Neighborhood', '/title/tt3224458/'],
 ['Toy Story 4', '/title/tt1979376/'],
 ['The Post', '/title/tt6294822/'],
 ['The David S. Pumpkins Halloween Special', '/title/tt7452910/'],
 ['The Circle', '/title/tt4287320/'],
 ['Inferno', '/title/tt3062096/'],
 ['Sully', '/title/tt3263904/'],
 ['Maya & Marty', '/title/tt5543284/'],
 ['A Hologram for the King', '/title/tt2980210/'],
 ['Ithaca', '/title/tt3501590/'],
 ['Bridge of Spies', '/title/tt3682448/'],
 ['Carly Rae Jepsen: I Really Like You', '/title/tt7094450/'],
 ['Toy Story That Time Forgot', '/title/tt3473654/'],
 ['Saving Mr. Banks', '/title/tt2140373/'],
 ['Toy Story of Terror', '/title/tt2446040/'],
 ['Captain Phillips', '/title/tt1535109/'],
 [