### Import necessary libraries

In [7]:
from bs4 import BeautifulSoup as bs
import requests
import json

### Scrape the infomation of a single movie on wikipedia

- Webpage: https://en.wikipedia.org/wiki/Toy_Story_3

In [2]:
response = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')
webpage = bs(response.content, 'html.parser')
[s.extract() for s in webpage('sup')] # extract all <sup> tags
contents = webpage.prettify()
# print(contents)

In [3]:
def get_value(td):
    if td.find('li'):
        return [li.get_text(" ", strip=True).replace('\xa0', ' ') for li in td.find_all('li')]
    else:
        return td.get_text(" ", strip=True).replace('\xa0', ' ')

# Scrape the infobox of a movie and save it into a dictionary
movie_info = {} 
infobox = webpage.find('table', class_='infobox vevent')
infobox_tr = infobox.find_all('tr')

for idx, row in enumerate(infobox_tr):
    if idx == 0:
        movie_info['Title'] = row.find('th').get_text(" ", strip=True)
    elif idx == 1:
        continue
    else:
        if row.find('th'): 
            key = row.find('th').get_text(" ", strip=True)
            value = get_value(row.find('td'))
            movie_info[key] = value
        
movie_info

{'Title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.067 billion'}

### Scrape the information of all movies on wikipedia

- Webpage: https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films

In [4]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
webpage = bs(response.content, 'html.parser')
# Do not include table 'Upcoming' and 'Undated', since tables contain a lot of inaccurate wikipedia movie url
movie_tables = webpage.select('table.wikitable.sortable')[:-2]
movie_list = []
for table in movie_tables:
    movies_in_table = table.select('i > a')
    for movie in movies_in_table:
        movie_list.append(movie)

In [5]:
# previously defined function above
def get_value(td):
    if td.find('li'):
        return [li.get_text(" ", strip=True).replace('\xa0', ' ') for li in td.find_all('li')]
    else:
        return td.get_text(" ", strip=True).replace('\xa0', ' ')
    
# get movie information from infobox inside each wikipedia disney movie page
def get_movie_info(url):
    response = requests.get(url)
    webpage = bs(response.content, 'html.parser')
    movie_info = {} 
    infobox = webpage.find('table', class_='infobox vevent')
    infobox_tr = infobox.find_all('tr')
    for idx, row in enumerate(infobox_tr):
        if idx == 0:
            movie_info['Title'] = row.find('th').get_text(" ", strip=True)
        elif idx == 1:
            continue
        else:
            if row.find('th'): 
                key = row.find('th').get_text(" ", strip=True)
                value = get_value(row.find('td'))
                movie_info[key] = value

    return movie_info

In [6]:
disney_movie_data = []

for idx, movie in enumerate(movie_list):
    try:
        movie_href = movie['href']
        wiki_url = 'https://en.wikipedia.org' + movie_href
        disney_movie_data.append(get_movie_info(wiki_url))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'


### Save / Reload movie data

In [8]:
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [9]:
def reload_data(title):
    with open(title, 'r', encoding='utf-8') as f:
        return json.load(f)

In [10]:
save_data('disney_movie_data.json', disney_movie_data)