In [102]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs


Below is the function that extracts the information table (infobox) from a movie's Wikipedia page.

In [103]:
def get_movie_table(url, title):
    """Scrape the infobox of one Wikipedia movie page into a dictionary.

    Parameters
    ----------
    url : str
        Site-relative link to the movie page (e.g. ``/wiki/Some_Film``); it is
        appended to ``https://en.wikipedia.org``.
    title : str
        Movie title, stored under the ``'Title'`` key of the result.

    Returns
    -------
    dict
        ``{'Title': title, <row header>: [<lines of the row's cell text>], ...}``.
        When the page has no ``infobox vevent`` table, only ``'Title'`` is present.
    """
    final_url = 'https://en.wikipedia.org' + url
    # A timeout keeps one unresponsive page from hanging the whole scrape.
    r = requests.get(final_url, timeout=30)
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = bs(r.content, "html.parser")

    data = {'Title': title}  # collected infobox fields for this movie

    table = soup.find("table", attrs={"class": "infobox vevent"})  # the movie's info table
    if table is None:
        # Previously this printed the whole page and then crashed on table.find_all.
        print(f"No infobox found for {title!r} ({final_url})")
        return data

    for row in table.find_all('tr'):
        header = row.find('th', attrs={"scope": "row"})
        cell = row.find('td')
        # Skip rows that are not a "label / value" pair (images, section headings, ...).
        if header is None or cell is None:
            continue
        key = header.get_text("").strip()
        # Drop non-breaking spaces and split multi-line cells into a list of lines.
        value = cell.get_text().strip().replace('\xa0', '').split('\n')
        data[key] = value
    return data

Below is the code that collects the information tables of all the Walt Disney movies listed on Wikipedia.

In [104]:
# Wikipedia index page that links to every Walt Disney Pictures film.
url = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
r = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the cell
soup = bs(r.content, "html.parser")  # explicit parser: same result on every machine

# The last table lists upcoming Disney movies, which we do not want, so the
# [:-1] slice drops it from the tables we iterate over.
tables = soup.find_all("table", attrs={"class": "wikitable sortable"})[:-1]

# Links that appear in the film tables but should not be scraped.
# NOTE(review): presumably this is a series page rather than a single film - confirm.
SKIPPED_LINKS = {'/wiki/True-Life_Adventures'}

movie_table_list = []  # one dict of infobox data per movie
for table in tables:
    # Film titles are the italicised (<i>) entries of each table.
    for film in table.find_all('i'):
        anchor = film.a
        if anchor is None:
            # Title without a link: there is no page to scrape. (The original
            # `else: link: "No Link"` was a bare annotation and did nothing.)
            continue
        link = anchor.get('href')
        if not link or link in SKIPPED_LINKS:
            continue
        title = anchor.get_text(strip=True)
        movie_table_list.append(get_movie_table(link, title))

Saving the Data

In [105]:
import json
with open('Wiki_Movie.json', mode='w', encoding='utf-8') as wiki_file:
    # ensure_ascii=False writes non-ASCII characters as-is rather than as \u escapes.
    wiki_file.write(json.dumps(movie_table_list, ensure_ascii=False, indent=6))

Using OMDB API to get additional movie data

In [106]:
#Function to load json data
def load_json(title):
    """Read the UTF-8 encoded JSON file at path ``title`` and return the decoded object."""
    with open(title, encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)

In [107]:
# Reload the scraped Wikipedia records from 'Wiki_Movie.json' so that the
# OMDb lookups further down work from the on-disk copy of the data.
mov_info = load_json('Wiki_Movie.json')

In [108]:
import urllib

#Function to access OMDB to get movie info
def get_scores(title):
    """Look up a movie by title on the OMDb API and return the decoded JSON reply.

    Parameters
    ----------
    title : str
        Movie title, sent as the ``t`` query parameter.

    Returns
    -------
    dict
        The JSON body of the OMDb response, as parsed by ``Response.json()``.
    """
    # HTTPS so the API key is not sent over the network in clear text.
    base_url = 'https://www.omdbapi.com/'
    # NOTE(security): prefer supplying the key via the OMDB_API_KEY environment
    # variable. The literal fallback only keeps existing runs working; it
    # should be rotated and removed from the notebook.
    api_key = os.environ.get('OMDB_API_KEY', '2c848dbe')
    parameters = {'apikey': api_key, 't': title}  # api-key and movie title
    # requests URL-encodes `params` itself; the timeout avoids hanging forever.
    response = requests.get(base_url, params=parameters, timeout=30)
    return response.json()

#Function to get rotten tomatoes info from a nested dictionary.
def get_rts(temp):
    """Return the Rotten Tomatoes value from a movie-info dictionary.

    ``temp`` is searched under its ``"Ratings"`` key (a list of dictionaries
    with ``"Source"`` and ``"Value"`` entries). The value of the first entry
    whose source is ``"Rotten Tomatoes"`` is returned; ``None`` when there is
    no such entry or no ``"Ratings"`` key at all.
    """
    matches = (
        rating["Value"]
        for rating in temp.get("Ratings", [])
        if rating["Source"] == "Rotten Tomatoes"
    )
    return next(matches, None)

In [109]:
#Iterates through every movie to pull info if it exists. If not, then None.
for movie in mov_info:
    # One OMDb lookup per movie, keyed by its Wikipedia title.
    omdb = get_scores(movie['Title'])
    # Copy the fields of interest onto the movie record; a field missing from
    # the reply is stored as None.
    movie.update({
        'Metascore': omdb.get('Metascore'),
        'imdbRating': omdb.get('imdbRating'),
        'Rotten_Tomatoes': get_rts(omdb),
        'Genre': omdb.get('Genre'),
        'Rated': omdb.get('Rated'),
        'BoxOffice_Api': omdb.get('BoxOffice'),
        'Type': omdb.get('Type'),
    })

Save final data to a JSON file

In [110]:
import json
with open('Final_Data.json', mode='w', encoding='utf-8') as final_file:
    # Serialise first, then write: same text json.dump would stream to the file.
    serialized = json.dumps(mov_info, ensure_ascii=False, indent=6)
    final_file.write(serialized)

Create data frame and save to CSV file

In [111]:
# Build the table from the list of per-movie dictionaries held in mov_info.
df = pd.DataFrame(mov_info)

In [113]:
# Write the table to 'Final_Data.csv'; index=False leaves out the row index column.
df.to_csv('Final_Data.csv', index = False)

In [114]:
# Display the combined table as the cell's output.
df

Unnamed: 0,Title,Productioncompany,Release date,Running time,Country,Language,Box office,Metascore,imdbRating,Rotten_Tomatoes,...,Executive producer,Producers,Editors,Distributor,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review of Walt Disney Cartoons,[Walt Disney Productions],"[May19,1937(1937-05-19)]",[41 minutes (74 minutes 1966 release)],[United States],[English],[$45.472],,7.1,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,[Walt Disney Productions],"[December21,1937(1937-12-21) (Carthay Circle T...",[83 minutes],[United States],[English],[$418 million[2]],95,7.6,,...,,,,,,,,,,
2,Pinocchio,[Walt Disney Productions],"[February7,1940(1940-02-07) (Center Theatre)[1...",[88 minutes],[United States],[English],[$164million],99,7.4,,...,,,,,,,,,,
3,Fantasia,[Walt Disney Productions],"[November13,1940(1940-11-13)]",[126 minutes[1]],[United States],[English],[$76.4–$83.3 million (United States and Canada...,,,,...,,,,,,,,,,
4,The Reluctant Dragon,[Walt Disney Productions],"[June27,1941(1941-06-27)[1]]",[74 minutes],[United States],[English],"[$960,000 (worldwide rentals) [3]]",,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,Flora & Ulysses,,"[February19,2021(2021-02-19)]",[95 minutes],[United States],[English],,,,,...,,,,,,,,,,
438,Raya and the Last Dragon,,"[March5,2021(2021-03-05) (United States)]",[107 minutes[5]],[United States],[English],[$122.7 million],,,,...,,,,,,,,,,
439,Cruella,,"[May18,2021(2021-05-18) (El Capitan Theatre), ...",[134 minutes[4]],[United States],[English],[$226.3 million[8][9]],59,7.4,74%,...,,,,,,,,,,
440,Luca,,"[June13,2021(2021-06-13) (Aquarium of Genoa), ...",[95 minutes],[United States],[English],[$27.8 million[1][2]],71,7.5,91%,...,,,,,,,,,,
