### Disney Dataset Creation (w/ Python BeautifulSoup)
Scrape & clean a list of disney wikipedia pages to create a dataset to further analyze

#### Task #1: Get Info Box (store in Python dictionary)

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import json
import pickle


### Load the webpage

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()
#print(contents)

In [3]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")
# for row in info_rows:
#     print(row.prettify())


In [4]:
def get_content_value(row_data):
    if row_data.find("li"):  # if row_data has a list we need to return it to the text word
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
    
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

 #### Task #2: Get info box for all movies

In [5]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Convert to a beautiful soup object
soup = bs(r.content)


# Print out the HTML
contents = soup.prettify()
#print(contents)


In [6]:
movies = soup.select(".wikitable.sortable i")
movies[0]

<i><a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a></i>

In [7]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()
        
def get_info_box(url):

    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info    

In [8]:
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

{'title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Written by': 'Harry Spalding',
 'Produced by': 'Winston Hibler',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Music by': 'Jerry Goldsmith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

In [9]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
The London Connection
'NoneType' object has no attribute 'find'
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
Better Nate Than Never
'NoneType' object has no attribute 'find_all'


In [10]:
movie_info_list[0]

{'title': 'Academy Award Review of',
 'Production company': 'Walt Disney Productions',
 'Release date': ['May 19, 1937'],
 'Running time': '41 minutes (74 minutes 1966 release)',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$45.472'}

#### Save/Reload Movie Data

In [11]:
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [12]:
def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [13]:
save_data("disney_data_cleaned.json", movie_info_list)

### Task #3: Clean our data!

In [14]:
movie_info_list[-40]

{'title': 'Mary Poppins Returns',
 'Directed by': 'Rob Marshall',
 'Screenplay by': 'David Magee',
 'Story by': ['David Magee', 'Rob Marshall', 'John DeLuca'],
 'Based on': ['Mary Poppins', 'by', 'P. L. Travers'],
 'Produced by': ['Rob Marshall', 'John DeLuca', 'Marc Platt'],
 'Starring': ['Emily Blunt',
  'Lin-Manuel Miranda',
  'Ben Whishaw',
  'Emily Mortimer',
  'Julie Walters',
  'Colin Firth',
  'Meryl Streep'],
 'Cinematography': 'Dion Beebe',
 'Edited by': 'Wyatt Smith',
 'Music by': 'Marc Shaiman',
 'Production companies': ['Walt Disney Pictures',
  'Lucamar Productions',
  'Marc Platt Productions'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['November 29, 2018 ( Dolby Theatre )',
  'December 19, 2018 (United States)'],
 'Running time': '131 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$130 million',
 'Box office': '$349.5 million'}

In [15]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

['41 minutes (74 minutes 1966 release)', '83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '70 min', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS version)', '71 minutes (original)'], '127 minutes', '92 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 min.', '80 minutes', '75 minutes', '83 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '131 minutes', '79 minutes', '97 minutes', '128 minutes', '74 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 min.', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 Minutes', '110 minutes', '80 m

In [16]:
# "85 minutes"
def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else: # is a string
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', "N/A"))



In [17]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

[41, 83, 88, 126, 74, 64, 70, 42, 70, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 92, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 131, 79, 97, 128, 74, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 92, 131, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, 91, 112, 115, 95, 91, 95, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 112, 84, 98, 97, 114, 96, 100, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 89, 74, 90, 89, 110, 74, 93, 84, 83, 74, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 93, 92, 98, 95, 93, 87, 93, 87, 128, 86, 95, 114, 93, 83, 8

In [18]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '

In [19]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''
def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None

In [20]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [21]:
money_conversion(str(movie_info_list[-40]["Budget"]))

130000000.0

In [22]:
# Convert Dates into datetimes
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

[['May 19, 1937'], ['December 21, 1937 ( Carthay Circle Theatre )'], ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'], ['November 13, 1940'], ['June 27, 1941'], ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'], ['August 9, 1942 (World Premiere – London)', 'August 13, 1942 (Premiere – New York City)', 'August 21, 1942 (U.S.)'], ['August 24, 1942 (World Premiere – Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere – Boston)', 'February 19, 1943 (U.S.)'], ['July 17, 1943'], ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)'], ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)'], ['November 12, 1946 (Premiere: Atlanta, Georgia)', 'November 20, 1946', 'March 30, 1947 (Stanford Theatre, Palo Alto, California)'], ['September 27, 1947'], 'May 27, 1948', ['November 29, 1948 (Chicago, Illinois)', 'January 19, 1949 (Indianapolis, Indiana)'], ['October 5, 1949'], ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)'], ['

In [23]:
movie_info_list[-50]

{'title': 'Pirates of the Caribbean: Dead Men Tell No Tales',
 'Directed by': ['Joachim Rønning', 'Espen Sandberg'],
 'Screenplay by': 'Jeff Nathanson',
 'Story by': ['Jeff Nathanson', 'Terry Rossio'],
 'Based on': ['Characters by Ted Elliott Terry Rossio Stuart Beattie Jay Wolpert',
  'Pirates of the Caribbean by Walt Disney'],
 'Produced by': 'Jerry Bruckheimer',
 'Starring': ['Johnny Depp',
  'Javier Bardem',
  'Geoffrey Rush',
  'Brenton Thwaites',
  'Kaya Scodelario',
  'Kevin McNally'],
 'Cinematography': 'Paul Cameron',
 'Edited by': ['Roger Barton', 'Leigh Folsom Boyd'],
 'Music by': 'Geoff Zanelli',
 'Production companies': ['Walt Disney Pictures', 'Jerry Bruckheimer Films'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['May 11, 2017 ( Shanghai Disney Resort )',
  'May 26, 2017 (United States)'],
 'Running time': '129 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$230–320 million',
 'Box office': '$794.9 million'

In [24]:
# June 28, 1950
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
        
    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None


In [25]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [26]:
movie_info_list[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Produced by': 'Walt Disney',
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0)}

In [27]:
def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [28]:
def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [29]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [30]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [31]:
a == movie_info_list

True

#### Task #4: Attach IMDB/Rotten Tomatoes/Metascore scores

In [32]:
movie_info_list = load_data_pickle('disney_movie_data_cleaned_more.pickle')

In [33]:
movie_info_list[-60]

{'title': 'Alice Through the Looking Glass',
 'Directed by': 'James Bobin',
 'Written by': 'Linda Woolverton',
 'Based on': ['Characters', 'by', 'Lewis Carroll'],
 'Produced by': ['Joe Roth', 'Suzanne Todd', 'Jennifer Todd', 'Tim Burton'],
 'Starring': ['Johnny Depp',
  'Anne Hathaway',
  'Mia Wasikowska',
  'Rhys Ifans',
  'Helena Bonham Carter',
  'Sacha Baron Cohen',
  'Alan Rickman',
  'Stephen Fry',
  'Michael Sheen',
  'Timothy Spall'],
 'Cinematography': 'Stuart Dryburgh',
 'Edited by': 'Andrew Weisblum',
 'Music by': 'Danny Elfman',
 'Production companies': ['Walt Disney Pictures',
  'Roth Films',
  'Team Todd',
  'Tim Burton Productions'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['May 10, 2016 ( London )', 'May 27, 2016 (United States)'],
 'Running time': '114 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$170 million',
 'Box office': '$299.5 million',
 'Running time (int)': 114,
 'Budget (float)': 170000000.

In [34]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey":'d0ba764a' , 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

get_omdb_info("woods")

{'Title': 'The Cabin in the Woods',
 'Year': '2011',
 'Rated': 'R',
 'Released': '13 Apr 2012',
 'Runtime': '95 min',
 'Genre': 'Horror, Mystery, Thriller',
 'Director': 'Drew Goddard',
 'Writer': 'Joss Whedon, Drew Goddard',
 'Actors': 'Kristen Connolly, Chris Hemsworth, Anna Hutchison',
 'Plot': 'Five friends go for a break at a remote cabin, where they get more than they bargained for, discovering the truth behind the cabin in the woods.',
 'Language': 'English, Japanese',
 'Country': 'United States',
 'Awards': '20 wins & 34 nominations',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BNTUxNzYyMjg2N15BMl5BanBnXkFtZTcwMTExNzExNw@@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '7.0/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '92%'},
  {'Source': 'Metacritic', 'Value': '72/100'}],
 'Metascore': '72',
 'imdbRating': '7.0',
 'imdbVotes': '390,696',
 'imdbID': 'tt1259521',
 'Type': 'movie',
 'DVD': '06 May 2014',
 'BoxOffice': '$42,073,277',
 'Product

In [35]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [36]:
movie_info_list[-30]

{'title': 'Togo',
 'Directed by': 'Ericson Core',
 'Written by': 'Tom Flynn',
 'Produced by': 'Kim Zubick',
 'Starring': ['Willem Dafoe'],
 'Cinematography': 'Ericson Core',
 'Edited by': 'Martin Pensa',
 'Music by': 'Mark Isham',
 'Production company': 'Walt Disney Pictures',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['December 20, 2019 (United States)'],
 'Running time': '114 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$40 million',
 'Running time (int)': 114,
 'Budget (float)': 40000000.0,
 'Box office (float)': None,
 'Release date (datetime)': datetime.datetime(2019, 12, 20, 0, 0),
 'imdb': '8.0',
 'metascore': '69',
 'rotten_tomatoes': '92%'}

In [37]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

### Task #5: Save data as JSON & CSV

In [38]:
movie_info_list[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Produced by': 'Walt Disney',
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0),
 'imdb': '7.3',
 'metascore': '83',
 'rotten_tomatoes': '98%'}

In [39]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [40]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [41]:
save_data("disney_data_final.json", movie_info_copy)

#### Convert data to CSV

In [42]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [43]:
df.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Screenplay by,Countries,Production companies,Color process,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre )]",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,83300000.0,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,,,,,,,,,,


In [44]:
df.to_csv("disney_movie_data_final.csv")

In [45]:
running_times = df.sort_values(['Running time (int)'],  ascending=False)
running_times.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Screenplay by,Countries,Production companies,Color process,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
302,Pirates of the Caribbean: At World's End,,"[May 19, 2007 ( Disneyland Resort ), May 25, 2...",167 minutes,United States,English,$960.9 million,167.0,300000000.0,960900000.0,...,,,"[Walt Disney Pictures, Jerry Bruckheimer Films]",,,,,,,
86,The Happiest Millionaire,Walt Disney Productions,"[June 23, 1967, November 30, 1967]","[164 minutes, (, Los Angeles, premiere), 144 m...",United States,English,$5 million (U.S./Canada rentals),164.0,5000000.0,5000000.0,...,A. J. Carothers,,,,,,,,,
402,Jagga Jasoos,,[14 July 2017],162 minutes,India,Hindi,83 crore,162.0,,,...,,,"[Walt Disney Pictures India, Picture Shuru Ent...",,,,,,,
395,Dangal,,"[21 December 2016 (United States), 23 December...",161 minutes,India,Hindi,"[est., (, )]",161.0,,,...,,,"[Aamir Khan Productions, Walt Disney Pictures ...",,,,,,,
426,Hamilton,,"[July 3, 2020]",160 minutes,United States,English,,160.0,12500000.0,,...,,,"[Walt Disney Pictures, 5000 Broadway Productio...",,,,,,,
