In [1]:
from bs4 import BeautifulSoup as bs
import requests

## Task 1: Getting all disney movies 

### Grabbing the wikipedia page with the movies 

In [2]:
url = 'https://en.wikipedia.org'
r = requests.get(url +'/wiki/List_of_Walt_Disney_Pictures_films')
movies_webpage = bs(r.content)

### Grabbing the Movies from the wiki page

In [3]:
movies = movies_webpage.select('.wikitable.sortable i a')
movies[0]

<a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a>

### Functions to grab the informations

In [4]:
# for div in soup3.find_all("div", attrs={"class" : "story-body story-content"}):
#     text_list = [text for text in div.stripped_strings]


def get_content_values(row_data):
    if row_data.find('li'):
        return [li.get_text(' ', strip=True).replace(u'\xa0', u' ') for li in row_data.find_all('li')]
    elif row_data.find('br'):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(' ', strip=True).replace(u'\xa0', u' ')
    
def clean_tags(page):
    for tag in page.find_all(['sup', 'span']):
        tag.decompose()

def get_info_box(url):
    r = requests.get(url)
    movie_page = bs(r.content)
    info_box = movie_page.find(class_='infobox vevent')
    info_rows = info_box.find_all('tr')
    clean_tags(movie_page)
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['Title'] = row.th.get_text(' ', strip=True).strip()
        else:
            header = row.find('th')
            content = row.find('td')
            if header and content:
                content_key = row.th.get_text(' ', strip=True)
                content_value = get_content_values(row.td)
                movie_info[content_key] = content_value
    return movie_info

In [5]:
try:
    get_info_box('https://en.m.wikipedia.org/wiki/The_Omega_Connection')
except Exception as e:
    print(e)

### Grabbing the dictionaries of informations and putting them into a list

In [6]:
movie_info_list = []
for index, movie in enumerate(movies):
    if index % 15 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = url + relative_path
        title = movie['title']
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        print(movie.get_text())
        print(e)

0
15
30
45
60
75
90
105
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
135
150
165
180
195
210
225
240
255
270
285
300
315
330
345
360
375
390
405
420
435
450
Better Nate Than Never
'NoneType' object has no attribute 'find_all'


In [26]:
print(movie_info_list[8])

{'Title': 'Victory Through Air Power', 'Directed by': ['Animated sequences:', 'James Algar', 'Clyde Geronimi', 'Jack Kinney', 'de Seversky scenes:', 'H.C. Potter'], 'Written by': ['Story direction:', 'Perce Pearce', 'Story adaptation:', 'T. Hee', 'Erdman Penner', 'Bill Cottrell', 'James Brodero', 'George Stallings', 'Jose Rodriguez'], 'Based on': 'Victory Through Air Power by Maj. Alexander P. Seversky', 'Produced by': 'Walt Disney', 'Starring': 'Alexander de Seversky', 'Narrated by': 'Art Baker', 'Cinematography': 'Ray Rennahan', 'Edited by': 'Jack Dennis', 'Music by': ['Edward H. Plumb', 'Paul J. Smith', 'Oliver Wallace'], 'Production company': 'Walt Disney Productions', 'Distributed by': 'United Artists', 'Release date': ['July 17, 1943'], 'Running time': '70 min', 'Country': 'United States', 'Language': 'English', 'Budget': '$788,000', 'Box office': '$799,000'}


####  Saving/Reload the data

In [17]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [12]:
import json

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [9]:
# save_data('disney_data_cleaned.json', movie_info_list)

## Task 2: Clean our data

In [229]:
movie_info_list = load_data('disney_data_cleaned.json')

##### We have to:
- Clean up references (like [1], [2], etc.) - Done ✔
 - Adding the function clean_tags(page) at 'Functions to grab the informations' and it's used at the function get_info_box(url)
 ------------------------------------------------
- Convert running time into an integer - Done ✔
 - Used RegExp to do it
 ------------------------------------------------
- Convert dates to a datetime object
------------------------------------------------
- Split up the long strings - Done ✔
 - Adding the elif row_data.find('br') at function get_content_values(row_data)
------------------------------------------------
- Convert Budget and Box Office to numbers - Done ✔

### Convert running time into an integer

In [230]:
import re

pattern = r'(?:\d{1,3}){1}'

for index, movie in enumerate(movie_info_list):
    if 'Running time' in movie:
        if isinstance(movie['Running time'], list):
            running_time = movie['Running time'][0]
            time_min = int(re.findall(pattern, running_time)[0])
        else:
            time_min = int(re.findall(pattern, movie['Running time'])[0])
        movie['Running time (min)'] = time_min

In [231]:
for index, movie in enumerate(movie_info_list):
    print(movie.get('Running time (min)', None))
    if index >= 10:
        break

41
83
88
126
74
64
70
42
70
71
75


### Convert Budget and Box Office to numbers

In [232]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"
standard = fr"\${number}(-|\sto\s|–)?({number})?\s({amounts})"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict.get(word.lower(), 1)

def parse_word_syntax(string):
    stripped_string = string.replace(",", "")
    value = float(re.search(number, stripped_string).group())
    modifier = word_to_value(re.search(amounts, string, flags=re.I).group())
    return value*modifier

def parse_value_syntax(string):
    stripped_string = string.replace(",", "")
    return float(re.search(number, stripped_string).group())

def money_conversion(money):
    if type(money) == list:
        money = money[0]

    word_syntax = re.search(standard, money, flags=re.I)
    value_syntax = re.search(fr"\${number}", money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    else:
        return None

In [233]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', 'N/A'))

### Convert dates to a datetime object

In [291]:
[movie['Release date'] for index, movie in enumerate(movie_info_list) if 'Release date' in movie.keys() and index <=10]

[['May 19, 1937'],
 ['December 21, 1937 ( Carthay Circle Theatre , Los Angeles , CA , premiere)'],
 ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'],
 ['November 13, 1940'],
 ['June 27, 1941'],
 ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'],
 ['August 9, 1942 (World Premiere – London)',
  'August 13, 1942 (Premiere – New York City)',
  'August 21, 1942 (U.S.)'],
 ['August 24, 1942 (World Premiere – Rio de Janeiro)',
  'February 6, 1943 (U.S. Premiere – Boston)',
  'February 19, 1943 (U.S.)'],
 ['July 17, 1943'],
 ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)'],
 ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)']]

In [300]:
from datetime import datetime

def clean_date(date):
    return date.split('(')[0].strip()

def date_conversion(date):
   
    if date == 'N/A':
        return None
    
    elif isinstance(date, list):
        date = date[0]
        
    date_str = clean_date(date)
    fmts = ['%B %d, %Y', '%d %B %Y']
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None  

In [301]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [7]:
movie_info_list[5]

{'Title': 'Dumbo',
 'Directed by': ['Ben Sharpsteen',
  '(supervising director)',
  'Norman Ferguson',
  'Wilfred Jackson',
  'Bill Roberts',
  'Jack Kinney',
  'Samuel Armstrong'],
 'Story by': ['Otto Englander', 'Joe Grant', 'Dick Huemer'],
 'Based on': ['Dumbo, the Flying Elephant',
  'by',
  'Helen Aberson',
  'Harold Pearl'],
 'Produced by': 'Walt Disney',
 'Starring': ['Edward Brophy',
  'Verna Felton',
  'Cliff Edwards',
  'Herman Bing',
  'Sterling Holloway',
  'Margaret Wright',
  'Hall Johnson Choir'],
 'Narrated by': 'John McLeish',
 'Music by': ['Frank Churchill', 'Oliver Wallace'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['October 23, 1941 (New York City)',
  'October 31, 1941 (U.S.)'],
 'Running time': '64 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$950,000',
 'Box office': '$1.3 million (est. United States/Canada rentals, 1941)',
 'Running time (min)': 64,
 'Budget (float)

### Saving the data with pickle

In [10]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [1]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [317]:
# save_data_pickle('disney_data_cleaned.pickle', movie_info_list)

## Task 3: Attach IMDB/Rotten Tomatoes/Metascores scores

In [2]:
movie_info_list = load_data_pickle('disney_data_cleaned.pickle')

In [3]:
movie_info_list[1]['Title']

'Snow White and the Seven Dwarfs'

### Functions to grab the scores

In [4]:
from bs4 import BeautifulSoup as bs
import requests, json, re, os, urllib

# print(os.environ['OMDB_API_KEY'])

def get_omdb_info(title):
    base_url = 'http://www.omdbapi.com/?'
    parameter = {'apikey': os.environ['OMDB_API_KEY'], 't':title}
    params_encoded = urllib.parse.urlencode(parameter)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomatoes(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

dumbo_info = get_omdb_info('dumbo')
dumbo_info

{'Title': 'Dumbo',
 'Year': '1941',
 'Rated': 'G',
 'Released': '31 Oct 1941',
 'Runtime': '64 min',
 'Genre': 'Animation, Adventure, Drama',
 'Director': 'Samuel Armstrong, Norman Ferguson, Wilfred Jackson',
 'Writer': 'Joe Grant, Dick Huemer, Otto Englander',
 'Actors': 'Sterling Holloway, Edward Brophy, Herman Bing',
 'Plot': 'Ridiculed because of his enormous ears, a young circus elephant is assisted by a mouse to achieve his full potential.',
 'Language': 'English',
 'Country': 'United States',
 'Awards': 'Won 1 Oscar. 5 wins & 1 nomination total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BNWVmNWQ2OTQtNzJlNC00ZmQwLTg4ZTktZTNmM2IxZTlkOGM3L2ltYWdlXkEyXkFqcGdeQXVyNTAyODkwOQ@@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '7.2/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '98%'},
  {'Source': 'Metacritic', 'Value': '96/100'}],
 'Metascore': '96',
 'imdbRating': '7.2',
 'imdbVotes': '122,914',
 'imdbID': 'tt0033563',
 'Type': 'movie',
 'DVD': '1

In [5]:
for index, movie in enumerate(movie_info_list):
    if index % 10 == 0:
        print(index)
    title = movie['Title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['Metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomatoes(omdb_info)
    

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450


In [23]:
save_data_pickle('disney_final_data.pickle', movie_info_list)

## Task 4:  Saving data as JSON & CSV

### Saving as JSON

In [14]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [22]:
print(movie_info_copy[10]['Release date (datetime)'])
print(movie_info_list[10]['Release date (datetime)'])

April 20, 1946
1946-04-20 00:00:00


In [24]:
save_data('disney_final_data.json', movie_info_copy)

### Saving as CSV

In [27]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [32]:
df.head()

Unnamed: 0,Title,Production company,Release date,Running time,Country,Language,Box office,Running time (min),Budget (float),Box office (float),...,Executive producer,Producers,Editors,Distributor,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,83300000.0,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,,,,,,,,,,


In [33]:
df.to_csv('disney_final_data.csv')