# Disney Database Creation & Analysis

## Imports Libraries

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import requests
import re
import json
import pickle
import urllib

from datetime import datetime
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup as bs

## Tasks

### Task #1 - Get the info box
Get the information on the info box and save it in the Python dictionary.

In [2]:
def clean_soup(soup):
    ''' Cleans up the soup, removes some tags, refrences, ... '''
    
    for tag in soup.find_all(['span', 'sup']):
        tag.decompose()
        

def clean_value(key, value):
    ''' Cleans the value '''

    if key != 'Release date':
        value = re.sub(r'\([^)]*\)', '', value)
        
    return value.strip().replace('\xa0', ' ').replace('\n', ',')


def get_movie_info(url):
    ''' Scraps the movie information from Wikipedia '''
    
    # Get the webpage content
    content = requests.get(url).content
    
    # Convert the content into BeautifulSoup object
    soup = bs(content)
    
    # Cleans up the Beautiful soup
    clean_soup(soup)
    
    # Search for the info box
    info_box = soup.find(class_ = 'infobox vevent')
    
    # Get table rows
    info_rows = info_box.find_all('tr')
    
    movie = {}    
    for index, row in enumerate(info_rows):
        
        # Movie title
        if index == 0: movie['title'] = row.find('th').get_text()
        # Movie cover
        elif index == 1:
            movie['Cover Url'] = 'https:' + row.find('img')['src']
        # Other movie properties
        elif index != 1:
            
            # Get key
            key = row.find('th').get_text(' ', strip = True)
            
            # Get raw value
            raw_value = row.find('td').get_text()
            
            # Processed value
            processed_value = clean_value(key, raw_value)
            
            # Add to the dictionary
            movie[key] = processed_value

    # Add movie URL
    movie['url'] = url
    
    return movie


# get_movie_info('https://en.wikipedia.org/wiki/Cinderella_(1950_film)')

### Task #2 - Scape all info boxes on for Disney

In [3]:
def get_disney_movies(url):
    ''' Scrapes through all links on Disney Wikipedia page '''
    
    # Get the webpage content
    content = requests.get(url).content
    
    # Convert the content into BeautifulSoup object
    soup = bs(content)
    
    movies = soup.select('.wikitable.sortable i a')
    
    movie_info_list = []
    for index, movie in enumerate(movies):
        
        if index % 10 == 0:
            print('Scraping... ({0} / {1})'.format(index, len(movies)))

        # Create movie URL and Title
        movie_url = 'https://en.wikipedia.org/' + movie['href']
        movie_title = movie.get_text()

        try:
            # Append movie info to the list
            movie_info_list.append(get_movie_info(movie_url))

        except Exception as ex:
            # Print exception trace
            print('ERROR', movie_title, movie_url, ex, sep = ' - ')

    print('Done.')
    return movie_info_list

movie_info_list = get_disney_movies('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
# save_data(movie_info_list, name = 'raw_disney_movies')

Scraping... (0 / 450)
Scraping... (10 / 450)
Scraping... (20 / 450)
Scraping... (30 / 450)
ERROR - Johnny Tremain - https://en.wikipedia.org//wiki/Johnny_Tremain_(film) - 'NoneType' object has no attribute 'find_all'
Scraping... (40 / 450)
ERROR - Zorro the Avenger - https://en.wikipedia.org//wiki/Zorro_(1957_TV_series)#Theatrical - 'NoneType' object has no attribute 'get_text'
ERROR - The Sign of Zorro - https://en.wikipedia.org//wiki/Zorro_(1957_TV_series)#Theatrical - 'NoneType' object has no attribute 'get_text'
Scraping... (50 / 450)
Scraping... (60 / 450)
Scraping... (70 / 450)
Scraping... (80 / 450)
Scraping... (90 / 450)
Scraping... (100 / 450)
Scraping... (110 / 450)
ERROR - One Little Indian - https://en.wikipedia.org//wiki/One_Little_Indian_(film) - 'NoneType' object has no attribute 'get_text'
Scraping... (120 / 450)
ERROR - The Best of Walt Disney's True-Life Adventures - https://en.wikipedia.org//wiki/The_Best_of_Walt_Disney%27s_True-Life_Adventures - 'NoneType' object is

NameError: name 'save_data' is not defined

#### Save Data

##### As JSON

In [4]:
def save_data(data, name = 'raw_disney_movies'):
    ''' Saves the data into .json format '''
    
    with open('datasets/' + name + '.json', 'w', encoding = 'utf-8') as file:
        json.dump(data, file, ensure_ascii = False, indent = 2)

##### As Pickle

In [5]:
def save_pickle(data, name = 'cleaned_pickle_disney_movies'):
    ''' Saves the data info pickle object '''
    
    with open('datasets/' + name + '.pickle', 'wb') as file:
        pickle.dump(data, file)

##### As CSV

In [10]:
def save_csv(data, name = 'csv_pickle_disney_movies'):
    df = pd.DataFrame(data)
    df.to_csv('datasets/' + name + '.csv')

#### Load Data

##### As JSON

In [7]:
def load_data(name = 'raw_disney_movies'):
    ''' Reads the saved .json formatted file '''

    with open('datasets/' + name + '.json', 'r', encoding = 'utf-8') as file:
        return json.load(file)

##### As Pickle

In [3]:
def load_pickle(name = 'cleaned_pickle_disney_movies'):
    ''' Loads the data from a pickle object '''
    
    with open('datasets/' + name + '.pickle', 'rb') as file:
        return pickle.load(file)

##### As CSV

In [11]:
def load_csv(name = 'csv_pickle_disney_movies'):
    return pd.read_csv('datasets/' + name + '.csv')

### Task #3 - Cleaning the Data

In [11]:
save_data(movie_info_list, name = 'raw_disney_movies')
movie_info_list = load_data()

#### Fix Running Time

In [12]:
def minute_to_integer(running_time):
    ''' Converts the running time to integer format '''

    if isinstance(running_time, list):
        return running_time[0].split(' ')[0]
    else:
        return running_time.split(' ')[0]

In [13]:
# Convert running time to integer format

for movie in movie_info_list:
    movie['Running time'] = minute_to_integer(movie.get('Running time', 'N/A'))

#### Fix Budgets and Box Office

In [14]:
def budget_to_integer(money):
    ''' Converts budget into integer format '''
    
    if money == 'N/A': return money
    
    # Handle lists
    if isinstance(money, list): money = money[0]
    
    # Mapper
    mapper = {
        'billion': 10 ** 9,
        'million': 10 ** 6,
        'thousand': 10 ** 3,
        'default': 1,
    }
    
    # Patterns
    value_pattern = r'\d+(,\d{3})*\.*\d*'
    quantity_pattern = r'thousand|million|billion'
    
    value_re = rf'\$({value_pattern})'
    quantity_re = rf'\${value_pattern}-?{value_pattern}?\s*({quantity_pattern})?'
    
    # Get the value and quantity
    try: value = float(re.search(value_re, money).group(1).replace(',', ''))
    except: return 'N/A'
    
    try:
        quantity = re.search(quantity_re, money).group(3)
        
        # Raise exception if quantity is not found
        if quantity is None: raise Exception()
    except: quantity = 'default'
    
    return str(value * mapper[quantity])

In [15]:
def clean_null(value):
    ''' Returns N/A if value is not set '''
    return 'N/A' if value in ['', 'unknown'] else value


for index, movie in enumerate(movie_info_list):
    
    box_office = clean_null(movie.get('Box office', 'N/A'))
    budget = clean_null(movie.get('Budget', 'N/A'))
    
    if box_office == '': box_office = 'N/A'
    if budget == '': budget = 'N/A'
    
    movie['Box office'] = budget_to_integer(box_office)
    movie['Budget'] = budget_to_integer(budget)

### Task #4 - Convert dates into date object

In [16]:
def get_date(date, pattern = r'([A-Za-z]+)\s(\d+),\s(\d+)'):
    ''' Extracts the date from the string '''
    
    # Error handling
    if date == 'N/A': return date
    return re.search(pattern, date).groups()


def create_date(date_str):
    ''' Creates a date object '''

    return datetime.strptime(date_str, '%B %d %Y')


for index, movie in enumerate(movie_info_list):

    # Try with the default pattern
    try:
        date = get_date(movie.get('Release date', 'N/A'))
    except:
        # Try with alternative pattern
        try:
            date = get_date(movie.get('Release date', 'N/A'), pattern = r'^(\d+)\s([A-Za-z]+)\s(\d+)')
            date = (date[1], date[0], date[2])
        except: continue
    
    if date != 'N/A':
        date = create_date(' '.join(date))
#     print(date)
    
    movie['Release date'] = date
    
# Save as Pickle (Not JSON!!!)
save_pickle(movie_info_list)

### Attach IMDB/Rotten Tomatoes/Metascore Scores and other Stuff

In [6]:
movie_info_list = load_pickle()

In [7]:
OMDB_API_base_url = 'http://www.omdbapi.com/?'

def get_OMDB_movie_info(title):
    ''' Uses the API to capture movie information '''
    
    # Parameters
    parameters = { 'apikey': '82c41530', 't': title}
    encoded_parameters = urllib.parse.urlencode(parameters)

    # Return request results
    return requests.get(OMDB_API_base_url + encoded_parameters).json()


def get_rotten_tomatoes_score(omdb_info):
    
    ratings = omdb_info.get('Ratings', [])
    
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    
    # Just in case no score is present
    return 'N/A'

In [8]:
for index, movie in enumerate(movie_info_list):
    
    if index % 10 == 0:
        print('Getting API info... ({0}/{1})'.format(index, len(movie_info_list)))
        
    try:
        # Get OMDB info by title
        omdb_info = get_OMDB_movie_info(movie['title'])
    
        # Get scores
        movie['Rotten Tomatoes'] = get_rotten_tomatoes_score(omdb_info)
        movie['IMDB'] = omdb_info.get('imdbRating', 'N/A')    
        movie['Metascore'] = omdb_info.get('Metascore', 'N/A')

        # Other info
        movie['Rated'] = omdb_info.get('Rated', 'N/A')
        movie['Language'] = omdb_info.get('Language', 'N/A')
        movie['Awards'] = omdb_info.get('Awards', 'N/A')
        movie['Poster'] = omdb_info.get('Poster', 'N/A')
        movie['Genre'] = omdb_info.get('Genre', 'N/A')
        movie['Plot'] = omdb_info.get('Plot', 'N/A')    
    except:
        print('Error on {0}'.format(index))

Getting API info... (0/433)
Getting API info... (10/433)
Getting API info... (20/433)
Getting API info... (30/433)
Getting API info... (40/433)
Getting API info... (50/433)
Getting API info... (60/433)
Getting API info... (70/433)
Getting API info... (80/433)
Getting API info... (90/433)
Getting API info... (100/433)
Getting API info... (110/433)
Getting API info... (120/433)
Getting API info... (130/433)
Getting API info... (140/433)
Getting API info... (150/433)
Getting API info... (160/433)
Getting API info... (170/433)
Getting API info... (180/433)
Getting API info... (190/433)
Getting API info... (200/433)
Getting API info... (210/433)
Getting API info... (220/433)
Getting API info... (230/433)
Getting API info... (240/433)
Getting API info... (250/433)
Getting API info... (260/433)
Getting API info... (270/433)
Getting API info... (280/433)
Getting API info... (290/433)
Getting API info... (300/433)
Getting API info... (310/433)
Getting API info... (320/433)
Getting API info... (

In [12]:
save_csv(movie_info_list)
# movie_info_list = load_csv()