# Data Scraping and Preprocessing
- Reference: 

### Import necessary libraries

In [61]:
from bs4 import BeautifulSoup as bs
from datetime import datetime
import requests
import json
import re
import pandas as pd

### Scrape the infomation of a single movie on wikipedia

- Webpage: https://en.wikipedia.org/wiki/Toy_Story_3

In [2]:
response = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')
webpage = bs(response.content, 'html.parser')
contents = webpage.prettify()
# print(contents)

In [3]:
def get_value(td):
    if td.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in td.find_all('li')]
    elif td.select('a ~ br'):
        pattern = r'[,():$]'
        remove_tags(td, 'b')
        return [text for text in td.stripped_strings if not re.search(pattern, text)]
    else:
        return td.get_text(' ', strip=True).replace('\xa0', ' ')
    
def remove_tags(soup, tags):
    tags = soup.find_all(tags)
    for tag in tags:
        tag.decompose()

# Scrape the infobox of the movie wikipedia page and save it into a dictionary
movie_info = {} 
remove_tags(webpage, ['sup', 'span'])
infobox = webpage.find('table', class_='infobox vevent')
infobox_tr = infobox.find_all('tr')

for idx, row in enumerate(infobox_tr):
    if idx == 0:
        movie_info['Title'] = row.find('th').get_text(' ', strip=True)
    else:
        if row.find('th'): 
            key = row.find('th').get_text(' ', strip=True)
            value = get_value(row.find('td'))
            movie_info[key] = value
        
movie_info

{'Title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['June 12, 2010 ( Taormina Film Fest )',
  'June 18, 2010 (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.067 billion'}

### Scrape the information of all movies on wikipedia

- Webpage: https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films

In [4]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
webpage = bs(response.content, 'html.parser')
# Do not include tables 'Upcoming' and 'Undated', since those tables contain a lot of inaccurate data 
tables = webpage.select('table.wikitable.sortable')[:-2]
movie_list = []
for table in tables:
    movies = table.select('i > a')
    for movie in movies:
        movie_list.append(movie)

In [5]:
# previously defined function above
def get_value(td):
    if td.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in td.find_all('li')]
    elif td.select('a ~ br'):
        pattern = r'[,():$]'
        remove_tags(td, 'b')
        return [text for text in td.stripped_strings if not re.search(pattern, text)]
    else:
        return td.get_text(' ', strip=True).replace('\xa0', ' ')

# Clean up task 1. remove references (remove 'sup', 'span' html tags)
def remove_tags(soup, tags):
    tags = soup.find_all(tags)
    for tag in tags:
        tag.decompose()
    
# get movie information from infobox inside each wikipedia disney movie page
def get_movie_info(url):
    response = requests.get(url)
    webpage = bs(response.content, 'html.parser')
    remove_tags(webpage,  ['sup', 'span'])
    movie_info = {} 
    infobox = webpage.find('table', class_='infobox vevent')
    infobox_tr = infobox.find_all('tr')
    for idx, row in enumerate(infobox_tr):
        if idx == 0:
            movie_info['Title'] = row.find('th').get_text(' ', strip=True)
        else:
            if row.find('th'): 
                key = row.find('th').get_text(' ', strip=True)
                value = get_value(row.find('td'))
                movie_info[key] = value

    return movie_info

In [6]:
disney_movie_data = []

for idx, movie in enumerate(movie_list):
    if idx % 10 == 0:
        print(idx)
    try:
        movie_href = movie['href']
        wiki_url = 'https://en.wikipedia.org' + movie_href
        disney_movie_data.append(get_movie_info(wiki_url))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
490


### Save / Reload movie data

In [2]:
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [3]:
def reload_data(title):
    with open(title, 'r', encoding='utf-8') as f:
        return json.load(f)

In [9]:
save_data('disney_movie_data.json', disney_movie_data)

In [45]:
disney_movies = reload_data('disney_movie_data.json')
disney_movies[0]

{'Title': 'Academy Award Review of',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'United Artists',
 'Release date': ['May 19, 1937'],
 'Running time': '41 minutes (74 minutes 1966 release)',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$45.472'}

### Convert string value 'Running time' to integer

In [22]:
def convert_time(string):
    if isinstance(string, list):
        return int(string[0].split(' ')[0])
    elif string == 'N/A':
        return None
    else:
        return int(string.split(' ')[0])

for movie in disney_movies:
    movie['Running time (int)'] = convert_time(movie.get('Running time', 'N/A'))

In [23]:
disney_movies[50]

{'Title': 'One Hundred and One Dalmatians',
 'Directed by': ['Wolfgang Reitherman', 'Clyde Geronimi', 'Hamilton Luske'],
 'Story by': 'Bill Peet',
 'Based on': 'The Hundred and One Dalmatians by Dodie Smith',
 'Produced by': 'Walt Disney',
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0}

### Convert string value 'Budget & Box office' to integer

In [9]:
amount = r'(thousand|million|billion)'
# money = r'\d*(\.\d*)*(,\d*)*(\.\d*)*'
money = r'\d+(,\d{3})*\.*\d*'
word = rf'\${money}(-|\sto\s)?({money})?\s{amount}'
number = rf'\${money}'

def word_to_value(word):
    number_dict = {'thousand':1000, 'million': 1000000, 'billion': 1000000000}
    return number_dict[word]

def parse_number(string):
    value = re.search(number, string).group().replace(',', '').replace('$', '')
    return float(value)

def parse_word(string):
    value = re.search(number, string).group().replace(',', '').replace('$', '')
    word = word_to_value(re.search(amount, string).group())
    return float(value) * word

def convert_money(string):
    if string == 'N/A':
        return None
    if isinstance(string, list):
        string = string[0]
    if re.search(word, string):
        return parse_word(string)
    elif re.search(number, string):
        return parse_number(string)
    else:
        return None
    
for movie in disney_movies:
    movie['Budget (float)'] = convert_money(movie.get('Budget', 'N/A'))
    movie['Box office (float)'] = convert_money(movie.get('Box office', 'N/A'))

In [13]:
disney_movies[50]

{'Title': 'One Hundred and One Dalmatians',
 'Directed by': ['Wolfgang Reitherman', 'Clyde Geronimi', 'Hamilton Luske'],
 'Story by': 'Bill Peet',
 'Based on': 'The Hundred and One Dalmatians by Dodie Smith',
 'Produced by': 'Walt Disney',
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0}

### Convert 'Release date' & 'Release dates' to datatime object

In [57]:
def remove_parenthesis(string):
    return string.split("(")[0].strip()

def convert_date(string):
    if isinstance(string, list):
        string = string[0]
    if string == 'N/A':
        return None
    date = remove_parenthesis(string)
    formats = ['%B %d, %Y', '%d %B, %Y']
    for fmt in formats:
        try:
            return datetime.strptime(date, fmt)
        except:
            pass
    return None
    
# for the key 'Release date'    
for movie in disney_movies:
    movie['Release date (datetime)'] = convert_date(movie.get('Release date', 'N/A'))
    
# for the key 'Release dates'
for movie in disney_movies:
    if movie['Release date (datetime)']:
        pass
    else:
        movie['Release date (datetime)'] = convert_date(movie.get('Release dates', 'N/A'))

In [62]:
disney_movies[50]

{'Title': 'One Hundred and One Dalmatians',
 'Directed by': ['Wolfgang Reitherman', 'Clyde Geronimi', 'Hamilton Luske'],
 'Story by': 'Bill Peet',
 'Based on': 'The Hundred and One Dalmatians by Dodie Smith',
 'Produced by': 'Walt Disney',
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0)}

### Save data into a .csv format

In [64]:
df = pd.DataFrame(disney_movies)
df.to_csv('disney_movie_data.csv')