In [17]:
from multiprocessing import Pool #witness the power
import wikipedia
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
from fuzzywuzzy import fuzz
from collections import defaultdict
from helper_functions import *

This notebook contains the code that was used to scrape the raw movie data from the Rotten Tomatoes web pages.

Thankfully, Rotten Tomatoes has a consistent and well-structured HTML/CSS codebase. As such, I was able to harness the full power of Beautiful Soup to get everything I needed!

The **key insight** in this notebook is the use of the **multiprocessing library**. This library let me scrape websites in parallel, saving me countless hours!

The code for that function (`witness_the_power`, called below) can be found in `helper_functions.py`. I was incredibly proud that I was able to use it. It is a library I wish to use more often - I hope to learn a lot more about it in future projects.

Towards the end of this notebook, you will see some fragments of code I was using to experiment with collecting movie information from Wikipedia. Most notable are the Wikipedia API and the wptools library (essentially a wrapper around the Wikipedia API).

In the end, I made use of the Wikipedia API to obtain the unique HTML address for each movie - I then reverted back to the bs4 library to scrape the infoboxes. I really wish the Wikipedia API had an `.infobox()` method; it would have greatly simplified my codebase. Then again, if it did have that method, I would have learned a lot less!

In [18]:
def extract_rotton_info_v2(webpage, timeout=30):
    """Scrape one Rotten Tomatoes "top 100" list page and every movie page it links to.

    The first element with class ``table`` on ``webpage`` is read as a flat
    sequence of ``<td>`` cells, four per movie. Rank, Tomatometer score and
    title/link come from the first three cells; the fourth is not used.
    Each movie's own page is then fetched for its audience rating and for the
    critic statistics found in the ``scoreStats`` block.

    Parameters
    ----------
    webpage : str
        URL of the list page to scrape.
    timeout : float, optional
        Seconds ``requests`` waits on the server before abandoning a request
        (default 30). Without it a stalled connection would block the calling
        worker indefinitely.

    Returns
    -------
    dict
        ``{"<title> (film)": [rank, rotton_average_rating,
        rotton_reviews_counted, tomato_percentage, audience_rating]}``.
        Every value is the string scraped from the page. A title that occurs
        more than once on the list page keeps its first entry.

    Raises
    ------
    IndexError, AttributeError
        If a page does not contain the elements this scraper looks for.
    requests.exceptions.RequestException
        On network failure or timeout.

    Notes
    -----
    Relies on a module-level ``base_url`` that is not defined in this
    notebook - presumably provided by ``from helper_functions import *``;
    confirm before moving this function elsewhere.
    """
    cells_per_movie = 4  # the <td> cells are consumed in groups of four
    master_dict = {}

    print("-------------","Processing: ",webpage,"---------------")

    soup = BeautifulSoup(requests.get(webpage, timeout=timeout).text, 'lxml')
    top_100_of_sub_genre = soup.find_all(class_='table')[0].find_all('td')

    # Only complete groups of four cells are processed; a trailing partial
    # group is ignored.
    for row_number in range(len(top_100_of_sub_genre) // cells_per_movie):
        first_cell = row_number * cells_per_movie
        rank_cell, score_cell, title_cell = top_100_of_sub_genre[first_cell:first_cell + 3]

        movie_name = title_cell.text.strip() + " (film)"

        # Skip repeated titles before requesting the movie page, so a
        # duplicate does not cost an extra HTTP round trip.
        if movie_name in master_dict:
            continue

        rank = rank_cell.text.strip()
        tomato_percentage = score_cell.find(class_='tMeterScore').text.strip()

        movie_url = base_url + title_cell.find('a').get('href')
        movie_page = BeautifulSoup(requests.get(movie_url, timeout=timeout).text, 'lxml')

        # audience rating is out of 5
        audience_rating = movie_page.find(class_="audience-info hidden-xs superPageFontColor").text.split()[2]

        # Whitespace-separated tokens of the scoreStats block: token 2 is
        # "<average>/10" and token 5 is the number of reviews counted.
        rotton_info_extraction = movie_page.find("div", {"id": "scoreStats"}).text.split()
        rotton_average_rating = rotton_info_extraction[2].split('/')[0]  # out of 10
        rotton_reviews_counted = rotton_info_extraction[5]

        master_dict[movie_name] = [rank, rotton_average_rating, rotton_reviews_counted, tomato_percentage, audience_rating]

    return master_dict

In [14]:
# def extract_movie_names(array):
#     movie_names = []

#     for index, val in enumerate(array):
#         genre_list = array[index][list(array[index].keys())[0]]
#         for row in genre_list:
#             clean = row[0].split('(')
#             name = clean[0].strip()
#             year = clean[1].strip(')')
#             movie_names.append((name, year))
    
#     return movie_names

In [142]:
# def extract_movie_names(array): #movie names will now be the key
#     movie_names = []

#     for index, val in enumerate(array):
#         genre_list = array[index][list(array[index].keys())[0]]
        
#         for row in genre_list:
#             movie_names.append(row[0])
    
#     return movie_names

In [19]:
# Collect the genre list-page URLs that will be scraped.
# NOTE(review): extract_sub_genre_links and starting_url are not defined in
# this notebook - presumably they come from `from helper_functions import *`; confirm.
genre_urls_to_scrape = extract_sub_genre_links(starting_url)

In [20]:
# Apply extract_rotton_info_v2 to every genre URL through witness_the_power,
# the helper the introduction describes as the multiprocessing-based parallel
# scraper (defined outside this notebook).
all_rotton_data = witness_the_power(extract_rotton_info_v2, genre_urls_to_scrape)

------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_animation_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_art_house__international_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_classics_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_science_fiction__fantasy_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_sports__fitness_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_comedy_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_drama_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/to

In [23]:
# Combine the per-genre results into a single mapping keyed by movie.
# NOTE(review): extract_unique_movies_across_genre is defined outside this
# notebook; presumably it drops movies repeated across genres - confirm.
movie_database = extract_unique_movies_across_genre(all_rotton_data)

In [24]:
# Count the movies held in movie_database.
len(movie_database.keys())

932

In [27]:
# Save the raw per-genre scrape under the name "all_rotton_data"
# (pickle_object is defined outside this notebook).
pickle_object(all_rotton_data,"all_rotton_data")

In [28]:
# Save the combined movie mapping under the name "movie_database"
# (pickle_object is defined outside this notebook).
pickle_object(movie_database,"movie_database")

In [16]:
# Show every key of movie_database.
# NOTE(review): this prints the full key list; slice it if only a preview is needed.
list(movie_database.keys())

['Mad Max: Fury Road (2015)',
 'Metropolis (1927)',
 'King Kong (1933)',
 'The Adventures of Robin Hood (1938)',
 'Zootopia (2016)',
 'Seven Samurai (Shichinin no Samurai) (1956)',
 'The Treasure of the Sierra Madre (1948)',
 'Up (2009)',
 'Logan (2017)',
 'Wonder Woman (2017)',
 'Baby Driver (2017)',
 'The Dark Knight (2008)',
 'Star Wars: Episode VII - The Force Awakens (2015)',
 'The 39 Steps (1935)',
 'The Hurt Locker (2009)',
 'Skyfall (2012)',
 'The Jungle Book (2016)',
 'Star Trek (2009)',
 'Harry Potter and the Deathly Hallows - Part 2 (2011)',
 'Jaws (1975)',
 'Lawrence of Arabia (1962)',
 'WALL-E (2008)',
 'The LEGO Movie (2014)',
 'The Searchers (1956)',
 'The Terminator (1984)',
 'Moana (2016)',
 'Apocalypse Now (1979)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'The French Connection (1971)',
 'Iron Man (2008)',
 'Spider-Man: Homecoming (2017)',
 'Aliens (1986)',
 'Kubo and the Two Strings (2016)',
 "Marvel's The Avengers (2012)",
 'Throne of Blood (1957)',
 'Once Upo

In [68]:
# Fetch the Wikipedia page for the first movie name.
# NOTE(review): all_movie_names is not defined by any live cell in this
# notebook (the extract_movie_names cells above are commented out) - it must
# be defined before this cell can run on a fresh kernel.
v = wikipedia.page(all_movie_names[0][0])

In [58]:
# Display the fetched page object.
v

<WikipediaPage 'Mad Max: Fury Road'>

In [70]:
# List the page object's public attributes and methods. Filtering on the
# leading underscore replaces the hard-coded dir(v)[39:] offset, which only
# held for one particular version of the wikipedia library.
for i in dir(v):
    if not i.startswith('_'):
        print(i)

categories
content
coordinates
html
images
links
original_title
pageid
parent_id
references
revision_id
section
sections
summary
title
url


In [69]:
# Parse the page's HTML so the infobox can be located with Beautiful Soup.
soup = BeautifulSoup(v.html(), 'lxml')

In [72]:
# Select the first <table> whose class is "infobox vevent"; find() returns
# None when the page has no such table.
wikipedia_api_info = soup.find("table",{"class":"infobox vevent"})

In [84]:
# Build a {header text: <td> tag} lookup from the infobox rows that carry a
# <th> header cell; rows without one are skipped.
result = {}
for row in wikipedia_api_info.find_all('tr'):
    header = row.find('th')
    if header:
        result[header.text] = row.find('td')

In [98]:
# Show which infobox headers were captured.
result.keys()

dict_keys(['Mad Max: Fury Road', 'Directed by', 'Produced by', 'Written by', 'Starring', 'Music by', 'Cinematography', 'Edited by', 'Productioncompany ', 'Distributed by', 'Release date', 'Running time', 'Country', 'Language', 'Budget', 'Box office'])

In [97]:
# Text of the "Directed by" cell with surrounding whitespace removed.
result['Directed by'].text.strip()

'George Miller'

In [114]:
# Take the first <li> of the release-date cell and return its second
# non-breaking-space-separated token (the month name in the recorded run).
result['Release date'].li.text.split("\xa0")[1]

'May'

In [119]:
# Keep the part of the running-time text that precedes " minutes".
result['Running time'].text.strip().split(" minutes")[0]

'120'

In [126]:
# Box-office text up to the first "[" (drops the footnote marker).
result['Box office'].text.strip().split('[')[0]

'$378.9 million'

In [131]:
# Budget text up to the first "[" (drops the footnote marker).
result['Budget'].text.strip().split("[")[0]

'$150 million'

In [134]:
# Text of the "Language" cell with surrounding whitespace removed.
result['Language'].text.strip()

'English'

In [28]:
# Flatten the infobox to plain text and split it into lines. `.text` is
# required because wikipedia_api_info is a bs4 Tag (assigned via soup.find
# above); calling .strip() on the Tag itself fails.
wikipedia_api_info.text.strip().split("\n") # very messy - let's try wptools!

['Mad Max: Fury Road',
 'Theatrical release posterDirected by',
 'George MillerProduced by',
 '',
 ' Doug Mitchell',
 ' George Miller',
 ' PJ Voeten',
 'Written by',
 '',
 ' George Miller',
 ' Brendan McCarthy',
 ' Nico Lathouris',
 'Starring',
 '',
 ' Tom Hardy',
 ' Charlize Theron',
 ' Nicholas Hoult',
 ' Hugh Keays-Byrne',
 ' Rosie Huntington-Whiteley',
 ' Riley Keough',
 ' Zoë Kravitz',
 ' Abbey Lee',
 ' Courtney Eaton',
 'Music by',
 'Junkie XLCinematography',
 'John SealeEdited by',
 'Margaret SixelProductioncompany ',
 '',
 ' Village Roadshow Pictures',
 ' Kennedy Miller Mitchell',
 ' RatPac-Dune Entertainment',
 'Distributed by',
 '',
 ' Warner Bros. Pictures (United States/International)',
 ' Roadshow Films (Australia)',
 'Release date',
 ' 7\xa0May\xa02015\xa0(2015-05-07) (TCL Chinese Theatre)',
 ' 14\xa0May\xa02015\xa0(2015-05-14) (Australia)',
 ' 15\xa0May\xa02015\xa0(2015-05-15) (United States)',
 ' ',
 ' ',
 'Running time',
 '120 minutes[1]Country',
 '',
 ' Australia[2]',

In [29]:
# NOTE(review): this import sits mid-notebook; the other imports live in the
# first cell.
import wptools
# Fetch the page data for the first movie name through wptools.
# NOTE(review): all_movie_names is not defined by any live cell in this notebook.
x = wptools.page(all_movie_names[0][0]).get() #got the information for mad max

Mad_Max:_Fury_Road (en)
{
  lang: en
  title: Mad_Max:_Fury_Road
}
en.wikipedia.org (query) Mad_Max:_Fury_Road
en.wikipedia.org (parse) 36426373
www.wikidata.org (wikidata) Q1757288
www.wikidata.org (claims) Q11424|Q229390|Q446960|Q1341051|Q188473|Q31922...
en.wikipedia.org (imageinfo) File:Mad Max Fury Road.jpg|File:Mad Max Fur...
Mad_Max:_Fury_Road (en)
{
  cache: <dict(5)> {claims, imageinfo, parse, query, wikidata}
  claims: <dict(29)> {Q11424, Q1341051, Q16193207, Q16728739, Q17100...
  description: 2015 Australian post-apocalyptic action film
  extext: <str(2346)> _**Mad Max: Fury Road**_ is a 2015 action film...
  extract: <str(2426)> <p><i><b>Mad Max: Fury Road</b></i> is a 2015...
  images: <list(2)>
  infobox: <dict(17)> {alt, caption, cinematography, country, direct...
  label: Mad Max: Fury Road
  lang: en
  modified: <dict(2)> {page, wikidata}
  pageid: 36426373
  parsetree: <str(106757)> <root><template><title>Use Australian Eng...
  props: <dict(9)> {P136, P161, P18, P31

In [32]:
# Inspect the Wikidata fields that wptools attached to the page object.
x.wikidata #returns a nice dict of stuff that is also in the infobox.
#should use this to extract director name and date

{'IMDB': 'tt1392190',
 'cast': ['Tom Hardy',
  'Charlize Theron',
  'Nicholas Buenote Hoult',
  'Josh Helman',
  'Nathan Jones',
  'Zoë Kravitz',
  'Rosie Huntington-Whiteley',
  'Riley Keough',
  'Hugh Keays-Byrne',
  'Abbey Lee Kershaw',
  'Courtney Eaton',
  'John Howard',
  'Richard Carter',
  'Angus Sampson',
  'Megan Gale',
  'Melissa Jaffer',
  'Gillian Jones',
  'Joy Smithers',
  'Richard Norton',
  'Lee Perry'],
 'composer': 'Junkie XL',
 'director': 'George Miller',
 'genre': ['post-apocalyptic film',
  'action film',
  'adventure film',
  'science fiction film',
  'thriller film'],
 'image': 'Mad Max Fury Road film Logo.png',
 'instance': ['film', '3D film'],
 'pubdate': ['+2015-05-14T00:00:00Z',
  '+2015-05-15T00:00:00Z',
  '+2015-05-13T00:00:00Z',
  '+2015-05-22T00:00:00Z',
  '+2015-05-21T00:00:00Z'],
 'website': 'http://www.madmaxmovie.com/'}

In [34]:
# Read the director straight from the Wikidata dict and display it.
director = x.wikidata['director']
director

'George Miller'

In [40]:
# Despite its name, month_released holds the whole list of Wikidata
# publication timestamps (e.g. '+2015-05-14T00:00:00Z').
month_released = x.wikidata['pubdate']
# Reduce the first timestamp to its YYYY-MM-DD part, then display its month number.
first_pubdate_day = month_released[0].strip('+').split('T')[0]
datetime.strptime(first_pubdate_day, "%Y-%m-%d").month

5

In [66]:
# Experiment: feed the page's wikitext to the HTML parser.
# NOTE(review): the infobox lookup on soup_new in the following cell shows no
# output in the recorded run - wikitext is presumably not rendered HTML; confirm.
soup_new = BeautifulSoup(x.wikitext, 'lxml')

In [65]:
# Look for the "infobox vevent" table in the parsed wikitext (the recorded
# run displays nothing for this cell).
soup_new.find('table', {"class":"infobox vevent"})

In [125]:
# Infobox fields as returned by wptools; the values are still raw wiki markup.
x.infobox

{'alt': 'Theatrical release poster',
 'caption': 'Theatrical release poster',
 'cinematography': '[[John Seale]]',
 'country': '{{plainlist|\n* Australia|ref| name="Mad Max Fury Road"|{{cite web | url=http://www.bfi.org.uk/films-tv-people/5553f6745dee3 | title=\'\'Mad Max Fury Road\'\' | work=[[British Film Institute]] | date=2015 |access-date=27 November 2016}}|</ref>|\n* United States|ref| name="Mad Max Fury Road"|{{cite web | url=http://www.bfi.org.uk/films-tv-people/5553f6745dee3 | title=\'\'Mad Max Fury Road\'\' | work=[[British Film Institute]] | date=2015 |access-date=27 November 2016}}|</ref>|ref|{{cite web|url=https://www.nytimes.com/movies/movie/439675/Mad-Max-Fury-Road/overview|title=Mad Max: Fury Road (2015)|work=[[The New York Times]]|access-date=21 June 2015}}|</ref>|\n}}',
 'director': '[[George Miller (director)|George Miller]]',
 'distributor': '{{Plainlist|\n* [[Warner Bros. Pictures]] |small|(United States/International)|\n* [[Village Roadshow Pictures|Roadshow Films

In [126]:
# Short alias for the wptools infobox dict used by the cells below.
d = x.infobox

In [127]:
# Print every infobox field followed by a blank line. The loop variables are
# named field/raw_value rather than k/v so that the WikipediaPage object bound
# to `v` earlier in the notebook is not overwritten.
for field, raw_value in d.items():
    print(field, raw_value)
    print()

name Mad Max: Fury Road

image Mad Max Fury Road.jpg

caption Theatrical release poster

alt Theatrical release poster

director [[George Miller (director)|George Miller]]

producer {{plainlist|
* [[Doug Mitchell (film producer)|Doug Mitchell]]
* George Miller
* PJ Voeten
}}

writer {{plainlist|
* George Miller
* [[Brendan McCarthy]]
* [[Nico Lathouris]]
}}

starring {{plainlist|
* [[Tom Hardy]]
* [[Charlize Theron]]
* [[Nicholas Hoult]]
* [[Hugh Keays-Byrne]]
* [[Rosie Huntington-Whiteley]]
* [[Riley Keough]]
* [[Zoë Kravitz]]
* [[Abbey Lee Kershaw|Abbey Lee]]
* [[Courtney Eaton]]
}}

music [[Junkie XL]]

cinematography [[John Seale]]

editing [[Margaret Sixel]]

studio {{plainlist|
* [[Village Roadshow Pictures]]
* [[Kennedy Miller Mitchell]]
* [[RatPac-Dune Entertainment]]
}}

distributor {{Plainlist|
* [[Warner Bros. Pictures]] |small|(United States/International)|
* [[Village Roadshow Pictures|Roadshow Films]] |small|(Australia)|
}}

released {{Film date|2015|05|7|[[TCL Chinese Th

In [114]:
# Split the {{Film date|...}} template into its "|"-separated fields.
# str.strip('Film date|') treats its argument as a set of characters, not a
# prefix, so it could also eat trailing letters of the last field (any of
# "F", "i", "l", "m", " ", "d", "a", "t", "e", "|"). The template name is
# therefore removed as an explicit prefix instead.
released_template = d['released'].strip('{').strip('}')
film_date_prefix = 'Film date|'
if released_template.startswith(film_date_prefix):
    released_template = released_template[len(film_date_prefix):]
h = released_template.split("|")

In [115]:
# Display the parsed release-date fields.
h

['2015',
 '05',
 '7',
 '[[TCL Chinese Theatre]]',
 '2015',
 '05',
 '14',
 'Australia',
 '2015',
 '05',
 '15',
 'United States',
 'df',
 '=',
 'y']

In [120]:
# In the field list, each release entry reads year, month, day, location, so
# the month sits two positions before the location. Print it for every
# 'United States' entry.
for position, entry in enumerate(h):
    if entry == 'United States':
        month = h[position - 2]
        print(month)

05
