In [2]:
from bs4 import BeautifulSoup
import urllib.request
import time
import sys
import random
from string import ascii_lowercase
import pickle

In [3]:
# Keys of the alphabetical browse index: '#' first, then 'a' through 'z'.
LETTERS = ''.join(('#', ascii_lowercase))
# Root of the browse listing; make_soup appends the letter and page number.
BASE_URL = 'http://www.metacritic.com/browse/movies/title/dvd'

In [4]:
def make_soup_from_url(url, attempt=0):
    """Download ``url`` and parse it into a BeautifulSoup tree, with retries.

    Before every request the function sleeps a random whole number of
    seconds between 0 and 19 so the requests are spread out.

    Args:
        url: Address of the page to fetch.
        attempt: Number of attempts already used. Callers normally leave
            this at 0; at most 10 attempts are made in total.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when every
        attempt failed (or ``attempt`` was already 10 or more).
    """
    while attempt < 10:
        attempt += 1
        time.sleep(int(random.random() * 20))
        try:
            req = urllib.request.Request(url, headers={'User-Agent': 'chrome'})
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(req) as response:
                html = response.read()
            return BeautifulSoup(html, "lxml")
        except Exception:
            # Best-effort scraping: any failure just triggers another try.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            continue
    return None

In [5]:
def make_soup(letter, page, attempt = 0):
    """Fetch one page of the alphabetical browse listing.

    Args:
        letter: Index key; '#' selects the listing without a letter segment,
            any other value is appended to ``BASE_URL`` as a path segment.
        page: Page number placed in the ``page`` query parameter.
        attempt: Number of attempts already used, forwarded to
            ``make_soup_from_url``.

    Returns:
        Whatever ``make_soup_from_url`` returns for the built URL: the
        parsed page, or ``None`` when it could not be fetched.
    """
    if letter == '#':
        url = BASE_URL + "?page=" + str(page)
    else:
        url = BASE_URL + "/" + letter + "?page=" + str(page)
    # Previously the URL was built and then discarded, so callers got None.
    return make_soup_from_url(url, attempt)


In [131]:
movie_list = []

In [132]:
# Walk every index letter, requesting consecutive pages until one yields no
# movie links (or could not be fetched at all).
for letter in LETTERS:
    print('processing letter: ' + letter + ' ...')
    page = 0
    while True:
        soup = make_soup(letter, page)
        if not soup:
            # Fetch failed after all retries; stop paging this letter.
            break

        links = []
        for row in soup.find_all('tr', {'class': 'summary_row'}):
            cell = row.find('td', {'class': 'title_wrapper'})
            anchor = cell.find('a') if cell is not None else None
            href = anchor.get('href') if anchor is not None else None
            # Skip rows without a usable link instead of aborting the crawl.
            if href:
                links.append(href)

        if not links:
            break

        # extend() appends in place; rebuilding the list for every page made
        # the crawl quadratic in the number of collected links.
        movie_list.extend(links)
        page += 1

processing letter: # ...
processing letter: a ...
processing letter: b ...
processing letter: c ...
processing letter: d ...
processing letter: e ...
processing letter: f ...
processing letter: g ...
processing letter: h ...
processing letter: i ...
processing letter: j ...
processing letter: k ...
processing letter: l ...
processing letter: m ...
processing letter: n ...
processing letter: o ...
processing letter: p ...
processing letter: q ...
processing letter: r ...
processing letter: s ...
processing letter: t ...
processing letter: u ...
processing letter: v ...
processing letter: w ...
processing letter: x ...
processing letter: y ...
processing letter: z ...


In [144]:
# Persist the collected movie links so later cells can reload them from disk.
with open('../data/movielist_mc', 'wb') as f:
    pickle.dump(movie_list, f)

## Scraping individual pages

In [74]:
# Site root that the relative movie paths are appended to.
INDIV_URL = 'http://www.metacritic.com'
# CSS classes of the <tr> rows that get_tr looks up, in output order.
TR_FIELDS = [
    'runtime',
    'movie_rating',
    'company',
    'languages',
    'countries',
    'genres',
]

In [6]:
# Reload the pickled movie list from disk into movie_list.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('../data/movielist_mc', 'rb') as f:
    movie_list = pickle.load(f)

In [122]:
def clean_span_array(array):
    """Strip whitespace from a list of strings, unwrapping singletons.

    A one-element list comes back as the bare stripped string; any other
    list comes back as a new list of stripped strings. Values that are not
    exactly of type ``list`` are returned unchanged.
    """
    if type(array) is not list:
        return array

    stripped = [item.strip() for item in array]
    return stripped[0] if len(stripped) == 1 else stripped

In [127]:
def get_tr(soup, search_string):
    """Read the comma-separated value of one details-table row.

    Args:
        soup: Parsed page to search.
        search_string: CSS class of the ``<tr>`` to look up.

    Returns:
        The text of the row's ``td.data`` cell, split on commas and passed
        through ``clean_span_array``; ``None`` when the row or its data cell
        is missing (or ``soup`` itself is ``None``).
    """
    try:
        tr = soup.find('tr', {'class': search_string}).find('td', {'class': 'data'}).text
    except AttributeError:
        # A missing row/cell makes find() return None, so the next attribute
        # access fails. Only that case is treated as "no value".
        return None
    return clean_span_array(tr.split(','))

In [135]:
# Build the details URL here rather than relying on a `url` variable that is
# only assigned in a later cell; this keeps a top-to-bottom run working.
soup = make_soup_from_url(INDIV_URL + movie_list[0] + '/details')

In [180]:
def basic_fields(soup):
    """Collect the headline values of a movie details page.

    Returns a list holding the ``<h1>`` text, the metascore, the user score,
    the release date (second line of the stripped ``release_date`` span
    text), followed by one ``get_tr`` result per entry of ``TR_FIELDS``.
    """
    title = soup.find('h1').text
    metascore = soup.select('span.metascore_w.header_size')[0].text
    user_score = soup.select('span.metascore_w.user')[0].text

    product_info = soup.find('div', {'class': 'product_info'})
    date_text = product_info.find('span', {'class': 'release_date'}).text
    release_date = date_text.strip().split('\n')[1]

    table_values = [get_tr(soup, field) for field in TR_FIELDS]
    return [title, metascore, user_score, release_date] + table_values
    

In [10]:
movie_list[0]

'/movie/horror'

In [134]:
url = INDIV_URL + movie_list[0] + '/details'

In [132]:
# NOTE(review): scrape_page is not defined in any visible cell of this
# notebook -- confirm where it comes from before running on a fresh kernel.
a = scrape_page(url)

In [133]:
a

['$9.99',
 '68',
 '6.8',
 'June 19, 2009',
 '78 min',
 'Rated R for language and brief sexuality and nudity.',
 'Australian Film Finance Corporation (AFFC)',
 'English',
 ['Israel', 'Australia'],
 ['Drama', 'Animation']]

In [167]:
def get_people_from_category(soup, category):
    """List the names credited under one category header.

    Starting three parents above ``category``, every ``td.role`` cell is
    visited; the name is read from the node two siblings before it. Entries
    whose role contains 'Producer' without being exactly 'Producer' are
    left out. ``soup`` is accepted but not used.
    """
    section = category.parent.parent.parent
    names = []
    for role_cell in section.find_all('td', {'class': 'role'}):
        person = role_cell.previous_sibling.previous_sibling.text.strip()
        job = role_cell.text.strip()
        # Keep plain producers and every non-producer role.
        if job == 'Producer' or 'Producer' not in job:
            names.append(person)
    return names

In [173]:
def get_people(soup):
    """Map each credit category on the page to its list of names.

    Every ``th.person`` header becomes a key. 'Principal Cast' is stored
    under 'Cast', and once it has been seen any later plain 'Cast' header
    is skipped so it cannot overwrite the principal cast.
    """
    credits = {}
    seen_principal_cast = False

    for header in soup.find_all('th', {'class': 'person'}):
        label = header.text.strip()

        if label == 'Cast' and seen_principal_cast:
            continue

        if label == 'Principal Cast':
            seen_principal_cast = True
            label = 'Cast'

        credits[label] = get_people_from_category(soup, header)

    return credits
            
    


In [174]:
get_people(soup)

{'Cast': ['Balthazar Getty',
  'Blue Lindeberg',
  'Bridget McGarry',
  'Chloë Sevigny',
  'Emma Adler',
  'Haley Murphy',
  'Mina Sundwall',
  'Sadie Seelert',
  'Timothy Hutton'],
 'Director': ['Tara Subkoff'],
 'Producer': ['Brendan Walsh', 'Jason Ludman', 'Oren Segal', 'Tara Subkoff'],
 'Writer': ['Tara Subkoff']}

In [189]:
# Scrape the details page of the first few movies into one row per movie.
mega_array = []

for movie in movie_list[0:5]:
    url = INDIV_URL + movie + '/details'
    print(url)
    soup = make_soup_from_url(url)
    if soup is None:
        # Every retry failed; skip this movie instead of crashing the loop.
        continue
    basic_info = basic_fields(soup)
    ppl = get_people(soup)
    # Not every page lists all four categories; use an empty list rather
    # than raising KeyError so the row keeps its full width.
    credits = [ppl.get(role, []) for role in ('Director', 'Producer', 'Cast', 'Writer')]
    mega_array.append([movie] + basic_info + credits)
    
    

http://www.metacritic.com/movie/horror/details
http://www.metacritic.com/movie/999/details
http://www.metacritic.com/movie/pent/details
http://www.metacritic.com/movie/71/details
http://www.metacritic.com/movie/r-xmas/details


In [176]:
import pandas as pd

In [190]:
# One row per scraped movie; column order mirrors how each row was assembled.
df = pd.DataFrame(
    data=mega_array,
    columns=[
        'id', 'name', 'metascore', 'usr_score', 'date',
        'runtime', 'rating', 'company', 'languages', 'countries',
        'genres', 'director', 'producer', 'cast', 'writer',
    ],
)

In [191]:
df

Unnamed: 0,id,name,metascore,usr_score,date,runtime,rating,company,languages,countries,genres,director,producer,cast,writer
0,/movie/horror,#Horror,42,3.1,"November 20, 2015",101 min,Not Rated,Lowland Pictures,English,USA,"[Drama, Mystery, Thriller, Horror]",[Tara Subkoff],"[Brendan Walsh, Jason Ludman, Oren Segal, Tara...","[Balthazar Getty, Blue Lindeberg, Bridget McGa...",[Tara Subkoff]
1,/movie/999,$9.99,68,6.8,"June 19, 2009",78 min,Rated R for language and brief sexuality and n...,Australian Film Finance Corporation (AFFC),English,"[Israel, Australia]","[Drama, Animation]",[Tatia Rosenthal],"[Amir Harel, Emile Sherman]","[Anthony LaPaglia, Barry Otto, Ben Mendelsohn,...","[Etgar Keret, Tatia Rosenthal]"
2,/movie/pent,$pent,34,tbd,"July 21, 2000",91 min,,Spent LLC,English,USA,Drama,[Gil Cates Jr.],"[Gil Cates Jr., Jordan Summers, Rana Joy Glick...","[Barbara Barrie, Bassia Loebel, Charlie Spradl...",[Gil Cates Jr.]
3,/movie/71,'71,83,7.5,"February 27, 2015",99 min,"[Rated R for strong violence, disturbing image...",Warp Films,English,UK,"[Action, Drama, Thriller, War]",[Yann Demange],"[Angus Lamont, Robin Gutch]","[Charlie Murphy, Jack O'Connell, Paul Anderson...",[Gregory Burke]
4,/movie/r-xmas,'R Xmas,55,6.9,"November 8, 2002",85 min,"[Rated R for strong language, drug content and...",Valence Films Inc.,"[English, Spanish]","[USA, France]","[Drama, Crime]",[Abel Ferrara],[Pierre Kalfon],"[Andrew Fiscella, Anne Ackerman, Denia Brache,...","[Abel Ferrara, Cassandra De Jesus, Scott Pardo]"


## Scraping critics

In [194]:
url = INDIV_URL + movie_list[0] + '/critic-reviews'

In [195]:
soup = make_soup_from_url(url)

In [201]:
soup.find(text='Village Voice').parent.parent.parent.parent.parent

<div class="review pad_top1 pad_btm1">
<div class="left fl">
<div class="metascore_w large movie positive indiv">70</div>
</div>
<div class="right fl">
<div class="title pad_btm_half"><span class="source"><a href="/publication/village-voice?filter=movies">Village Voice</a></span><span class="author"><a href="/critic/rob-staeger?filter=movies">Rob Staeger</a></span><span class="date">Nov 17, 2015</span></div>
<div class="summary">
<a class="no_hover" href="http://www.villagevoice.com/film/you-won-t-be-able-to-look-away-from-hashtags-and-selfies-horror-flick-horror-7912780" rel="noopener" target="_blank">
                                Not every gamble works: The girls' intrusive Bejeweled-like social-media game annoys at every turn, and the plot itself is murky. But #Horror mesmerizes nonetheless, filled with tension, cruelty, and can't-look-away style.
                                    </a>
<a class="read_full" href="http://www.villagevoice.com/film/you-won-t-be-able-to-look-away-fr

In [203]:
review = soup.find('div', {'class': 'review'})

In [216]:
movie_list[0]

'/movie/horror'

In [217]:

url = INDIV_URL + '/movie/avatar/critic-reviews'

In [221]:
soup = make_soup_from_url(url)

In [222]:
# Print [score, publication, author] for every review block on the page.
for review in soup.find_all('div', {'class': 'review'}):
    score_tag = review.find('div', {'class': 'movie'})
    source_tag = review.find('span', {'class': 'source'})
    author_tag = review.find('span', {'class': 'author'})
    # Skip blocks missing any of the three parts, rather than hiding every
    # possible error behind a bare except.
    if any(tag is None for tag in (score_tag, source_tag, author_tag)):
        continue
    print([score_tag.text, source_tag.text, author_tag.text])

['100', 'The Hollywood Reporter', 'Kirk Honeycutt']
['100', 'Empire', 'Chris Hewitt (1)']
['100', 'Chicago Sun-Times', 'Roger Ebert']
['100', 'Premiere', 'Nick Starkey']
['100', 'Chicago Reader', 'J.R. Jones']
['100', 'Los Angeles Times', 'Kenneth Turan']
['100', 'The New York Times', 'Manohla Dargis']
['100', 'ReelViews', 'James Berardinelli']
['100', 'L.A. Weekly', 'Scott Foundas']
['100', 'New York Magazine (Vulture)', 'David Edelstein']
['90', 'Variety', 'Todd McCarthy']
['90', 'Time', 'Richard Corliss']
['90', 'The New Yorker', 'David Denby']
['90', 'Wall Street Journal', 'Joe Morgenstern']
['90', 'Slate', 'Dana Stevens']
['89', 'Austin Chronicle', 'Marc Savlov']
['88', 'Rolling Stone', 'Peter Travers']
['88', 'New York Post', 'Lou Lumenick']
['88', 'Boston Globe', 'Ty Burr']
['88', 'Philadelphia Inquirer', 'Steven Rea']
['83', 'Christian Science Monitor', 'Peter Rainer']
['83', 'Portland Oregonian', 'Shawn Levy']
['75', 'Chicago Tribune', 'Michael Phillips']
['75', 'Entertainment

In [223]:
url = INDIV_URL + '/movie/avatar/user-reviews'

In [224]:
soup = make_soup_from_url(url)

In [235]:
soup.find(text='Positive:').parent.parent.find('div',{'class': 'count'}).text

'2,152'

In [236]:
soup.find(text='Negative:').parent.parent.find('div',{'class': 'count'}).text

'413'

In [237]:
soup.find(text='Mixed:').parent.parent.find('div',{'class': 'count'}).text

'441'