In [1]:
import requests
import re
from bs4 import BeautifulSoup
import dateutil.parser
from pprint import pprint
import numpy as np
import pandas as pd

In [2]:
def urlToSoup(url):
    '''Fetch ``url`` over HTTP and parse the response body into a soup object.

    Issues a GET request with ``requests`` and passes the response text to
    BeautifulSoup using the ``lxml`` parser. The HTTP status code is not
    inspected, so an error page is parsed like any other page.
    '''
    page_text = requests.get(url).text
    return BeautifulSoup(page_text, 'lxml')

def formatLinks(listolinks):
    '''Turn scraped anchor elements into absolute boxofficemojo URLs.

    Each item must support ``get('href')`` returning a site-relative path;
    the site root is prepended to that path. Returns a new list in the same
    order as the input.
    '''
    return ['http://www.boxofficemojo.com' + anchor.get('href')
            for anchor in listolinks]

def getTopThreeHundreds(url):
    '''Return at most two additional chart-page URLs linked from ``url``.

    Downloads the page, looks inside its first ``<center>`` element for
    anchors whose href starts with ``/yearly/chart/``, converts them to
    absolute URLs and keeps the first two.
    Presumably these are pages 2 and 3 of the yearly chart, which together
    with the caller's own page make up the "top three hundred" -- confirm
    against the site layout.
    '''
    chart_page = urlToSoup(url)
    chart_href = re.compile('^/yearly/chart/')
    pager_anchors = chart_page.find('center').find_all('a', href=chart_href)
    return formatLinks(pager_anchors)[:2]

def getMovieLinks(spec_years):
    '''Generate links to the top three hundred movies per year.

    For every year in ``spec_years`` the first yearly-chart page URL is
    built, and up to two further chart pages are added via
    ``getTopThreeHundreds``. Each chart page is then downloaded, and every
    anchor inside the element with id ``body`` whose href starts with
    ``/movies/?id`` is collected.

    Returns a list of absolute movie-page URLs, in page order.
    '''
    year_urls = []
    for year in spec_years:
        first_page_url = 'http://www.boxofficemojo.com/yearly/chart/?yr='+str(year)+'&p=.htm'
        year_urls.append(first_page_url)
        year_urls.extend(getTopThreeHundreds(first_page_url))

    # Raw string: in a plain literal '\?' is an invalid escape sequence and
    # triggers a warning on current Python versions. The pattern is unchanged.
    movie_href = re.compile(r'^/movies/\?id')

    links_to_movies = []
    for url in year_urls:
        links_to_movies.extend(
            urlToSoup(url).find(id='body').find_all('a', href=movie_href))

    return formatLinks(links_to_movies)

In [3]:
#
#   SCRAPE LINKS TO TOP THREE HUNDRED MOVIES PER YEAR in 'YEARS'
#
# Chart years to scrape: 1980 through 2017 inclusive.
years = list(range(1980, 2018))
alllinks = getMovieLinks(years)

9915

9915

In [22]:
def get_movie_value(soup, field_name):
    '''Look up the value shown next to a labelled field on a movie page.

    Searches ``soup`` for the first text node matching the regular
    expression ``field_name`` and returns the ``.text`` of that node's
    next sibling element.

    Returns None when the label is not found or has no next sibling.
    '''
    label = soup.find(text=re.compile(field_name))
    if label:
        value_tag = label.findNextSibling()
        if value_tag:
            return value_tag.text
    return None

def get_movie_link_value(soup, field_name):
    '''Look up a field value that is not a direct sibling of its label.

    Variant of get_movie_value for values wrapped in another tag (for
    example an ``<a>``): it takes the next element after the matching text
    node via ``findNext`` rather than the next sibling, and strips
    surrounding whitespace from its text.

    Returns None when the label is not found or nothing follows it.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    following = label.findNext()
    return following.text.strip() if following else None

def to_date(datestring):
    '''Parse a date string with dateutil.

    A missing or empty ``datestring`` is replaced by the placeholder
    "January 1, 2019", so the result is always whatever
    ``dateutil.parser.parse`` returns and never None.
    '''
    return dateutil.parser.parse(datestring or "January 1, 2019")

def money_to_int(moneystring):
    '''Convert a dollar amount such as "$1,234,567" to an int.

    Returns NaN when ``moneystring`` is None or empty. Dollar signs and
    thousands separators are removed before conversion.

    Raises ValueError if the remaining text is not an integer literal.
    '''
    if not moneystring:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    digits = moneystring.replace('$', '').replace(',', '')
    return int(digits)

def runtime_to_minutes(runtimestring):
    '''Convert a runtime such as "2 hrs. 21 min." to total minutes.

    The string is split on whitespace; the first token is read as hours
    and the third token as minutes.

    Returns NaN when ``runtimestring`` is None or empty, and None when the
    string does not have the expected shape (e.g. "N/A").
    '''
    if not runtimestring:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    parts = runtimestring.split()
    try:
        return int(parts[0]) * 60 + int(parts[2])
    except (ValueError, IndexError):
        # Non-numeric tokens or too few tokens. Only these two are caught so
        # that unrelated errors are no longer silently swallowed.
        return None

def parse_production_budget(productionbudgetstring):
    '''Convert a production budget string to an integer dollar amount.

    Accepts plain amounts ("$500,000") and amounts in millions
    ("$149 million", "$1.5 million").

    Returns NaN when the input is None, empty, or the literal 'N/A'.
    Raises ValueError if the numeric part cannot be parsed.
    '''
    if productionbudgetstring == 'N/A' or not productionbudgetstring:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    cleaned = productionbudgetstring.replace('$', '').replace(',', '')
    if "million" in cleaned:
        # Scale numerically. Appending '000000' to the text turned
        # "1.5 million" into "1.5000000", i.e. 1 dollar.
        millions = float(cleaned.replace('million', '').strip())
        return int(round(millions * 1000000))
    return int(float(cleaned))

def in_release_to_days(inreleasestring):
    '''Extract the leading token of an "In Release" string.

    Returns the text before the first space, e.g. '168' for
    "168 days / 24 weeks". The value is returned as a string and is not
    converted to an int.

    Returns NaN when ``inreleasestring`` is None or empty.
    '''
    if not inreleasestring:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return inreleasestring.split(' ')[0]

def parse_genre(genrestring):
    '''Split a " / "-separated genre string into a list of genres.

    Returns the string "N/A" (not a list) when ``genrestring`` is None or
    empty.
    '''
    return genrestring.split(" / ") if genrestring else "N/A"

def parse_awards(awardsstring):
    '''Extract spelled-out counts from an Academy Award summary string.

    Every whitespace-separated word that is a capitalised number name from
    "One" to "Fifteen" is converted to its integer value, in order of
    appearance, and two trailing zeros are appended so that indexes 0 and
    1 always exist.

    Returns [0, 0, 0] when ``awardsstring`` is None or empty.
    '''
    if not awardsstring:
        return [0,0,0]
    word_values = {
        'One':1, 'Two':2, 'Three':3, 'Four':4, 'Five':5,
        'Six':6, 'Seven':7, 'Eight':8, 'Nine':9, 'Ten':10,
        'Eleven':11, 'Twelve':12, 'Thirteen':13, 'Fourteen':14, 'Fifteen':15,
    }
    counts = [word_values[token]
              for token in awardsstring.split()
              if token in word_values]
    return counts + [0, 0]
    
def parse_single_movie(soup):
    '''Scrape the features of one movie page into a dictionary.

    Keys, in insertion order: Title, DomesticTotalGross, RunTime,
    ReleaseDate, DayOfTheYear, MonthOfTheYear, Director, ProductionBudget,
    InRelease, MpaaRating, Distributor, Genre, Nominations, Wins.
    '''
    movie = {}

    # Title: the text of <title> up to the first '('.
    movie['Title'] = soup.find('title').text.split('(')[0].strip()

    # Numeric fields, converted by their dedicated helpers.
    movie['DomesticTotalGross'] = money_to_int(get_movie_value(soup,'Domestic Total'))
    movie['RunTime'] = runtime_to_minutes(get_movie_value(soup,'Runtime'))

    # Release date, plus day-of-year and month as zero-padded strings.
    release_date = to_date(get_movie_value(soup,'Release Date'))
    movie['ReleaseDate'] = release_date
    movie['DayOfTheYear'] = release_date.strftime('%j')
    movie['MonthOfTheYear'] = release_date.strftime('%m')

    movie['Director'] = get_movie_link_value(soup,'Director')
    movie['ProductionBudget'] = parse_production_budget(
        get_movie_value(soup,'Production Budget'))
    movie['InRelease'] = in_release_to_days(get_movie_link_value(soup,'In Release:'))

    # Plain text fields, stored exactly as scraped (Genre is not split).
    movie['MpaaRating'] = get_movie_value(soup,'MPAA Rating')
    movie['Distributor'] = get_movie_value(soup, 'Distributor')
    movie['Genre'] = get_movie_value(soup, "Genre:")

    # Academy Awards: the first two numbers found in the awards text.
    award_counts = parse_awards(get_movie_link_value(soup, 'Academy'))
    movie['Nominations'] = award_counts[0]
    movie['Wins'] = award_counts[1]

    return movie
    

In [23]:
# Smoke test: fetch one known movie page and extract a single field from it.
url="http://www.boxofficemojo.com/movies/?id=wonderwoman.htm"
soup=urlToSoup(url)
# NOTE(review): the variable is named `release` but holds the "Director" field.
release=get_movie_link_value(soup, "Director")

In [24]:
# Display the value extracted by the smoke test.
release

'Patty Jenkins'

### Scraping
* Use a dataframe and append the scraped movies to it in small chunks
* Save work partway through, and make sure to overwrite the dataframe
* When starting over, be sure to overwrite the `years` variable

In [307]:
import copy

In [308]:
def parseLinks(start, end):
    '''Scrape the movies at positions ``start`` .. ``end - 1`` of "alllinks".

    Each link in that index range of the global "alllinks" list is
    downloaded and parsed with parse_single_movie.

    Returns a dataframe with one row per link, indexed by the link URL.
    '''
    scraped = {
        alllinks[position]: parse_single_movie(urlToSoup(alllinks[position]))
        for position in range(start, end)
    }
    return pd.DataFrame.from_dict(scraped, orient="index")

In [None]:
#data=None #for use when starting the whole process over
# First batch: scrape alllinks[0] through alllinks[1000].
data=parseLinks(0, 1001)

In [None]:
# Add the next batch of scraped movies to the running dataframe.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data=pd.concat([data, parseLinks(1001,2001)])

In [312]:
# Add the next batch of scraped movies to the running dataframe.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data=pd.concat([data, parseLinks(2001,3001)])

In [313]:
# Add the next batch of scraped movies to the running dataframe.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data=pd.concat([data, parseLinks(3001,4001)])

In [314]:
# Save the batches scraped so far to a CSV file, then reset `data` so the next
# batch starts from scratch. Use a filename unique to this batch so earlier
# output is not overwritten.
data.to_csv("1980-2017_0001-4000.csv")
data=None

In [None]:
# Start the second half: `data` was reset after the previous save, so assign
# the batch directly instead of appending to it.
data=parseLinks(4001, 5001)

In [317]:
# Add the next batch of scraped movies to the running dataframe.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data=pd.concat([data, parseLinks(5001, 6001)])

In [318]:
# Add the next batch of scraped movies to the running dataframe.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data=pd.concat([data, parseLinks(6001, 7001)])

In [319]:
# Add the next batch of scraped movies to the running dataframe.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data=pd.concat([data, parseLinks(7001, 8001)])

In [320]:
# Final batch: everything from index 8001 to the end of alllinks.
# parseLinks treats `end` as exclusive (it iterates range(start, end)), so
# the bound is len(alllinks); len(alllinks)-1 skipped the last link.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data=pd.concat([data, parseLinks(8001, len(alllinks))])

In [322]:
# Save the second half of the scraped movies to its own CSV file.
data.to_csv("1980-2017_4001-end.csv")        

* Concatenate all of the CSV files into one larger dataframe

In [323]:
# Reload both saved batches from disk.
# NOTE(review): the files were written by to_csv with its default settings,
# so the link index presumably comes back as an unnamed first column --
# confirm whether index_col=0 is wanted here.
df1=pd.read_csv("1980-2017_0001-4000.csv")
df2=pd.read_csv("1980-2017_4001-end.csv")

In [324]:
import pickle

In [325]:
# Combine both reloaded batches and pickle the combined frame.
# Previously the result of df1.append(df2) was discarded and `data` (the
# second batch only) was pickled. DataFrame.append was also removed in
# pandas 2.0, so pd.concat is used instead.
all_movies = pd.concat([df1, df2])
with open('movies_1980-2017.pkl', 'wb') as picklefile:
    pickle.dump(all_movies, picklefile)

In [326]:
# List the working directory to check which output files are present.
!ls

1980-2017_0001-4000.csv
1980-2017_4001-end.csv
2013_movies.csv
Untitled.ipynb
challenge_set_03_katie.ipynb
movies_150-2017.pkl
oscar-scraping.ipynb
[34moscars[m[m
oscars_noms_scraping.ipynb
pairprobJuly11.ipynb
web_scraping_beautifulsoup_kaszklar.ipynb
web_scraping_selenium-kaszklar.ipynb
