## Import all needed modules

In [1]:
import sys
import subprocess

# On macOS, start `caffeinate` in the background so the machine does not go
# to sleep while the notebook is running.
if 'darwin' in sys.platform:
    print("Running 'caffeinate' on MacOSX to prevent the system from sleeping")
    subprocess.Popen(['caffeinate'])

Running 'caffeinate' on MacOSX to prevent the system from sleeping


In [9]:
import requests 
import string
from bs4 import BeautifulSoup  
import logging  
logging.basicConfig(level=logging.DEBUG)
import dateutil.parser
import time
import re
from collections import OrderedDict
import pandas as pd
from datetime import datetime
import numpy as np

## Get movie urls from boxofficemojo.com

In [3]:
def get_all_movies():
    """Return all the movie urls from boxofficemojo.com in a list.

    Walks the alphabetical index: one section for titles starting with a
    number or special character ("NUM") plus one per letter A-Z, requesting
    pages 1 through 19 of each section.

    Returns:
        list: the ``href`` values found in the index tables, in the order
        first seen and without duplicates.

    Any error while fetching or parsing a page is logged and that page is
    skipped; it never aborts the whole crawl.
    """
    # Alphabet loop for how movies are indexed including
    # movies that start with a special character or number
    index = ["NUM"] + list(string.ascii_uppercase)

    # List of movie urls (keeps first-seen order) and a set for O(1)
    # duplicate checks.
    movies_list = []
    seen = set()

    # Loop through the pages for each letter
    for letter in index:

        # Loop through the pages within each letter
        for num in range(1, 20):
            url = ("http://www.boxofficemojo.com/movies/alphabetical.htm?"
                   "letter=" + letter + "&page=" + str(num))
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.text, "lxml")
                rows = soup.find(id="body").find("table").find("table").find_all(
                    "table")[1].find_all("tr")

                # rows[0] is the index row; everything after it is a movie
                for row in rows[1:]:
                    link = row.td.font.a['href']
                    # don't add duplicates
                    if link not in seen:
                        seen.add(link)
                        movies_list.append(link)
            except Exception as e:
                # Python 3 syntax; the old ``except (Exception, e)`` form
                # raised NameError as soon as any page failed.
                logging.exception(e)

    return movies_list

#### To get the URLs of all the movies, call the function defined above

In [4]:
all_movies = get_all_movies()

## Define certain useful functions

In [5]:
def get_movie_value(soup, field_name):
    '''Look up the value shown next to a field label on a movie page.

    Finds the first text node matching the regular expression
    ``field_name`` and returns the ``.text`` of that node's next sibling.
    Returns None when the label is not found or has no next sibling.
    '''
    label = soup.find(text=re.compile(field_name))
    if label:
        value_tag = label.findNextSibling()
        if value_tag:
            return value_tag.text
    return None


def get_movie_value_next(soup, field_name):
    '''Return the text of the element following a field label.

    ``field_name`` is used as a regular expression to locate the first
    matching text node in ``soup``; the ``.text`` of its next sibling is
    returned. None is returned if either the label or the sibling is
    missing.
    '''
    match = soup.find(text=re.compile(field_name))
    if not match:
        return None
    following = match.findNextSibling()
    return following.text if following else None



def get_movie_value_next_next(soup, field_name):
    '''Return the text found two steps after a field label.

    Locates the first text node matching the regular expression
    ``field_name`` and returns ``.next.next.text`` of that node (two
    ``.next`` hops, not the next sibling). Returns None when no text node
    matches.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    return label.next.next.text
    

def money_to_int(moneystring):
    """Convert a dollar string such as '$1,234,567' to an int.

    Dollar signs and commas are removed before conversion; anything else
    that is not an integer makes ``int`` raise ValueError.
    """
    return int(moneystring.translate(str.maketrans('', '', '$,')))


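# TODO: fix this. The two commented-out drafts of money_to_int_2 below are
# earlier attempts kept for reference; the live definition follows them.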
#def money_to_int_2(moneystring):
#    if moneystring != 'N/A':
#        moneystring = moneystring.replace('$', '').replace('.','').replace(',','').replace(' million', '000000')
#        return int(moneystring)

#def money_to_int_2(moneystring):
#    if moneystring != 'N/A':
#        moneystring = moneystring.replace('$','').replace(',','')
#        if 'million' in moneystring:
#            moneystring.replace('million','')
#            return float(moneystring)*1000000
#        else:
#            return float(moneystring)
        
def money_to_int_2(string):
    """Convert a budget string to an integer number of dollars.

    Handles both plain amounts ('$1,500,000') and amounts expressed in
    millions ('$8.2 million'). Dollar signs, commas and spaces are ignored.

    Args:
        string: the raw text, or None when the page had no value.

    Returns:
        int: the amount in dollars, or ``np.nan`` when ``string`` is None
        or cannot be read as a number (e.g. 'N/A').
    """
    if string is None:
        return np.nan

    cleaned = string.replace('$', '').replace(',', '').replace(' ', '')
    multiplier = 1
    if "million" in cleaned:
        cleaned = cleaned.replace('million', '')
        multiplier = 1000000

    # Whole numbers convert exactly.
    try:
        return int(cleaned) * multiplier
    except ValueError:
        pass

    # Decimal amounts go through float; round() instead of truncating so
    # that e.g. 8.2 * 1000000 == 8199999.999999999 still yields 8200000.
    try:
        return int(round(float(cleaned) * multiplier))
    except (ValueError, OverflowError):
        return np.nan

def runtime_to_minutes(runtimestring):
    """Convert a runtime such as '1 hrs. 46 min.' to total minutes.

    The string is split on whitespace; the first token is read as hours and
    the third as minutes.

    Returns:
        int: hours * 60 + minutes, or None when the string does not have
        that shape (too few tokens or non-numeric tokens).

    Raises:
        AttributeError: if ``runtimestring`` is None (callers rely on this).
    """
    runtime = runtimestring.split()
    try:
        return int(runtime[0]) * 60 + int(runtime[2])
    except (IndexError, ValueError):
        # Only the expected parse failures are swallowed; a bare ``except``
        # would also hide KeyboardInterrupt during a long scrape.
        return None

def strip_and_return_int(s):
    """Remove '$' and ',' from ``s`` and return the remainder as an int."""
    return int(s.replace('$', '').replace(',', ''))

def to_date(datestring):
    """Parse ``datestring`` with dateutil and return the resulting date object."""
    return dateutil.parser.parse(datestring)

## Define functions to get movie information

In [6]:
def movie_title(soup):
    """Return the movie title taken from the page's <title> tag.

    Everything from the first '(' onward is dropped and the remainder is
    stripped of surrounding whitespace.
    """
    page_title = soup.find('title').text
    name, _, _ = page_title.partition('(')
    return name.strip()
    
def movie_producers(soup):
    """Return the movie's producers as a list of strings.

    The text after the 'Producer' label is split into names by inserting
    ", " wherever a lowercase letter or parenthesis is immediately followed
    by an uppercase letter, then splitting on commas.

    Returns:
        list of str, or NaN when the label/value is missing.
    """
    try:
        producer = get_movie_value_next(soup, 'Producer')
        # Raw strings: '\g' is not a valid escape in a normal string literal.
        producers = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', producer).split(",")
    except (TypeError, AttributeError):
        # re.sub raises TypeError when the lookup returned None.
        producers = float('NaN')

    return producers
        
def movie_directors(soup):
    """Return the movie's directors as a list of strings.

    The text after the 'Director' label is split into names by inserting
    ", " wherever a lowercase letter or parenthesis is immediately followed
    by an uppercase letter, then splitting on commas.

    Returns:
        list of str, or NaN when the label/value is missing.
    """
    try:
        director = get_movie_value_next(soup, 'Director')
        # Raw strings: '\g' is not a valid escape in a normal string literal.
        director = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', director).split(",")
    except (TypeError, AttributeError):
        # Same failure handling as movie_producers.
        director = float('NaN')

    return director
    
#def movie_actors(soup):
#    try:
#        actor = get_movie_value_next(soup,'Actor')
#        v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
#        actors = re.sub('[(*\']','', v)
#        actors = actors.split(",") 
#    except TypeError:
#        actors = float('NaN')
#
#    return actors

#def movie_actors(soup):
#    try:
#        actor = get_movie_value_next(soup,'Actor')
#        if '*' in actor:
#            actors =  actor.split('*')
#        else:
#            v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
#            actors = re.sub('[(*\']','', v)
#            actors = actors.split(",") 
#    except TypeError:
#        actors = float('NaN')
#    return actors

def movie_actors(soup):  # still doesn't deal well with McPeople
    """Return the movie's actors as a list of strings.

    The text found after the 'Actor' label is split into names by inserting
    ", " wherever a lowercase letter or parenthesis is immediately followed
    by an uppercase letter. The '(, Voice)' residue that this produces is
    removed. If the text contains '*', names are split on '*'; otherwise
    '(', '*' and apostrophes are stripped and names are split on ','.

    Returns:
        list of str, or NaN when the value is missing or has no text.
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor')  # try get_movie_value_next if it doesnt work
        # Raw strings: '\g' is not a valid escape in a normal string literal.
        v = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
        v = v.replace('(, Voice)', '')
        if "*" in v:
            actors = v.split('*')
        else:
            actors = re.sub(r"[(*']", '', v).split(",")
    except (TypeError, AttributeError):
        # TypeError: lookup returned None. AttributeError: the node reached
        # by the helper has no usable attribute; treat both as "no data".
        actors = float('NaN')
    return actors





def movie_genre(soup):
    """Return the text next to the 'Genre' label.

    The pattern 'Genre[^a-z]' avoids matching 'Genres'. Returns whatever
    get_movie_value returns (None when not found), or NaN if the lookup
    raises.
    """
    try:
        genre = get_movie_value(soup, 'Genre[^a-z]')
    except Exception:
        # Not a bare ``except``: Ctrl-C must still interrupt a long scrape.
        genre = float('NaN')
    return genre

def movie_rating(soup):
    """Return the text next to the 'MPAA Rating' label.

    Returns whatever get_movie_value returns (None when not found), or NaN
    if the lookup raises.
    """
    try:
        rating = get_movie_value(soup, 'MPAA Rating')
    except Exception:
        # Not a bare ``except``: Ctrl-C must still interrupt a long scrape.
        rating = float('NaN')
    return rating

#def release_date(soup):
#    try:
#        raw_release_date = get_movie_value(soup,'Release Date')
#        if raw_release_date != 'TBD' and raw_release_date != 'N/A' and raw_release_date[0].isnumeric() :
#            release_date = to_date(raw_release_date)
#        else:
#            release_date = float('NaN')
#    except AttributeError:
#        release_date = float('NaN')
#    
#    return release_date

def release_date(soup):
    """Return the movie's release date, or NaN when it is not a usable date.

    The text next to the 'Release Date' label is parsed with to_date unless
    it is 'TBD' or 'N/A', or starts with a season name ('Spring', 'Summer',
    'Fall', 'Winter').

    Returns:
        The parsed date, or NaN when the value is missing, empty, one of the
        placeholders above, or cannot be parsed.
    """
    placeholders = ('TBD', 'N/A')
    seasons = ('Spring', 'Summer', 'Fall', 'Winter')
    try:
        raw_release_date = get_movie_value(soup, 'Release Date')
        # None -> AttributeError, empty string -> IndexError; both mean "no date".
        first_word = raw_release_date.split()[0]
        if raw_release_date in placeholders or first_word in seasons:
            return float('NaN')
        return to_date(raw_release_date)
    except (AttributeError, IndexError, ValueError, OverflowError):
        # ValueError/OverflowError: the text could not be parsed as a date.
        # One odd date should not abort the whole scrape.
        return float('NaN')
    
def domestic_gross(soup):
    """Return the domestic total gross in dollars as an int.

    Reads the text next to the 'Domestic Total' label and converts it with
    money_to_int.

    Returns:
        int, or NaN when the value is missing or is not a plain dollar
        amount (e.g. 'N/A').
    """
    try:
        raw_domestic_total_gross = get_movie_value(soup, 'Domestic Total')
        return money_to_int(raw_domestic_total_gross)
    except (AttributeError, ValueError):
        # AttributeError: value was None; ValueError: not a number.
        return float('NaN')
    
def opening_weekend_gross(soup):
    """Return the opening-weekend gross in dollars as an int.

    Reads the text next to the 'Opening Weekend' label and converts it with
    money_to_int.

    Returns:
        int, or NaN when the value is missing or is not a plain dollar
        amount.
    """
    try:
        raw_opening_gross = get_movie_value_next(soup, 'Opening Weekend')
        return money_to_int(raw_opening_gross)
    except (AttributeError, ValueError):
        # AttributeError: value was None; ValueError: not a number.
        return float('NaN')
    
def production_budget(soup):
    """Return the production budget via money_to_int_2, or NaN on AttributeError."""
    try:
        return money_to_int_2(get_movie_value(soup, 'Production Budget'))
    except AttributeError:
        return float('NaN')
    
def runtime(soup):
    """Return the runtime in minutes via runtime_to_minutes.

    NaN is returned when the lookup yields None (the conversion then raises
    AttributeError).
    """
    try:
        return runtime_to_minutes(get_movie_value(soup, 'Runtime'))
    except AttributeError:
        return float('NaN')
    
def widest_release(soup):
    """Return the widest release figure as an int.

    Takes the first whitespace-separated token of the text found after the
    'Widest' label and converts it with strip_and_return_int.

    Returns:
        int, or NaN when the value is missing, empty, or not numeric.
    """
    try:
        raw_widest = get_movie_value_next_next(soup, 'Widest')
        return strip_and_return_int(raw_widest.split()[0])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: no value; IndexError: empty text;
        # ValueError: first token is not a number.
        return float('NaN')

def distributor(soup):
    """Return the distributor name shown on a movie page.

    Looks inside the element with id 'body' for the first text node matching
    'Distributor' and returns the ``.text`` of its next sibling.

    Returns:
        str: the distributor text;
        None: when the page has no 'body' element or no 'Distributor' label;
        NaN: when the label is present but its value cannot be read.
    """
    try:
        body = soup.find(id="body")
        if body is None:
            return None
        label = body.find(text=re.compile("Distributor"))
        if label is None:
            return None
        return label.findNextSibling().text
    except (LookupError, AttributeError):
        # The original assigned NaN here but never returned it.
        return float('NaN')


## MOVIE SCRAPER

In [7]:
def movie_scraper(all_movies):
    """Fetch every movie page in ``all_movies`` and collect its fields.

    The urls are processed in roughly 30 blocks with a 10 second pause
    before each block and a 0.1 second pause after each request.

    Args:
        all_movies: sequence of url paths (appended to
            "http://www.boxofficemojo.com/").

    Returns:
        OrderedDict mapping movie title to a list:
        [url path, genre, release date, distributor, runtime, rating,
        budget, domestic gross, actors, opening weekend gross,
        widest release, producers, directors].
        Movies with the same title overwrite each other.

    A requests error is printed and stops the scrape; whatever was collected
    up to that point is returned.
    """
    total = len(all_movies)
    # At least 1, so range() never gets a zero step for fewer than 30 urls.
    block_size = max(1, total // 30)
    # The final boundary is ``total`` (not total - 1): slice ends are
    # exclusive, so this is what makes the last url get scraped.
    movie_blocks = list(range(0, total, block_size))
    movie_blocks.append(total)

    print("The length of urls_chunks, minus one is %s" % str(len(movie_blocks)-1))

    movie_data_list = OrderedDict()
    count = 0
    try:
        for block_index in range(len(movie_blocks) - 1):

            print ("URL chunk ... %s of %s in 10 seconds ... \n" % (str(block_index), str(len(movie_blocks) - 1)))

            time.sleep(10)

            print ("Fetching URLs in movie_list location: %s to %s \n" % (movie_blocks[block_index], movie_blocks[block_index + 1]))
            for movie in all_movies[movie_blocks[block_index]:movie_blocks[block_index + 1]]:
                count += 1
                url = "http://www.boxofficemojo.com/" + movie
                response = requests.get(url)
                time.sleep(0.1)
                page = response.text
                soup = BeautifulSoup(page, "lxml")
                print(count, movie)
                movie_data_list[movie_title(soup)] = [movie, movie_genre(soup), release_date(soup),
                                                        distributor(soup), runtime(soup),
                                                        movie_rating(soup), production_budget(soup),
                                                        domestic_gross(soup), movie_actors(soup),
                                                        opening_weekend_gross(soup), widest_release(soup),
                                                        movie_producers(soup), movie_directors(soup)]

            print("continuing in 10 seconds ...")
    except requests.exceptions.RequestException as e:
        # Best effort: report the network error and return the partial result.
        print(e)

    return movie_data_list


In [None]:
dict_1 = movie_scraper(all_movies)

The length of urls_chunks, minus one is 30
URL chunk ... 0 of 30 in 10 seconds ... 

Fetching URLs in movie_list location: 0 to 556 

1 /movies/?id=horrorifc.htm
2 /movies/?id=9dot99.htm
3 /movies/?id=supercapitalist.htm
4 /movies/?id=500daysofsummer.htm
5 /movies/?id=untitled.htm
6 /movies/?id=andjusticeforall.htm
7 /movies/?id=1mileabove.htm
8 /movies/?id=1plus1.htm
9 /movies/?id=1000rupeenote.htm
10 /movies/?id=1000times.htm
11 /movies/?id=10.htm
12 /movies/?id=badrobot2016.htm
13 /movies/?id=10daysinamadhouse.htm
14 /movies/?id=10itemsorless.htm
15 /movies/?id=10questionsforthedalailama.htm
16 /movies/?id=10rules.htm
17 /movies/?id=10thingsihateaboutyou.htm
18 /movies/?id=10tomidnight.htm
19 /movies/?id=10years.htm
20 /movies/?id=10000bc.htm
21 /movies/?id=10000km.htm
22 /movies/?id=100bloodyacres.htm
23 /movies/?id=100yearoldman.htm
24 /movies/?id=1001grams.htm
25 /movies/?id=101dalmations.htm
26 /movies/?id=101dalmatiansliveaction.htm
27 /movies/?id=101dalmatians69.htm
28 /movies

In [None]:
df = pd.DataFrame(dict_1)

In [None]:
df = df.transpose()

In [None]:
df.tail()

In [None]:
df.columns = ['movie', 'genre','release date', 'distributor', 'runtime', 'rating', 'budget', 'domestic gross', 'actors', '1st weekend gross', 'widest release', 'producers', 'directors']

In [None]:
df.to_pickle('movies_1.pkl')

In [None]:
df['title'] = df.index

In [None]:
df.index = range(len(df))

In [None]:
# /movies/?id=punchline.htm

In [None]:
df.to_pickle('movies_1_v2.pkl')

In [None]:
df.shape

In [None]:
dict_2 = movie_scraper(all_movies[11924:])

In [None]:
df2 = pd.DataFrame(dict_2)

In [None]:
# pd.DataFrame(dict_2) puts one movie per column; transpose so each movie is a row.
df2 = df2.transpose()
# Column names, in the same order as the list built per movie in movie_scraper.
df2.columns = ['movie', 'genre','release date', 'distributor', 'runtime', 'rating', 'budget', 'domestic gross', 'actors', '1st weekend gross', 'widest release', 'producers', 'directors']
# The index holds the movie titles; keep them as a column, then renumber rows from 0.
df2['title'] = df2.index
df2.index = range(len(df2))

In [None]:
df2.to_pickle('movies_2.pkl')


## Miscellaneous

In [None]:
def get_genres(soup):
    """Return all genres from a specific movie page at boxofficemojo.com.

    First looks for a 'Genres' label and reads one genre per table row from
    the element following the label's parent, skipping the first row. If
    that fails, falls back to the single value next to a 'Genre' label.

    Returns:
        list of str; ["N/A"] when neither form is found.
    """
    try:
        genres = soup.find(id="body").find(text=re.compile("Genres"))
        rows = genres.findParent().findNextSibling().find_all('tr')
        # The first row is skipped, as before (presumably a header row).
        return [row.td.font.a.text for row in rows[1:]]
    except (LookupError, AttributeError):
        # A missing element shows up as AttributeError on None, which the
        # original ``except LookupError`` never caught.
        try:
            genres = soup.find(id="body").find(text=re.compile("Genre"))
            return [genres.findNextSibling().text]
        except Exception:
            return ["N/A"]


def get_title(soup):
    """Return the title from a specific movie page at boxofficemojo.com.

    Uses the <title> text with its last parenthesised part (and everything
    after it) removed. Returns "N/A" when the page has no <title>.
    """
    try:
        title = soup.find("title").text.rsplit('(', 1)[0].strip()
    except (LookupError, AttributeError):
        # A missing <title> gives None, i.e. AttributeError, not LookupError.
        title = "N/A"
    return title


def get_release_date(soup):
    """Return the release date from a specific movie page at boxofficemojo.com.

    Reads the text next to the 'Release Date' label and parses it with the
    format "%B %d, %Y" (e.g. "June 5, 2015").

    Returns:
        datetime, or "N/A" when the label is missing or the text is not in
        that format.
    """
    try:
        date = soup.find(id="body").find(text=re.compile("Release Date"))
        date = date.findNextSibling().text
        return datetime.strptime(date, "%B %d, %Y")
    except (LookupError, AttributeError, ValueError):
        # AttributeError: element missing; ValueError: text is not a date.
        return "N/A"


def get_distributor(soup):
    """Return the movie distributor from a specific movie page at boxofficemojo.com.

    Returns the text next to the 'Distributor' label inside the 'body'
    element, or "N/A" when any of those elements is missing.
    """
    try:
        distributor = soup.find(id="body").find(text=re.compile("Distributor"))
        return distributor.findNextSibling().text
    except (LookupError, AttributeError):
        # A missing element gives None, i.e. AttributeError, not LookupError.
        return "N/A"


def get_rating(soup):
    """Return the MPAA rating from a specific movie page at boxofficemojo.com.

    Returns the text next to the 'MPAA Rating' label inside the 'body'
    element, or "N/A" when any of those elements is missing.
    """
    try:
        rating = soup.find(id="body").find(text=re.compile("MPAA Rating"))
        return rating.findNextSibling().text
    except (LookupError, AttributeError):
        # A missing element gives None, i.e. AttributeError, not LookupError.
        return "N/A"


def get_runtime(soup):
    """Return the runtime in minutes from a specific movie page at boxofficemojo.com.

    Reads the text next to the 'Runtime' label. Accepts "H hrs. M min.",
    "M min." (no hours part) and "H hrs." (no minutes part).

    Returns:
        int: total minutes, or "N/A" when the label is missing or the text
        is not a runtime.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Runtime"))
        text = label.findNextSibling().text
        hours_text, separator, minutes_text = text.partition("hrs.")
        if separator:
            hrs = int(hours_text) * 60
            minutes_text = minutes_text.split(" min.")[0].strip()
            mins = int(minutes_text) if minutes_text else 0
        else:
            # No hours part: the whole text is the minutes. The original
            # crashed here with an uncaught ValueError.
            hrs = 0
            mins = int(text.split(" min.")[0].strip())
        return hrs + mins
    except (LookupError, AttributeError, ValueError):
        return "N/A"


def get_budget(soup):
    """Return the production budget from a specific movie page at boxofficemojo.com.

    Reads the text next to the 'Production Budget' label. Handles amounts in
    millions, including decimals ("$2.5 million"), and plain amounts
    ("$500,000").

    Returns:
        int: the budget in dollars; "N/A" when the page says "N/A", the
        label is missing, or the text is not an amount.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Production Budget"))
        budget = label.findNextSibling().text
        if budget == "N/A":
            return budget
        cleaned = budget.replace("$", "").replace(",", "").strip()
        multiplier = 1
        if "million" in cleaned:
            cleaned = cleaned.split("million")[0].strip()
            multiplier = 1000000
        # round() rather than truncation: 8.2 * 1000000 is 8199999.999999999.
        return int(round(float(cleaned) * multiplier))
    except (LookupError, AttributeError, ValueError, OverflowError):
        return "N/A"


def get_domestic_gross(soup):
    """Return the domestic gross from a specific movie page at boxofficemojo.com.

    Primary form: the text next to the 'Domestic Total Gross: ' label,
    converted to an int. Fallback form: the raw text of the element
    following the parent of a 'Domestic:' label (returned as found, not
    converted).

    Returns:
        int, str (fallback form), or "N/A" when neither is found.
    """
    try:
        gross = soup.find(id="body").find(text=re.compile("Domestic Total Gross: "))
        gross = gross.findNextSibling().text
        return int(gross.replace("$", "").replace(",", ""))
    except (LookupError, AttributeError, ValueError):
        # A missing label shows up as AttributeError on None; the original
        # ``except LookupError`` never reached this fallback.
        try:
            # ``text=`` (the original had the typo ``tex=``, which is
            # treated as an HTML attribute filter).
            gross = soup.find(id="body").find(text=re.compile("Domestic:"))
            return gross.findParent().findNextSibling().text
        except Exception:
            return "N/A"
        
        
def get_movie_value_next(soup, field_name):
    '''Return the text of whatever directly follows a field label.

    Finds the first text node matching the regular expression
    ``field_name`` and returns ``.next.text`` of that node (one ``.next``
    hop, not the next sibling). Returns None when no text node matches.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    return label.next.text

def get_actors(soup):
    """Return the movie's actors as a list of strings.

    The text after the 'Actor' label is split into names by inserting ", "
    wherever a lowercase letter or parenthesis is immediately followed by
    an uppercase letter; '(', '*' and apostrophes are then removed and the
    result is split on ','.

    Raises:
        TypeError: when no 'Actor' value is found (the lookup returns None).
    """
    actor = get_movie_value_next(soup, 'Actor')
    # Raw strings: '\g' is not a valid escape in a normal string literal.
    v = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
    actors = re.sub(r"[(*']", '', v)
    return actors.split(",")


def money_to_int(moneystring):
    """Strip '$' and ',' from a dollar string and return it as an int."""
    stripped = moneystring
    for symbol in ('$', ','):
        stripped = stripped.replace(symbol, '')
    return int(stripped)

def get_opening_weekend(soup):
    """Return the opening-weekend gross as an int.

    Passes the text found after the 'Opening Weekend' label to
    money_to_int; no errors are handled here.
    """
    return money_to_int(get_movie_value_next(soup, 'Opening Weekend'))
    
    
def scrape_movie_data(movie_list, start=0, end=20000):
    """Return a dictionary of movies and relevant data from boxofficemojo.com.

    Args:
        movie_list: iterable of url paths (appended to
            "http://www.boxofficemojo.com/").
        start: position of the first movie to scrape (inclusive).
        end: position at which to stop (exclusive).

    Returns:
        dict mapping title to [genres (list), release date, distributor,
        runtime, MPAA rating, budget, gross domestic revenue, actors,
        opening weekend gross].

    A movie whose page cannot be fetched or parsed is logged and skipped.
    """
    movie_data_list = {}
    # enumerate() keeps positions tied to movie_list; the original counter
    # skipped position ``start`` itself and stopped advancing after an error.
    for counter, movie in enumerate(movie_list):
        if counter < start:
            continue
        if counter >= end:
            break
        try:
            url = "http://www.boxofficemojo.com/" + movie

            response = requests.get(url)
            time.sleep(0.1)
            page = response.text
            soup = BeautifulSoup(page, "lxml")
            print(movie)
            movie_data_list[get_title(soup)] = [get_genres(soup), get_release_date(soup),
                                                get_distributor(soup), get_runtime(soup),
                                                get_rating(soup), get_budget(soup),
                                                get_domestic_gross(soup), get_actors(soup),
                                                get_opening_weekend(soup)]
        except Exception:
            # Still best-effort, but leave a trace instead of ``except: pass``
            # and let KeyboardInterrupt through.
            logging.exception("Failed to scrape %s", movie)

    return movie_data_list


#def main():
#    pass

#if __name__ == "__main__":
#    main()

In [None]:
movie_data_subset = scrape_movie_data(all_movies_subset, start=0, end=20000)

In [None]:
movie_data_subset

In [None]:
df = pd.DataFrame(movie_data_subset)

In [None]:
df = df.transpose()

In [None]:
# Scratch check of the index parsing on a single page (letter A, page 1).
movies_list_A = []
url = 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=1'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
rows = soup.find(id = 'body').find('table').find('table').find_all('table')[1].find_all("tr")
# skip index row
for row in rows[1:]:
    link = row.td.font.a['href']
    # don't add duplicates (the original tested against `movies_list`,
    # which is not defined at notebook level and raised NameError)
    if link not in movies_list_A:
        movies_list_A.append(link)

In [None]:
def movie_actors(soup):
    """Return the movie's actors as a list of strings.

    The text found after the 'Actor' label gets ", " inserted wherever a
    lowercase letter or parenthesis is immediately followed by an uppercase
    letter. If the result contains '*', it is split on '*'; otherwise '(',
    '*' and apostrophes are removed and it is split on ','.

    Returns:
        list of str, or NaN when the value is missing or has no text.
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor')
        # Raw strings: '\g' is not a valid escape in a normal string literal.
        v = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
        if "*" in v:
            actors = v.split('*')
        else:
            actors = re.sub(r"[(*']", '', v).split(",")
    except (TypeError, AttributeError):
        # TypeError: lookup returned None; AttributeError: the node reached
        # by the helper has no usable attribute.
        actors = float('NaN')
    return actors

In [None]:
def get_movie_value_next_next(soup, field_name):
    '''Grab the text two ``.next`` hops after a field label.

    ``field_name`` is a regular expression used to find the first matching
    text node; the function returns ``.next.next.text`` of that node, or
    None when nothing matches.
    '''
    found = soup.find(text=re.compile(field_name))
    return found.next.next.text if found else None

In [None]:
def get_movie_value_next(soup, field_name):
    '''Grab the text of the sibling that follows a field label.

    ``field_name`` is a regular expression used to find the first matching
    text node. Returns the ``.text`` of that node's next sibling, or None
    when the label or the sibling does not exist.
    '''
    found = soup.find(text=re.compile(field_name))
    sibling = found.findNextSibling() if found else None
    if sibling:
        return sibling.text
    return None

In [None]:
def release_date(soup):
    """Return the movie's release date, or NaN when it is not a usable date.

    The text next to the 'Release Date' label is parsed with to_date unless
    it is 'TBD' or 'N/A', or starts with a season name ('Spring', 'Summer',
    'Fall', 'Winter').

    Returns:
        The parsed date, or NaN when the value is missing, empty, one of the
        placeholders above, or cannot be parsed.
    """
    placeholders = ('TBD', 'N/A')
    seasons = ('Spring', 'Summer', 'Fall', 'Winter')
    try:
        raw_release_date = get_movie_value(soup, 'Release Date')
        # None -> AttributeError, empty string -> IndexError; both mean "no date".
        first_word = raw_release_date.split()[0]
        if raw_release_date in placeholders or first_word in seasons:
            return float('NaN')
        return to_date(raw_release_date)
    except (AttributeError, IndexError, ValueError, OverflowError):
        # ValueError/OverflowError: the text could not be parsed as a date.
        return float('NaN')

In [None]:
!pwd

In [None]:
def get_actors(soup):
    """Return the movie's actors as a list of strings.

    The text after the 'Actor' label gets ", " inserted wherever a lowercase
    letter or parenthesis is immediately followed by an uppercase letter;
    the '(, Voice)' residue this produces is dropped, '(', '*' and
    apostrophes are removed, and the result is split on ','.

    Raises:
        TypeError: when no 'Actor' value is found (the lookup returns None).
    """
    actor = get_movie_value_next(soup, 'Actor')
    # Raw strings: '\g' is not a valid escape in a normal string literal.
    v = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
    v = v.replace('(, Voice)', '')
    actors = re.sub(r"[(*']", '', v)
    return actors.split(",")

In [None]:
def movie_actors(soup):
    """Return the movie's actors as a list of strings.

    The text found after the 'Actor' label gets ", " inserted wherever a
    lowercase letter or parenthesis is immediately followed by an uppercase
    letter. If the result contains '*', it is split on '*'; otherwise '(',
    '*' and apostrophes are removed and it is split on ','.

    Returns:
        list of str, or NaN when the value is missing or has no text.
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor')
        # Raw strings: '\g' is not a valid escape in a normal string literal.
        v = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
        if "*" in v:
            actors = v.split('*')
        else:
            actors = re.sub(r"[(*']", '', v).split(",")
    except (TypeError, AttributeError):
        # TypeError: lookup returned None; AttributeError: the node reached
        # by the helper has no usable attribute.
        actors = float('NaN')
    return actors

In [None]:
# Fetch one movie page for interactive testing of the parsing helpers.
# Swap in the commented-out url to try a different page.
#url = 'http://www.boxofficemojo.com/movies/?id=disney2016.htm'
url = 'http://www.boxofficemojo.com/movies/?id=ghostbusters2016.htm'
response = requests.get(url)
page = response.text
# `soup` is the global that the following cells pass to the helpers.
soup = BeautifulSoup(page, "lxml")

In [None]:
get_movie_value_next_next(soup, 'Actor')

In [None]:
actor = get_movie_value_next_next(soup,'Actor')

In [None]:
actor

In [None]:
def movie_actors(soup):  # still doesn't deal well with McPeople
    """Return the movie's actors as a list of strings.

    All '*' are removed from the text found after the 'Actor' label, then
    a ',' is inserted wherever a lowercase letter or parenthesis is
    immediately followed by an uppercase letter. The ' (,Voice)' and
    ' (,Cameo)' residue this produces is dropped, remaining '(' and
    apostrophes are removed, and the result is split on ','.

    Returns:
        list of str, or NaN when the value is missing or has no text.
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor')  # try get_movie_value_next if it doesnt work
        actor = actor.replace('*', '')
        # Raw strings: '\g' is not a valid escape in a normal string literal.
        v = re.sub(r'([a-z()])([A-Z])', r'\g<1>,\g<2>', actor)
        v = v.replace(' (,Voice)', '')
        v = v.replace(' (,Cameo)', '')
        actors = re.sub(r"[(*']", '', v)
        actors = actors.split(',')
    except (TypeError, AttributeError):
        # Missing value (None) or a node without usable text: no actor data.
        actors = float('NaN')
    return actors

In [None]:
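# TODO: the capital-letter split in movie_actors breaks surnames with an
# internal capital. Idea: use startswith() to special-case the prefixes
# Mc, De, van, Mac, Du, Le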

In [None]:
print (movie_actors(soup))

In [None]:
actor = get_movie_value_next_next(soup, 'Actor')

In [None]:
actor = actor.replace('*','')

In [None]:
actor

In [None]:
s = actor.replace('(Cameo)','')

In [None]:
v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)

In [None]:
v