## Import all needed modules

In [1]:
import sys
import subprocess

if 'darwin' in sys.platform:
    print('Running \'caffeinate\' on MacOSX to prevent the system from sleeping')
    subprocess.Popen('caffeinate')

Running 'caffeinate' on MacOSX to prevent the system from sleeping


In [2]:
import requests 
import string
from bs4 import BeautifulSoup  
import logging  
logging.basicConfig(level=logging.DEBUG)
import dateutil.parser
import time
import re
from collections import OrderedDict
import pandas as pd
from datetime import datetime
import numpy as np

## Get movie urls from boxofficemojo.com

In [3]:
def get_all_movies():  
    """ returns all the movie urls from boxofficemojo.com in a list"""

    # Alphabet loop for how movies are indexed including
    # movies that start with a special character or number
    index = ["NUM"] + list(string.ascii_uppercase)

    # List of movie urls
    movies_list = []

    # Loop through the pages for each letter
    for letter in index:

        # Loop through the pages within each letter
        for num in range(1, 20):
            url = ("http://www.boxofficemojo.com/movies/alphabetical.htm?"
                   "letter=" + letter + "&page=" + str(num))
            try:
                response = requests.get(url)
                page = response.text
                soup = BeautifulSoup(page, "lxml")
                rows = soup.find(id="body").find("table").find("table").find_all(
                    "table")[1].find_all("tr")

                # skip index row
                if len(rows) > 1:
                    counter = 1
                    for row in rows:
                        # skip index row
                        if counter > 1:
                            link = row.td.font.a['href']
                            # don't add duplicates
                            if link not in movies_list:
                                movies_list.append(link)

                        counter += 1
            except (Exception, e):
                logging.exception(e)

    return movies_list

#### To get the url of the movies, call the function defined above 

In [4]:
all_movies = get_all_movies()

## Define certain useful functions

In [5]:
def get_movie_value(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_sibling = obj.findNextSibling()
    if next_sibling:
        return next_sibling.text # RETURN TEXT OF NEXT SIBLING
    else:
        return None


def get_movie_value_next(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_sibling = obj.findNextSibling()
    if next_sibling:
        return next_sibling.text # RETURN TEXT OF NEXT SIBLING
    else:
        return None



#def get_movie_value_next_next(soup, field_name):
#    '''Grab a value from boxofficemojo HTML
#    
#    Takes a string attribute of a movie on the page and
#    returns the string in the next sibling object
#    (the value for that attribute)
#    or None if nothing is found.
#    '''
#    obj = soup.find(text=re.compile(field_name))
#    
#    if obj:
#        return obj.next.next.text
#    else:
#        return None

def get_movie_value_next_next(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    
    for br in soup.findAll('br'):
        br.replace_with('\n')
    
    obj = soup.find(text=re.compile(field_name))
    
    if obj:
        return obj.next.next.text 
    else:
        return None

def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)


def money_to_int_2(string):
    if string is not None:
        i = string.replace('$','').replace(',','').replace(' ','')
        if "million" in i:
            i = i.replace('million','')
            i = float(i) * 1000000
            i = int(i)
        return i
    else:
        return np.nan

def money_to_int_3(moneystring):
    moneystring = moneystring.replace('\xa0','').replace('$', '').replace(',','')
    

def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def strip_and_return_int(s):
    t = s.replace('$', '').replace(',','')
    t = int(t)
    return t

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

## Define functions to get movie information

In [6]:
def movie_title(soup):        
    #try:
    title_string = soup.find('title').text
    title = title_string.split('(')[0].strip()
    #except:
        #title = float('NaN')
        
    return title 
    
def movie_producers(soup):
    try:
        producer = get_movie_value_next(soup, 'Producer')
        producers = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', producer).split(",") 
    except TypeError:
        producers = float('NaN')
    except AttributeError:
        producers = float('NaN')
        
    return producers
        
def movie_directors(soup):
    try:
        director = get_movie_value_next(soup,'Director')
        director = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', director).split(",") 
    except TypeError:
        director = float('NaN')
    
    return director
    

def movie_actors(soup): # still doesn't deal well with McPersons, DeVitos, DaSouzas, etc. 
    try:
        actor = get_movie_value_next_next(soup,'Actor') # try get_movie_value_next if it doesnt work 
        if str(actor) != 'None':
            actors = actor.replace('*','').replace(' (Cameo)','').replace('(Voice)','').replace('\n',', ')
            actors = actors.split(", ")
            return actors

    except TypeError:
        return actors

def movie_genre(soup):
    try:
        genre = get_movie_value(soup, 'Genre[^a-z]')
    except:
        genre = float('NaN')
    return genre

def movie_rating(soup):
    try:
        rating = get_movie_value(soup,'MPAA Rating')
    except:
        rating = float('NaN')
    return rating

#def release_date(soup):
#    try:
#        raw_release_date = get_movie_value(soup,'Release Date')
#        if raw_release_date != 'TBD' and raw_release_date != 'N/A' and raw_release_date[0].isnumeric() :
#            release_date = to_date(raw_release_date)
#        else:
#            release_date = float('NaN')
#    except AttributeError:
#        release_date = float('NaN')
#    
#    return release_date

def release_date(soup):
    try:
        raw_release_date = get_movie_value(soup,'Release Date')
        if raw_release_date != 'TBD' and raw_release_date != 'N/A' and raw_release_date.split()[0] != 'Spring' and raw_release_date.split()[0] != 'Summer' and raw_release_date.split()[0] != 'Fall' and raw_release_date.split()[0] != 'Winter' :
            release_date = to_date(raw_release_date)
        else:
            release_date = float('NaN')
    except AttributeError:
        release_date = float('NaN')
    return release_date
    
def domestic_gross(soup):
    
    try:
        raw_domestic_total_gross = get_movie_value(soup,'Domestic Total')
        domestic_total_gross = money_to_int(raw_domestic_total_gross)
    except AttributeError:
        domestic_total_gross = float('NaN')
    
    return domestic_total_gross
    
def opening_weekend_gross(soup):
    
    try:
        opening_weekend_gross = get_movie_value_next(soup,'Opening\xa0Weekend:')
        opening_weekend_gross = money_to_int_3(opening_weekend_gross)
    except AttributeError:
        opening_weekend_gross = float('NaN')
    return opening_weekend_gross
    
def production_budget(soup):
    try:
        production_budget = get_movie_value(soup, 'Production Budget')
        production_budget = money_to_int_2(production_budget)
    except AttributeError:
        production_budget = float('NaN')
    return production_budget
    
def runtime(soup):
    try:
        raw_runtime = get_movie_value(soup,'Runtime')
        runtime = runtime_to_minutes(raw_runtime)
    except AttributeError:
        runtime = float('NaN')
    return runtime
    
def widest_release(soup):
    try:
        widest_release = get_movie_value_next_next(soup, 'Widest')
        widest_release = strip_and_return_int(widest_release.split()[0])
    except AttributeError:
        widest_release = float('NaN')
    return widest_release

def distributor(soup):
    try:
        if str(soup.find(id='body')) != 'None':
            distributor = soup.find(id="body").find(text=re.compile("Distributor"))
            if str(distributor) != 'None':
                distributor = distributor.findNextSibling().text
                return distributor
    except LookupError:
        distributor = float('NaN')


## MOVIE SCRAPER

In [18]:
def movie_scraper(all_movies):    
    movie_blocks = list(range(0, len(all_movies)-1, int(len(all_movies)/10)))
    movie_blocks.append(len(all_movies)-1) 
            
    print("The length of urls_chunks, minus one is %s" % str(len(movie_blocks)-1))
    
    movie_data_list = OrderedDict()
    count = 0 
    try:
        for block_index in range(len(movie_blocks) - 1):
                    
            print ("URL chunk ... %s of %s in 10 seconds ... \n" % (str(block_index), str(len(movie_blocks) - 1)))
            
            time.sleep(10)
            
            print ("Fetching URLs in movie_list location: %s to %s \n" % (movie_blocks[block_index], movie_blocks[block_index + 1]))
            for movie in all_movies[movie_blocks[block_index]:movie_blocks[block_index + 1]]:
                count +=1
                url = "http://www.boxofficemojo.com/" + movie
                response = requests.get(url)
                time.sleep(0.1)
                page = response.text
                soup = BeautifulSoup(page, "lxml")
                print(count,movie)
                movie_data_list[movie_title(soup)] = [movie, movie_genre(soup), release_date(soup), 
                                                        distributor(soup), runtime(soup), 
                                                        movie_rating(soup), production_budget(soup), 
                                                        domestic_gross(soup), movie_actors(soup), 
                                                        opening_weekend_gross(soup), widest_release(soup),
                                                        movie_producers(soup), movie_directors(soup) ]
                        
                
                
                #try:
                #    time.sleep(0.2)
                #    response = requests.get(url)
                #    page = response.text
                #    #print(str(re.search('\w+(?=\.htm)', url).group()))
                #    soupObjects[str(re.search('\w+(?=\.htm)',url).group())] = BeautifulSoup(page,'lxml')
                #except requests.exceptions.RequestException as e:
                #    print(e)
                #    sys.exit(1)
                #
                    
            print("continuing in 10 seconds ...")
    except requests.exceptions.RequestException as e:
        print(e)
        pass
        #sys.exit(1)
            
    return movie_data_list


In [8]:
dict_1 = movie_scraper(all_movies)

The length of urls_chunks, minus one is 31
URL chunk ... 0 of 31 in 10 seconds ... 

Fetching URLs in movie_list location: 0 to 556 

1 /movies/?id=horrorifc.htm
2 /movies/?id=9dot99.htm
3 /movies/?id=supercapitalist.htm
4 /movies/?id=500daysofsummer.htm
5 /movies/?id=untitled.htm
6 /movies/?id=andjusticeforall.htm
7 /movies/?id=1mileabove.htm
8 /movies/?id=1plus1.htm
9 /movies/?id=1000rupeenote.htm
10 /movies/?id=1000times.htm
11 /movies/?id=10.htm
12 /movies/?id=badrobot2016.htm
13 /movies/?id=10daysinamadhouse.htm
14 /movies/?id=10itemsorless.htm
15 /movies/?id=10questionsforthedalailama.htm
16 /movies/?id=10rules.htm
17 /movies/?id=10thingsihateaboutyou.htm
18 /movies/?id=10tomidnight.htm
19 /movies/?id=10years.htm
20 /movies/?id=10000bc.htm
21 /movies/?id=10000km.htm
22 /movies/?id=100bloodyacres.htm
23 /movies/?id=100yearoldman.htm
24 /movies/?id=1001grams.htm
25 /movies/?id=101dalmations.htm
26 /movies/?id=101dalmatiansliveaction.htm
27 /movies/?id=101dalmatians69.htm
28 /movies

In [9]:
df = pd.DataFrame(dict_1)

In [24]:
df.to_pickle('001.pkl')

In [12]:
dict_2 = movie_scraper(all_movies[1522:])

The length of urls_chunks, minus one is 31
URL chunk ... 0 of 31 in 10 seconds ... 

Fetching URLs in movie_list location: 0 to 505 

1 /movies/?id=benhur25.htm
2 /movies/?id=benhur2016.htm
3 /movies/?id=benchwarmers.htm
4 /movies/?id=benditlikebeckham.htm
5 /movies/?id=bendabilili.htm
6 /movies/?id=beneaththedarkness.htm
7 /movies/?id=beneaththeharvestsky.htm
8 /movies/?id=beneaththeplanetoftheapes.htm
9 /movies/?id=thebenefactor.htm
10 /movies/?id=benjaminsmoke.htm
11 /movies/?id=benji.htm
12 /movies/?id=benji04.htm
13 /movies/?id=benjithehunted.htm
14 /movies/?id=bennyandjoon.htm
15 /movies/?id=bent.htm
16 /movies/?id=beowulf.htm
17 /movies/?id=beowulfandgrendel.htm
18 /movies/?id=berberiansoundstudio.htm
19 /movies/?id=bereavement.htm
20 /movies/?id=berkeley.htm
21 /movies/?id=berlinfile.htm
22 /movies/?id=bernie.htm
23 /movies/?id=bertrigbyyoureafool.htm
24 /movies/?id=bertstern.htm
25 /movies/?id=besharam.htm
26 /movies/?id=besieged.htm
27 /movies/?id=besotted.htm
28 /movies/?id=

In [13]:
df2 = pd.DataFrame(dict_2)

In [25]:
df2.to_pickle('002.pkl')

In [None]:
dict_3 = movie_scraper(all_movies[7200:])

The length of urls_chunks, minus one is 11
URL chunk ... 0 of 11 in 10 seconds ... 

Fetching URLs in movie_list location: 0 to 948 

1 /movies/?id=ifyoubuildit.htm
2 /movies/?id=ifyoucouldseewhatihear.htm
3 /movies/?id=ifyoudontiwill.htm
4 /movies/?id=if.htm
5 /movies/?id=igbygoesdown.htm
6 /movies/?id=igor.htm
7 /movies/?id=ikiru02.htm
8 /movies/?id=ilbarbieredisiviglia.htm
9 /movies/?id=ilcuorealtrove.htm
10 /movies/?id=ildivo.htm
11 /movies/?id=ilfuturo.htm
12 /movies/?id=ilnefautjurerderien.htm
13 /movies/?id=ilpostino.htm
14 /movies/?id=ilposto.htm
15 /movies/?id=iltritticoatlascala.htm
16 /movies/?id=illgottengains.htm
17 /movies/?id=illegal2011.htm
18 /movies/?id=illegaltender.htm
19 /movies/?id=illegallyyours.htm
20 /movies/?id=illuminata.htm
21 /movies/?id=illumination2019.htm
22 /movies/?id=illumination2020.htm
23 /movies/?id=illusion.htm
24 /movies/?id=illusionist.htm
25 /movies/?id=illusionist2010.htm
26 /movies/?id=illustratedman.htm
27 /movies/?id=iloilo.htm
28 /movies/?

In [21]:
df3 = pd.DataFrame(dict_3)

In [26]:
df3.to_pickle('003.pkl')

In [27]:
dict_4 = movie_scraper(all_movies[10284:])

The length of urls_chunks, minus one is 11
URL chunk ... 0 of 11 in 10 seconds ... 

Fetching URLs in movie_list location: 0 to 639 

1 /movies/?id=mylifesofar.htm
2 /movies/?id=mylifewithoutme.htm
3 /movies/?id=mylifesinturnaround.htm
4 /movies/?id=mylittlechickadee.htm
5 /movies/?id=mylittlegirl.htm
6 /movies/?id=mylittlepony.htm
7 /movies/?id=mylittlepony2017.htm
8 /movies/?id=dontcrossthatriver.htm
9 /movies/?id=myluckystar.htm
10 /movies/?id=mymexicanshivah.htm
11 /movies/?id=mymotherlikeswomen.htm
12 /movies/?id=mymotherscastle.htm
13 /movies/?id=mymotherssmile.htm
14 /movies/?id=mynameisbruce.htm
15 /movies/?id=mynameisjoe.htm
16 /movies/?id=mynameiskhan.htm
17 /movies/?id=mynewgun.htm
18 /movies/?id=myoldlady.htm
19 /movies/?id=myoneandonly.htm
20 /movies/?id=myownprivateidaho.htm
21 /movies/?id=myperestroika.htm
22 /movies/?id=mypieceofthepie.htm
23 /movies/?id=myreincarnation.htm
24 /movies/?id=myscienceproject.htm
25 /movies/?id=mysisterskeeper.htm
26 /movies/?id=mysonthefan

In [28]:
df4 = pd.DataFrame(dict_4)

In [34]:
df4.to_pickle('004.pkl')

In [35]:
dict_5 = movie_scraper(all_movies[15372:])

The length of urls_chunks, minus one is 10
URL chunk ... 0 of 10 in 10 seconds ... 

Fetching URLs in movie_list location: 0 to 131 

1 /movies/?id=undefeated.htm
2 /movies/?id=undefeated11.htm
3 /movies/?id=undefeated2012.htm
4 /movies/?id=underfire.htm
5 /movies/?id=undermyskin.htm
6 /movies/?id=underourskin.htm
7 /movies/?id=undersiege.htm
8 /movies/?id=undersiege2.htm
9 /movies/?id=undersuspicion.htm
10 /movies/?id=undersuspicion92.htm
11 /movies/?id=undertheboardwalk.htm
12 /movies/?id=underboardwalk.htm
13 /movies/?id=underthecherrymoon.htm
14 /movies/?id=underthedomimtree.htm
15 /movies/?id=undertheelectricsky.htm
16 /movies/?id=undertherainbow.htm
17 /movies/?id=underthesamemoon.htm
18 /movies/?id=underthesand.htm
19 /movies/?id=underthesea3d.htm
20 /movies/?id=undertheshadow.htm
21 /movies/?id=undertheskin.htm
22 /movies/?id=undertheskin2014.htm
23 /movies/?id=undertheskinofthecity.htm
24 /movies/?id=underthesun.htm
25 /movies/?id=underthesun2016.htm
26 /movies/?id=underthesun

In [38]:
df5 = pd.DataFrame(dict_5)

In [39]:
df5.to_pickle('005.pkl')

In [40]:
dict_6 = movie_scraper(all_movies[15881:])

The length of urls_chunks, minus one is 11
URL chunk ... 0 of 11 in 10 seconds ... 

Fetching URLs in movie_list location: 0 to 80 

1 /movies/?id=wasabituna.htm
2 /movies/?id=wasabituna05.htm
3 /movies/?id=wash.htm
4 /movies/?id=washdryandspinout.htm
5 /movies/?id=washingtonheights.htm
6 /movies/?id=washingtonsquare.htm
7 /movies/?id=wassuprockers.htm
8 /movies/?id=wasteland.htm
9 /movies/?id=thewastedtimes.htm
10 /movies/?id=neighborhoodwatch.htm
11 /movies/?id=watcher.htm
12 /movies/?id=watcherinthewoods.htm
13 /movies/?id=watchers.htm
14 /movies/?id=watchersofthesky.htm
15 /movies/?id=watchingtvwiththeredchinese.htm
16 /movies/?id=watchmen.htm
17 /movies/?id=water.htm
18 /movies/?id=water06.htm
19 /movies/?id=waterandpower.htm
20 /movies/?id=watercanada.htm
21 /movies/?id=waterdiviner.htm
22 /movies/?id=waterforelephants.htm
23 /movies/?id=waterhorse.htm
24 /movies/?id=waterlilies.htm
25 /movies/?id=waterboy.htm
26 /movies/?id=watercolors.htm
27 /movies/?id=waterdance.htm
28 /movie

In [41]:
df6 = pd.DataFrame(dict_6)

In [42]:
df6.to_pickle('006.pkl')

## transpose the dfs

In [43]:
df = df.transpose()
df2 = df2.transpose()
df3 = df3.transpose()
df4 = df4.transpose()
df5 = df5.transpose()
df6 = df6.transpose()

In [None]:
df.columns = ['movie', 'genre','release date', 'distributor', 'runtime', 'rating', 'budget', 'domestic gross', 'actors', '1st weekend gross', 'widest release', 'producers', 'directors']

In [44]:
df_all = pd.concat([df, df2, df3, df4, df5, df6])

In [48]:
df_all.shape
df_all.columns = ['movie', 'genre','release date', 'distributor', 'runtime', 'rating', 'budget', 'domestic gross', 'actors', '1st weekend gross', 'widest release', 'producers', 'directors']

In [49]:
df_all.head()

Unnamed: 0,movie,genre,release date,distributor,runtime,rating,budget,domestic gross,actors,1st weekend gross,widest release,producers,directors
#Horror,/movies/?id=horrorifc.htm,Horror,2015-11-20 00:00:00,IFC,90,Unknown,,,"[Balthazar Getty, Timothy Hutton, Natasha Lyon...",,,,
$9.99,/movies/?id=9dot99.htm,Animation,2008-12-12 00:00:00,Regent Releasing,78,R,,52384.0,,,4.0,,
$upercapitalist,/movies/?id=supercapitalist.htm,Thriller,2012-08-10 00:00:00,Truly Indie,96,Unrated,,15919.0,,,1.0,,
,/movies/?id=untitled.htm,Comedy,2009-10-23 00:00:00,Samuel Goldwyn,96,R,,230600.0,"[Adam Goldberg, Marley Shelton]",,25.0,,
...And Justice for All,/movies/?id=andjusticeforall.htm,Drama,1979-10-19 00:00:00,Columbia,119,R,,33300000.0,"[Craig T. Nelson, Al Pacino]",,,,


In [50]:
df_all['title'] = df_all.index

In [51]:
df_all.head()

Unnamed: 0,movie,genre,release date,distributor,runtime,rating,budget,domestic gross,actors,1st weekend gross,widest release,producers,directors,title
#Horror,/movies/?id=horrorifc.htm,Horror,2015-11-20 00:00:00,IFC,90,Unknown,,,"[Balthazar Getty, Timothy Hutton, Natasha Lyon...",,,,,#Horror
$9.99,/movies/?id=9dot99.htm,Animation,2008-12-12 00:00:00,Regent Releasing,78,R,,52384.0,,,4.0,,,$9.99
$upercapitalist,/movies/?id=supercapitalist.htm,Thriller,2012-08-10 00:00:00,Truly Indie,96,Unrated,,15919.0,,,1.0,,,$upercapitalist
,/movies/?id=untitled.htm,Comedy,2009-10-23 00:00:00,Samuel Goldwyn,96,R,,230600.0,"[Adam Goldberg, Marley Shelton]",,25.0,,,
...And Justice for All,/movies/?id=andjusticeforall.htm,Drama,1979-10-19 00:00:00,Columbia,119,R,,33300000.0,"[Craig T. Nelson, Al Pacino]",,,,,...And Justice for All


In [52]:
df_all.to_pickle('all_movies_20161003.pkl')

In [57]:
DF = df_all.sort_values('release date')

In [59]:
DF.index = range(len(DF))

In [62]:
DF['release date'][0]

datetime.datetime(1919, 10, 3, 0, 0)

In [66]:
DF_90s = DF[DF['release date'] > datetime(1989, 12, 31, 0, 0)]

In [72]:
DF_90s.to_pickle('movies_from_the_90s.pkl')

# Work

In [1]:
import pandas as pd 

In [2]:
DF_90s = pd.read_pickle('movies_from_the_90s.pkl')

In [3]:
DF_90s

Unnamed: 0,movie,genre,release date,distributor,runtime,rating,budget,domestic gross,actors,1st weekend gross,widest release,producers,directors,title
3446,/movies/?id=henryportraitserialkiller.htm,Horror Thriller,1990-01-05 00:00:00,Greycat,83,Unrated,,609939,,,8,,,Henry: Portrait of a Serial Killer
3447,/movies/?id=internalaffairs.htm,Crime Thriller,1990-01-12 00:00:00,Paramount,117,R,,27734391,"[Richard Gere, Elijah Wood]",,986,,,Internal Affairs
3448,/movies/?id=texaschainsaw3.htm,Horror,1990-01-12 00:00:00,New Line,81,R,,5765562,[Viggo Mortensen],,1107,,,Leatherface: The Texas Chainsaw Massacre III
3449,/movies/?id=skipatrol.htm,Sports Comedy,1990-01-12 00:00:00,Triumph,88,PG,,8533973,,,565,,,Ski Patrol
3450,/movies/?id=skipatrol.htm,Sports Comedy,1990-01-12 00:00:00,Triumph,88,PG,,8533973,,,565,,,Ski Patrol
3451,/movies/?id=downtown.htm,Action Comedy,1990-01-12 00:00:00,Fox,96,R,,2346150,[Forest Whitaker],,349,,,Downtown
3452,/movies/?id=sweetie.htm,Drama,1990-01-19 00:00:00,Avenue Pict.,99,R,,938065,,,1,,,Sweetie
3453,/movies/?id=tremors.htm,Horror Comedy,1990-01-19 00:00:00,Universal,95,PG-13,,16667084,"[Kevin Bacon, Fred Ward]",,1477,,,Tremors
3454,/movies/?id=everybodywins.htm,Drama / Thriller,1990-01-19 00:00:00,Orion Pictures,97,R,,1372350,,,451,,,Everybody Wins
3455,/movies/?id=streets.htm,Drama / Thriller,1990-01-19 00:00:00,Concorde,83,R,,1510053,[Christina Applegate],,77,,,Streets


In [4]:
DF_90s['actors_no_nan'] = DF_90s['actors'].fillna('')

In [5]:
def unnest(df, col, reset_index=False):
    import pandas as pd
    col_flat = pd.DataFrame([[i, x] 
                       for i, y in df[col].apply(list).iteritems() 
                           for x in y], columns=['I', col])
    col_flat = col_flat.set_index('I')
    df = df.drop(col, 1)
    df = df.merge(col_flat, left_index=True, right_index=True)
    if reset_index:
        df = df.reset_index(drop=True)
    return df

In [6]:
expected = unnest(DF_90s, 'actors_no_nan')

In [7]:
DF_90s.shape

(14962, 15)

In [8]:
expected.shape

(27392, 15)

In [15]:
expected['count'] = 1

### WORK TO BE DONE HERE

In [41]:
expected
# loop thru all actors in actors_no_nan and get rid of space at the end of string if there is one. 

Unnamed: 0,movie,genre,release date,distributor,runtime,rating,budget,domestic gross,actors,1st weekend gross,widest release,producers,directors,title,actors_no_nan,count
3447,/movies/?id=internalaffairs.htm,Crime Thriller,1990-01-12 00:00:00,Paramount,117,R,,27734391,"[Richard Gere, Elijah Wood]",,986,,,Internal Affairs,Richard Gere,1
3447,/movies/?id=internalaffairs.htm,Crime Thriller,1990-01-12 00:00:00,Paramount,117,R,,27734391,"[Richard Gere, Elijah Wood]",,986,,,Internal Affairs,Elijah Wood,1
3448,/movies/?id=texaschainsaw3.htm,Horror,1990-01-12 00:00:00,New Line,81,R,,5765562,[Viggo Mortensen],,1107,,,Leatherface: The Texas Chainsaw Massacre III,Viggo Mortensen,1
3451,/movies/?id=downtown.htm,Action Comedy,1990-01-12 00:00:00,Fox,96,R,,2346150,[Forest Whitaker],,349,,,Downtown,Forest Whitaker,1
3453,/movies/?id=tremors.htm,Horror Comedy,1990-01-19 00:00:00,Universal,95,PG-13,,16667084,"[Kevin Bacon, Fred Ward]",,1477,,,Tremors,Kevin Bacon,1
3453,/movies/?id=tremors.htm,Horror Comedy,1990-01-19 00:00:00,Universal,95,PG-13,,16667084,"[Kevin Bacon, Fred Ward]",,1477,,,Tremors,Fred Ward,1
3455,/movies/?id=streets.htm,Drama / Thriller,1990-01-19 00:00:00,Concorde,83,R,,1510053,[Christina Applegate],,77,,,Streets,Christina Applegate,1
3458,/movies/?id=heartcondition.htm,Fantasy Comedy,1990-02-02 00:00:00,New Line,100,R,,4134992,[Denzel Washington],,885,,,Heart Condition,Denzel Washington,1
3459,/movies/?id=stella.htm,Drama,1990-02-02 00:00:00,Buena Vista,109,PG-13,,20240128,"[Bette Midler, Ben Stiller]",,1393,,,Stella,Bette Midler,1
3459,/movies/?id=stella.htm,Drama,1990-02-02 00:00:00,Buena Vista,109,PG-13,,20240128,"[Bette Midler, Ben Stiller]",,1393,,,Stella,Ben Stiller,1


In [23]:
expected['domestic gross'][3447]

3447    27734391
3447    27734391
Name: domestic gross, dtype: object

In [27]:
df_converted = expected.convert_objects(convert_dates='coerce', convert_numeric=True)

  if __name__ == '__main__':


In [31]:
df_converted.head()

Unnamed: 0,movie,genre,release date,distributor,runtime,rating,budget,domestic gross,actors,1st weekend gross,widest release,producers,directors,title,actors_no_nan,count
3447,/movies/?id=internalaffairs.htm,Crime Thriller,1990-01-12,Paramount,117.0,R,,27734391.0,"[Richard Gere, Elijah Wood]",,986.0,,,NaT,Richard Gere,1
3447,/movies/?id=internalaffairs.htm,Crime Thriller,1990-01-12,Paramount,117.0,R,,27734391.0,"[Richard Gere, Elijah Wood]",,986.0,,,NaT,Elijah Wood,1
3448,/movies/?id=texaschainsaw3.htm,Horror,1990-01-12,New Line,81.0,R,,5765562.0,[Viggo Mortensen],,1107.0,,,NaT,Viggo Mortensen,1
3451,/movies/?id=downtown.htm,Action Comedy,1990-01-12,Fox,96.0,R,,2346150.0,[Forest Whitaker],,349.0,,,NaT,Forest Whitaker,1
3453,/movies/?id=tremors.htm,Horror Comedy,1990-01-19,Universal,95.0,PG-13,,16667084.0,"[Kevin Bacon, Fred Ward]",,1477.0,,,NaT,Kevin Bacon,1


In [32]:
df_sum = df_converted.groupby('actors_no_nan').sum()

In [34]:
df_sum.index

Index(['""Stone Cold"" Steve Austin', '50 Cent', 'A.J. Cook', 'Aaron Eckhart',
       'Aaron Johnson', 'Aaron Paul', 'Aaron Paul ', 'Aaron Stanford',
       'Aasif Mandvi', 'Aasif Mandvi ',
       ...
       'Zana Marjanovic', 'Zhang Ziyi', 'Zhang Ziyi ', 'Zoe Bell', 'Zoe Kazan',
       'Zoe Saldana', 'Zoe Saldana ', 'Zoey Deutch', 'Zooey Deschanel',
       'Zooey Deschanel '],
      dtype='object', name='actors_no_nan', length=3201)

In [40]:
df_sum.drop_duplicates()

Unnamed: 0_level_0,runtime,budget,domestic gross,1st weekend gross,widest release,producers,directors,count
actors_no_nan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"""""Stone Cold"""" Steve Austin",295.0,8.000000e+07,1.030715e+08,,3400.0,,,3
50 Cent,793.0,2.600000e+08,2.997157e+08,,17571.0,,,7
A.J. Cook,93.0,,3.567000e+03,,1.0,,,1
Aaron Eckhart,3857.0,1.044950e+09,1.026238e+09,,60963.0,,,37
Aaron Johnson,1159.0,5.040000e+08,7.576350e+08,,21760.0,,,13
Aaron Paul,1231.0,3.220000e+08,3.114486e+08,,16612.0,,,11
Aaron Paul,110.0,,,,26.0,,,1
Aaron Stanford,293.0,,3.433370e+05,,43.0,,,3
Aasif Mandvi,845.0,1.875000e+08,1.683686e+08,,16443.0,,,8
Aasif Mandvi,,,,,,,,1


In [None]:
df.index = range(len(df))

In [None]:
# /movies/?id=punchline.htm

In [None]:
df.to_pickle('movies_1_v2.pkl')

In [None]:
df.shape

In [None]:
dict_2 = movie_scraper(all_movies[11924:])

In [None]:
df2 = pd.DataFrame(dict_2)

In [None]:
df2 = df2.transpose()
df2.columns = ['movie', 'genre','release date', 'distributor', 'runtime', 'rating', 'budget', 'domestic gross', 'actors', '1st weekend gross', 'widest release', 'producers', 'directors']
df2['title'] = df2.index
df2.index = range(len(df2))

In [None]:
df2.to_pickle('movies_2.pkl')


## Miscellaneous

In [None]:
def get_genres(soup):
    """ returns all genres from specific movie page at boxofficemojo.com"""
    genres_list = []
    try:
        genres = soup.find(id="body").find(text=re.compile("Genres"))
        genres = genres.findParent().findNextSibling().find_all('tr')
        genre_count = 0
        for genre in genres:
            if genre_count > 0:
                genres_list.append(genre.td.font.a.text)
            genre_count += 1
    except LookupError:
        try:
            genres = soup.find(id="body").find(text=re.compile("Genre"))
            genres = genres.findNextSibling().text
            genres_list.append(genres)
        except:
            genres_list.append("N/A")
    return genres_list


def get_title(soup):
    """returns title from specific movie page at boxofficemojo.com"""
    try:
        title = soup.find("title").text.rsplit('(', 1)[0].strip()
    except LookupError:
        title = "N/A"
    return title


def get_release_date(soup):
    """returns datetime value of release date from specific movie
    page at boxofficemojo.com
    """
    try:
        date = soup.find(id="body").find(text=re.compile("Release Date"))
        date = date.findNextSibling().text
        date = datetime.strptime(date, "%B %d, %Y")
        return date
    except LookupError:
        return "N/A"


def get_distributor(soup):
    """returns movie distributor from specific movie page at boxofficemojo.com"""
    try:
        distributor = soup.find(id="body").find(text=re.compile("Distributor"))
        distributor = distributor.findNextSibling().text
        return distributor
    except LookupError:
        return "N/A"


def get_rating(soup):
    """returns MPAA Rating from specific movie page at boxofficemojo.com"""
    try:
        rating = soup.find(id="body").find(text=re.compile("MPAA Rating"))
        rating = rating.findNextSibling().text
        return rating
    except LookupError:
        return "N/A"


def get_runtime(soup):
    """returns integer value of runtime from specific movie page at boxofficemojo.com"""
    try:
        runtime = soup.find(id="body").find(text=re.compile("Runtime"))
        runtime = runtime.findNextSibling().text
        time_splits = runtime.split("hrs.")
        try:
            hrs = int(time_splits[0]) * 60
        except LookupError:
            hrs = 0
        mins = int(time_splits[1].split(" min.")[0].strip())
        total = hrs + mins
        return total
    except LookupError:
        return "N/A"


def get_budget(soup):
    """returns movie budget from specific movie page at boxofficemojo.com"""
    try:
        budget = soup.find(id="body").find(text=re.compile("Production Budget"))
        budget = budget.findNextSibling().text
        if budget != "N/A":
            budget = int(budget.split("million")[0].split("$")[1].strip()) * 1000000
        return budget
    except LookupError:
        return "N/A"


def get_domestic_gross(soup):
    """returns integer value of domestic gross from specific movie page at boxofficemojo.com"""
    try:
        gross = soup.find(id="body").find(text=re.compile("Domestic Total Gross: "))
        gross = gross.findNextSibling().text
        gross = int(gross.replace("$", "").replace(",", ""))
        return gross
    except LookupError:
        try:
            gross = soup.find(id="body").find(tex=re.compile("Domestic:"))
            gross = gross.findParent().findNextSibling().text
            return gross
        except:
            return "N/A"
        
        
def get_movie_value_next(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    
    if obj:
        return obj.next.text
    else:
        return None

def get_actors(soup):
    actor = get_movie_value_next(soup,'Actor')
    v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
    actors = re.sub('[(*\']','', v)
    actors = actors.split(",")
    return actors


def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def get_opening_weekend(soup):
    opening_weekend_gross = get_movie_value_next(soup,'Opening Weekend')
    opening_weekend_gross = money_to_int(opening_weekend_gross)
    return opening_weekend_gross
    
    
def scrape_movie_data(movie_list, start=0, end=20000):
    """returns dictionary of movies and relevant data from boxofficemojo.com:
    genres(as a list), release date, distributor, runtime, MPAA rating,
    budget, gross domestic revenue
    """
    movie_data_list = {}
    counter = 0
    for movie in movie_list:
        try:
            if start < counter < end and counter < len(movie_list):
                url = "http://www.boxofficemojo.com/" + movie
                
                response = requests.get(url)
                time.sleep(0.1)
                page = response.text
                soup = BeautifulSoup(page, "lxml")
                print(movie)
                movie_data_list[get_title(soup)] = [get_genres(soup), get_release_date(soup), 
                                                    get_distributor(soup), get_runtime(soup), 
                                                    get_rating(soup), get_budget(soup), 
                                                    get_domestic_gross(soup), get_actors(soup), 
                                                    get_opening_weekend(soup)]
                
            counter += 1
        except: #(Exception, e):
            pass

    return movie_data_list


#def main():
#    pass

#if __name__ == "__main__":
#    main()

In [None]:
movie_data_subset = scrape_movie_data(all_movies_subset, start=0, end=20000)

In [None]:
movie_data_subset

In [None]:
df = pd.DataFrame(movie_data_subset)

In [None]:
df = df.transpose()

In [None]:
movies_list_A = []
url = 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=1'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
rows = soup.find(id = 'body').find('table').find('table').find_all('table')[1].find_all("tr")
if len(rows) > 1:
    counter = 1
    for row in rows:
    # skip index row
        if counter > 1:
            link = row.td.font.a['href']
            # don't add duplicates
            if link not in movies_list:
                movies_list_A.append(link)

        counter += 1

In [None]:
def movie_actors(soup):
    try:
        actor = get_movie_value_next_next(soup,'Actor')
        v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
        if "*" in v:
            actors = v.split('*')
        else:
            actors = re.sub('[(*\']','', v)
            actors = actors.split(",")
        #if "*" in actors:
        #    actors =  actors.split('*')

    except TypeError:
        actors = float('NaN')
    return actors

In [None]:
def get_movie_value_next_next(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    
    if obj:
        return obj.next.next.text
    else:
        return None

In [None]:
def get_movie_value_next(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_sibling = obj.findNextSibling()
    if next_sibling:
        return next_sibling.text # RETURN TEXT OF NEXT SIBLING
    else:
        return None

In [None]:
def release_date(soup):
    try:
        raw_release_date = get_movie_value(soup,'Release Date')
        if raw_release_date != 'TBD' and raw_release_date != 'N/A' and raw_release_date.split()[0] != 'Spring' and raw_release_date.split()[0] != 'Summer' and raw_release_date.split()[0] != 'Fall' and raw_release_date.split()[0] != 'Winter' :
            release_date = to_date(raw_release_date)
        else:
            release_date = float('NaN')
    except AttributeError:
        release_date = float('NaN')
    return release_date

In [None]:
!pwd

In [None]:
def get_actors(soup):
    actor = get_movie_value_next(soup,'Actor')
    v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
    v = v.replace('(, Voice)','')
    actors = re.sub('[(*\']','', v)
    actors = actors.split(",")
    return actors

In [None]:
def movie_actors(soup):
    try:
        actor = get_movie_value_next_next(soup,'Actor')
        v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
        if "*" in v:
            actors = v.split('*')
        else:
            actors = re.sub('[(*\']','', v)
            actors = actors.split(",")
        #if "*" in actors:
        #    actors =  actors.split('*')

    except TypeError:
        actors = float('NaN')
    return actors

In [None]:
#url = 'http://www.boxofficemojo.com/movies/?id=disney2016.htm'
url = 'http://www.boxofficemojo.com/movies/?id=ghostbusters2016.htm'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

In [None]:
get_movie_value_next_next(soup, 'Actor')

In [None]:
actor = get_movie_value_next_next(soup,'Actor')

In [None]:
actor

In [None]:
def movie_actors(soup): # still doesn't deal well with McPeople
    try:
        actor = get_movie_value_next_next(soup,'Actor') # try get_movie_value_next if it doesnt work 
        actor = actor.replace('*','')
        v = re.sub('([a-z()])([A-Z])', '\g<1>,\g<2>', actor)
        v = v.replace(' (,Voice)','')
        v = v.replace(' (,Cameo)','')
        actors = re.sub('[(*\']','', v)
        actors = actors.split(',')
        #if "*" in v:
        #    actors = v.split('*')
        #else:
        #    actors = re.sub('[(*\']','', v)
        #    actors = actors.split(",")

    except TypeError:
        actors = float('NaN')
    return actors

In [None]:
# startswith()
# Mc, De, van, Mac, Du, Le

In [None]:
print (movie_actors(soup))

In [None]:
actor = get_movie_value_next_next(soup, 'Actor')

In [None]:
actor = actor.replace('*','')

In [None]:
actor

In [None]:
s = actor.replace('(Cameo)','')

In [None]:
v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)

In [None]:
v

In [None]:
#url = 'http://www.boxofficemojo.com/movies/?id=disney2016.htm'
#url = 'http://www.boxofficemojo.com/movies/?id=ghostbusters2016.htm'
#url = "http://www.boxofficemojo.com/movies/?id=13goingon30.htm"
url = "http://www.boxofficemojo.com/movies/?id=9dot99.htm"
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

In [None]:
def get_movie_value_next_next(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    
    for br in soup.findAll('br'):
        br.replace_with('\n')
    
    obj = soup.find(text=re.compile(field_name))
    
    if obj:
        return obj.next.next.text 
    else:
        return None

In [None]:
actor = get_movie_value_next_next(soup, 'Actor')

In [None]:
actor.replace('*','').replace(' (Cameo)','').replace('(Voice)','').replace('\n',', ')

In [None]:
def movie_actors(soup): # still doesn't deal well with McPersons, DeVitos, DaSouzas, etc. 
    try:
        actor = get_movie_value_next_next(soup,'Actor') # try get_movie_value_next if it doesnt work 
        if str(actor) != 'None':
            actors = actor.replace('*','').replace(' (Cameo)','').replace('(Voice)','').replace('\n',', ')
            actors = actors.split(", ")
            #v = v.replace('(Voice)','')
            #v = v.replace('(Cameo)', '')
            #v = v.replace('\n','')
            #if "*" in v:
            #    actors = v.split('*')
            #else:
            #    actors = re.sub('[(*\']','', v)
            return actors

    except TypeError:
        #actors = float('NaN')
        return actors

In [None]:
def movie_actors(soup): # still doesn't deal well with McPersons, DeVitos, DaSouzas, etc. 
    try:
        actor = get_movie_value_next_next(soup,'Actor') # try get_movie_value_next if it doesnt work 
        if str(actor) != 'None':
            actors = actor.replace('*','').replace(' (Cameo)','').replace('(Voice)','').replace('\n',', ')
            actors = actors.split(", ")
            return actors

    except TypeError:
        return actors

In [None]:
print(movie_actors(soup))

In [None]:
opening_weekend_gross(soup)

In [None]:
#fix this here. ARGH!
#def money_to_int_2(moneystring):
#    if moneystring != 'N/A':
#        moneystring = moneystring.replace('$', '').replace('.','').replace(',','').replace(' million', '000000')
#        return int(moneystring)

#def money_to_int_2(moneystring):
#    if moneystring != 'N/A':
#        moneystring = moneystring.replace('$','').replace(',','')
#        if 'million' in moneystring:
#            moneystring.replace('million','')
#            return float(moneystring)*1000000
#        else:
#            return float(moneystring)

In [None]:
#def get_movie_value_next_next(soup, field_name):
#    '''Grab a value from boxofficemojo HTML
#    
#    Takes a string attribute of a movie on the page and
#    returns the string in the next sibling object
#    (the value for that attribute)
#    or None if nothing is found.
#    '''
#    obj = soup.find(text=re.compile(field_name))
#    
#    if obj:
#        return obj.next.next.text
#    else:
#        return None

def get_movie_value_next_next(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    
    for br in soup.findAll('br'):
        br.replace_with('\n')
    
    obj = soup.find(text=re.compile(field_name))
    
    if obj:
        return obj.next.next.text 
    else:
        return None

In [None]:
#def movie_actors(soup):
#    try:
#        actor = get_movie_value_next(soup,'Actor')
#        v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
#        actors = re.sub('[(*\']','', v)
#        actors = actors.split(",") 
#    except TypeError:
#        actors = float('NaN')
#
#    return actors

#def movie_actors(soup):
#    try:
#        actor = get_movie_value_next(soup,'Actor')
#        if '*' in actor:
#            actors =  actor.split('*')
#        else:
#            v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
#            actors = re.sub('[(*\']','', v)
#            actors = actors.split(",") 
#    except TypeError:
#        actors = float('NaN')
#    return actors

#def movie_actors(soup): # still doesn't deal well with McPersons, DeVitos, DaSouzas, etc. 
#    try:
#        actor = get_movie_value_next_next(soup,'Actor') # try get_movie_value_next if it doesnt work 
#        v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
#        v = v.replace('(, Voice)','')
#        v = v.replace('(, Cameo)', '')
#        if "*" in v:
#            actors = v.split('*')
#        else:
#            actors = re.sub('[(*\']','', v)
#            actors = actors.split(",")
#
#    except TypeError:
#        actors = float('NaN')
#    return actors

#def movie_actors(soup): # still doesn't deal well with McPersons, DeVitos, DaSouzas, etc. 
#    try:
#        actor = get_movie_value_next_next(soup,'Actor') # try get_movie_value_next if it doesnt work 
#        actors = actor.replace('*','').replace(' (Cameo)','').replace('(Voice)','').replace('\n',', ')
#        #v = v.replace('(Voice)','')
#        #v = v.replace('(Cameo)', '')
#        #v = v.replace('\n','')
#        #if "*" in v:
#        #    actors = v.split('*')
#        #else:
#        #    actors = re.sub('[(*\']','', v)
#        actors = actors.split(", ")
#
#    except TypeError:
#        actors = float('NaN')
#    return actors
