In [45]:
import requests
import re
from bs4 import BeautifulSoup
import dateutil.parser
from pprint import pprint
import numpy as np
import pandas as pd
import time
import random

In [2]:
def urlToSoup(url):
    '''Take an http request and return a soup object'''
    response = requests.get(url)
    return BeautifulSoup(response.text, 'lxml')

In [80]:
def money_to_int(moneystring):
    if not moneystring:
        return np.NaN
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def parseOpeningRank(openingrankstring):
    openingmatch=re.search('#[0-9] [R|r]ank', openingrankstring)
    return int(openingmatch.group(0)[1:-5])

    
def getOpeningVals(soup):
    '''Scrape opening weekend gross & rank for a single movie and save them in a data frame'''
    entries={}
    #OPENING WEEKEND GROSS
    obj_openinggross= soup.find_all(class_='mp_box_content')[1].find_all('td')[1].text.strip()
    opening_gross= np.NaN
    try:
        opening_gross=money_to_int(obj_openinggross)
        entries['OpeningGross'] = opening_gross
    except:
        pass
    
    #OPENING WEEKEND RANK
    obj_openingrank=soup.find_all(class_="mp_box_content")[1].find_all('td')[2].text
    opening_rank=np.NaN
    try:
        opening_rank=parseOpeningRank(obj_openingrank)
        entries['OpeningRank'] = opening_rank
    except:
        pass
    
    return entries
    

In [25]:
url='http://www.boxofficemojo.com/movies/?id=wonderwoman.htm'
soup=urlToSoup(url)
soupify=soup[1].find_all('td')[2].text


KeyError: 1

In [28]:
print(soup.find_all(class_="mp_box_content")[1].find_all('td')[2].text)

(#1 rank, 4,165 theaters, $24,790 average)


### Scraping
* use a dataframe, append small chunks of the list of movies
* save work partway through, make sure to overwrite dataframe
* when starting over be sure to overwrite years variable

In [4]:
df=pd.read_csv("oscars_dropped_cleaned2.csv")
df.columns

Index(['Unnamed: 0', 'Title', 'DomesticTotalGross', 'RunTime', 'ReleaseDate',
       'DayOfTheYear', 'MonthOfTheYear', 'Director', 'ProductionBudget',
       'InRelease', 'MpaaRating', 'Distributor', 'Genre', 'Nominations',
       'Wins', 'GenreMap', 'DayOfTheYear^2'],
      dtype='object')

In [5]:
df['OpeningGross']=pd.Series()
df['OpeningRank']=pd.Series()
df=df.drop(columns=['Title', 'DomesticTotalGross', 'RunTime', 'ReleaseDate','DayOfTheYear', 'MonthOfTheYear', 'Director',\
                 'ProductionBudget','InRelease', 'MpaaRating', 'Distributor', \
                 'Genre', 'Nominations','Wins', 'GenreMap', 'DayOfTheYear^2'])

In [6]:
df=df.set_index('Unnamed: 0')

In [7]:
df.head()

Unnamed: 0_level_0,OpeningGross,OpeningRank
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
http://www.boxofficemojo.com/movies/?id=ali.htm,,
http://www.boxofficemojo.com/movies/?id=amelie.htm,,
http://www.boxofficemojo.com/movies/?id=americassweethearts.htm,,
http://www.boxofficemojo.com/movies/?id=anygivensunday.htm,,
http://www.boxofficemojo.com/movies/?id=babyboy.htm,,


In [51]:
smaller_df=df.iloc[0:50,:]

In [85]:
def parseLinks(start, end):
    '''Go through the links in the "alllinks" list
    in the range specified by "start" and "end", and
    scrape the data for each movie.
    Returns dataframe of all movies parsed'''
    movies={}
    for i in range(start,end):
        time.sleep(.5+2*random.random())
        link=df.index[i]
        soup=urlToSoup(df.index[i])
        movies[link]=getOpeningVals(soup)
        #print(df.loc[[smaller_df.index[i],['OpeningGross']]])
        #smaller_df[smaller_df.loc[i,'OpeningRank']=values[1]
    return(pd.DataFrame.from_dict(movies, orient="index"))

In [91]:
data=None #for use when starting the whole process over

In [92]:
data=parseLinks(0,500)

In [93]:
data=data.append(parseLinks(500,1001))

In [94]:
data=data.append(parseLinks(1001, len(df.index)))

In [95]:
data.to_csv("cleaned-1497-openingweekend.csv")        

In [97]:
data.isnull().sum()

OpeningGross      0
OpeningRank     161
dtype: int64

* Concatenate all of the csv files into one larger dataframe

In [323]:
df1=pd.read_csv("1980-2017_0001-4000.csv")
df2=pd.read_csv("1980-2017_4001-end.csv")

In [324]:
import pickle

In [325]:
df1.append(df2)
with open('movies_1980-2017.pkl', 'wb') as picklefile:
    pickle.dump(data, picklefile)

In [326]:
!ls

1980-2017_0001-4000.csv
1980-2017_4001-end.csv
2013_movies.csv
Untitled.ipynb
challenge_set_03_katie.ipynb
movies_150-2017.pkl
oscar-scraping.ipynb
[34moscars[m[m
oscars_noms_scraping.ipynb
pairprobJuly11.ipynb
web_scraping_beautifulsoup_kaszklar.ipynb
web_scraping_selenium-kaszklar.ipynb
