# WebScraping

In [1]:
from dogpile.cache import make_region
import requests
from lxml import html
import csv
import pandas as pd
import pickle

In [2]:
from dogpile.cache import make_region

cache = make_region().configure(
    'dogpile.cache.memory',
    expiration_time = 24*2600
)

In [3]:
# Visualisation utils from Kris Joanidis
import IPython.display
def display_html(string, script = False, iframe = False, style  = False):
    '''Display HTML string in iPython notebook. <script> tags are removed by default.'''
    root = html.fromstring(string) 
    if not script:
        for element in root.iter("script"):
            element.drop_tree()
    if not iframe:
        for element in root.iter("iframe"):
            element.drop_tree()
    if not style:
        for element in root.iter("style"):
            element.drop_tree()
        for element in root.iter("link"):
            if element.attrib['rel'] == "stylesheet":
                element.drop_tree()
    IPython.display.display(IPython.display.HTML(html.tostring(root).decode('utf-8')))

## WebScrape Box office mojo by year
http://www.boxofficemojo.com/yearly/chart/?page="+str(pg)+"&view=releasedate&view1=domestic&yr="+str(yr)+"&p=.htm"


In [None]:
# @cache.cache_on_arguments()
def pull_page(yr):
    url = "http://www.boxofficemojo.com/oscar/chart/?yr="+str(yr)+"&view=allmovies&p=.htm"
    data = requests.get(url).text
    return data

def pull_mpage(yr,pg):
    url = "http://www.boxofficemojo.com/yearly/chart/?page="+str(pg)+"&view=releasedate&view1=domestic&yr="+str(yr)+"&p=.htm"
    data = requests.get(url).text
    return data
    
def make_soup(data):
    soup = BeautifulSoup(data,"lxml")
    return soup

In [None]:
## 2002 is the first year they add 'close' column to table. 

def mojo_movies():
    nmdf_list = []
    for yr in range(2002,2016):
        pg = 1
        page = html.fromstring(pull_mpage(yr,pg))
        tables = page.cssselect('table table table')
        while len(tables) > 0 :
            df = pd.read_html(html.tostring(tables[1]),header=0)[0]
            df.columns = ["Rank","Title","Studio", "Total_Gross","Gross_no_Theaters", "Opening",
                          "Opening_no_Theaters","Open_date","Close_date"]
            df = df.ix[df.index[1:-3]]

            df['Year'] = yr
            nmdf_list.append(df)

            pg += 1
            page = html.fromstring(pull_mpage(yr,pg))
            tables = page.cssselect('table table table')

    return nmdf_list

In [None]:
## PIPELINE
recent = mojo_movies()
df1 = pd.concat(recent)
df1.to_pickle('2002to2016movie_data.pkl')

## WebScrape Box office mojo by oscars
http://www.boxofficemojo.com/oscar/chart/?yr="+str(yr)+"&view=allmovies&p=.htm

In [None]:
def mojo_oscars():
    nmdf_list = []
    for yr in range(1980,2016):
        page = html.fromstring(pull_page(yr))
        tables = page.cssselect('table table')
#         display_html(html.tostring(tables[0]))
#         print tables
        df = pd.read_html(html.tostring(tables[0]),header=0)[0]
        df.columns = ["Row","YRRank","Picture","Studio","Box Office","Noms","Wins"]
        df['Year'] = yr
        nmdf_list.append(df)
    return nmdf_list

In [None]:
## PIPELINE
# recent = mojo_oscars()
df1 = pd.concat(recent)
# df1.to_pickle('oscar2.pkl')

In [None]:
df1.head()

## IMDB WebScrape
1. http://www.imdb.com/find?ref_=nv_sr_fn&q=  (search for movie title)
2. http://www.imdb.com/title/tt0076759/?ref_=fn_al_tt_1 (pull movie from search results)
3. capture ['IMDB_Score','Users','Metascore','Director','Writers','Actors','Genres','Rating','Title']

In [4]:
BASE_URL = 'http://www.imdb.com'


In [5]:
@cache.cache_on_arguments()


def imdb_data(title):
#     movie = '+'.join(title.split())
    title = '"%s"' % title
    movie = '+'.join(title.split())
    print movie
    url = "%s/find?s=tt&q=%s" % (BASE_URL, movie)
    print url
    page = html.fromstring(requests.get(url).text)
#     display(page)
    try: 
#         movie_list = page.cssselect('table.findList tr td.result_text a')
        movie_list = page.cssselect('table.findList')

#     display_html(html.tostring(page.cssselect('table.findList')[0]))
    except:
        movie_list = []
        failed_movies.append(title)
        print "failedHere"
    return movie_list[0]

def imdb_title():
    return "nothing"

def imdb_page(movie_list):
    movie_link = movie_list[0].get('href')
    movie_page_url = BASE_URL + movie_link
    print movie_page_url
    next_page = html.fromstring(requests.get(movie_page_url).text)
    print "next_page gotten"
    return next_page

def imdb_pull(next_page):
    
#     next_page = html.fromstring(requests.get(next_page).text)
    imdb_rating = next_page.cssselect('table div[class=star-box-details] span[itemprop=ratingValue]')[0].text_content()
#     print imdb_rating
    genres = next_page.cssselect('table div[class=infobar] span[itemprop="genre"]')
    genre_list = tuple([genre.text_content() for genre in genres])
#     rating = next_page.cssselect('table div[class=infobar] meta.content')[0]
#     print genre_list
    rating = next_page.cssselect('table div[class=infobar] meta')[0].get('content')
#     print rating
    users = next_page.cssselect('table div[class=star-box-details] span[itemprop=ratingCount]')[0].text_content()
#     print users
    metascore = next_page.cssselect('table div[class=star-box-details] a[href="criticreviews?ref_=tt_ov_rt"]')[0].text_content()
#     print metascore
    director = next_page.cssselect('table div[itemprop="director"] span[itemprop=name]')[0].text_content()
#     print director
    writers = next_page.cssselect('table div[itemprop="creator"] span[itemprop=name]')
    writer_list = tuple([writer.text_content() for writer in writers])
#     print writer_list
    actors = next_page.cssselect('table div[itemprop="actors"] span[itemprop=name]')
    actor_list = tuple([actor.text_content() for actor in actors])
#     print actor_list
    return [imdb_rating, users, metascore, director, writer_list, actor_list, genre_list,rating]



In [26]:
columns=["IMDB_Score","Users","Metascore","Director","Writers","Actors","Genres","Rating"]
imdb_df =pd.DataFrame(columns=columns) 
n_failed_movies = []
for title in ['Zoolander','Star Wars']:
    movie_table = imdb_data(title) 
#     for i in range(0,5):
#         if 
    print html.tostring(movie_table.cssselect('tr td.result_text a')[0])
#     .get('href')
    print movie_table.cssselect('tr td.result_text a')[0].text_content()
    print movie_table.cssselect('tr td.result_text a')[0].get('href')
    print movie_table.cssselect('tr td.result_text a')[0].get('href')
    for test in movie_table.cssselect('tr td.result_text')[0]:
        print html.tostring(test)
    
#     print html.tostring(movie_table.cssselect('tr td.result_text a')[1])
#     print html.tostring(movie_table.cssselect('tr td.result_text a')[2])
#     print html.tostring(movie_table.cssselect('tr td.result_text a')[1])


    
#     print html.tostring(movie_table.cssselect('')
    if len(movie_list) > 0:
        next_page = imdb_page(movie_list)
#         display_html(html.tostring(next_page))
        try:
            row = imdb_pull(next_page)
            dictionary = dict(zip(columns,row))
            dictionary['Title'] = title
            imdb_df = imdb_df.append(dictionary, ignore_index=True)
        except:
            print "failed"
            n_failed_movies.append(title)
            continue

<a href="/title/tt1608290/?ref_=fn_tt_tt_1">Zoolander 2</a> (2016) 
Zoolander 2
/title/tt1608290/?ref_=fn_tt_tt_1
/title/tt1608290/?ref_=fn_tt_tt_1
<a href="/title/tt1608290/?ref_=fn_tt_tt_1">Zoolander 2</a> (2016) 


NameError: name 'movie_list' is not defined

In [None]:
imdb_df.to_pickle('imdb_2')

In [None]:
# failed_movies
with open('imdb_failed_movies.pkl', 'w') as picklefile:
    pickle.dump(failed_movies, picklefile)

In [None]:
len(n_failed_movies)


In [None]:
#unpickle
with open("titlelist.pkl", 'r') as picklefile: 
    movie_titles = pickle.load(picklefile)
with open("imdb_t6.pkl", 'r') as picklefile: 
    completed = pickle.load(picklefile)
with open("imdb_failed_movies.pkl", 'r') as picklefile: 
    failed_movies = pickle.load(picklefile)

In [None]:
m_titles = list(movie_titles.values)
c_titles = list(completed.Title.values)


In [None]:
ic_titles = set(m_titles) - (set(m_titles) & (set().union(*[c_titles,failed_movies])))
ic = sorted(ic_titles, reverse=True)


In [None]:
len(ic)

In [None]:
ic = set(ic) - set([u"Le combat dans l'\xeele (1962)",u'A Prophet (Un proph\xe8te)'])
#  u"Le combat dans l'\xeele (1962)"

In [None]:
ic

In [None]:
failed_movies

In [None]:
## http://data.bls.gov/pdq/SurveyOutputServlet 
cpi_excel = open('SeriesReport.xlsx','r')
cpidf = pd.read_excel(cpi_excel,header=9).set_index('Year')
cpi = cpidf.drop({'HALF1','HALF2'},axis=1)

In [None]:
completed.head()

In [None]:
## Base Period:  1982-84=100
%matplotlib inline
from datetime import datetime
import matplotlib.pyplot as plt
cpi.head(n=10)
# cpi['Jan'][1980]
# plt.plot()
cpi.keys()pandas.to_datetime(*args, 


In [None]:
unstacked_cpi = cpi.unstack()

In [None]:
unstacked_cpi.reset_index()

In [None]:
unstacked_cpi_df = pd.DataFrame(unstacked_cpi).reset_index()

In [None]:
unstacked_cpi_df.columns = ['month', 'year','cpi']

In [None]:
unstacked_cpi_df.head()

In [None]:
unstacked_cpi_df["Date"] = df.bar.map(str) + " is " + df.foo