### Get the HTML from a page, and convert to a BeautifulSoup object
#### We'll start by scraping some information from Box Office Mojo.

In [1]:
import csv
import datetime
import json
import pickle
import re
import urllib
import urllib2

import pandas as pd
from bs4 import BeautifulSoup

### Build pages

In [14]:
def build_soup_page(page = "", url = "", use_url=False):
    """
    Build a BeautifulSoup object either from markup or from a URL.

    page    -- markup to parse; only used when use_url is False
    url     -- address to download; only used when use_url is True
    use_url -- when True, download `url` and parse the response with the
               'xml' parser; when False, parse `page` with BeautifulSoup's
               default parser

    Returns the resulting BeautifulSoup object.
    """
    if not use_url:
        return BeautifulSoup(page)

    response = urllib2.urlopen(url)
    try:
        # BeautifulSoup reads the whole response while building the tree, so
        # the connection can be released as soon as the constructor returns.
        return BeautifulSoup(response, 'xml')
    finally:
        response.close()

In [4]:
def build_top_index(soup, to_search, beg_url):
    """
    Build a list of absolute URLs from the anchors of a soup object.

    soup      -- object exposing find_all('a', href=True)
    to_search -- prefix an href must start with to be kept
    beg_url   -- string prepended to every kept href

    Returns only the first half of the matching URLs (rounded down),
    presumably because the page repeats its link list -- TODO confirm.
    """
    index = [beg_url + a['href']
             for a in soup.find_all('a', href=True)
             if a['href'].startswith(to_search)]

    # Floor division keeps the slice bound an integer even where "/" is
    # true division.
    return index[:len(index) // 2]

In [5]:
def build_sub_index(soup, search_one, search_two, beg_url):
    """
    Build a list of absolute URLs from anchors matching a prefix and a pattern.

    soup       -- object exposing find_all('a', href=True)
    search_one -- prefix an href must start with
    search_two -- regular expression that must also be found in the href
    beg_url    -- string prepended to every kept href

    Returns only the first half of the matching URLs (rounded down),
    presumably because the page repeats its link list -- TODO confirm.
    """
    index = [beg_url + a['href']
             for a in soup.find_all('a', href=True)
             if a['href'].startswith(search_one)
             and re.search(search_two, a['href']) is not None]

    # Floor division keeps the slice bound an integer even where "/" is
    # true division.
    return index[:len(index) // 2]

In [6]:
def build_single_movie_url_list(total_urls, box_url):
    """
    Build the set of every individual movie URL linked from the index pages.

    total_urls -- iterable of index-page URLs to download
    box_url    -- string prepended to each matching href

    Returns a set of box_url + href for every anchor whose href contains 'id'.
    """
    single_movie_urls = set()

    for full_site in total_urls:
        # build_soup_page only downloads when use_url is set; passing the URL
        # positionally would parse the URL text itself as markup.
        soup = build_soup_page(url=full_site, use_url=True)
        for a in soup.find_all('a', href=True):
            if re.search('id', a['href']) is not None:
                # a set de-duplicates on the full (prefixed) URL
                single_movie_urls.add(box_url + a['href'])
    return single_movie_urls

### Pickle (save a file) of things as needed

In [2]:
def store_pickles(filename, to_store):
    """
    Pickle `to_store` into the file at `filename`, overwriting it.

    The file is opened in binary mode: pickle output is a byte stream, and
    text mode can corrupt it through newline translation.
    """
    with open(filename, 'wb') as f:
        pickle.dump(to_store, f)
        
def eat_pickles(filename):
    """
    Load and return the object pickled in the file at `filename`.

    The file is opened in binary mode because pickle data is a byte stream.
    NOTE: unpickling can execute arbitrary code; only load files this
    notebook produced itself.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)

### Title

In [3]:
def get_movie_title(soup):
    """
    Return the movie title from a movie page's soup object.

    The title is the text of the <title> tag up to the first "(", stripped
    of surrounding whitespace. Returns None when the page has no <title>
    tag, matching the other get_movie_* helpers.
    """
    title_tag = soup.find('title')
    if title_tag is None:
        return None
    return str(title_tag.text.split("(")[0].strip())

### Movie Gross

In [4]:
def get_movie_gross(soup):
    """
    Return the domestic total gross from a movie page's soup object.

    Finds the text matching 'Domestic Total Gross', reads the text of its
    next sibling, removes ',' and '$' and converts the rest to an integer.
    Returns None when the label, the sibling or the sibling's text is missing.
    """
    label = soup.find(text = re.compile('Domestic Total Gross'))
    if not label:
        return None
    value_tag = label.findNextSibling()
    # the label can be the last node in its parent, leaving no sibling
    if value_tag is None or not value_tag.text:
        return None
    return int(value_tag.text.replace(',', '').replace('$', ''))

### Release Date

In [5]:
def get_movie_release_date(soup):
    """
    Return the release date from a movie page's soup object.

    Finds the text matching 'Release Date' and reads its next sibling's text:
      * 'TBD'                          -> the string 'TBD'
      * three words ('June 5, 2015')   -> datetime.date parsed with '%B %d, %Y'
      * one word ('2015')              -> datetime.date parsed with '%Y'
      * 'N/A', any other shape, or a missing label/sibling -> None
    """
    label = soup.find(text = re.compile('Release Date'))
    if not label:
        return None
    value_tag = label.findNextSibling()
    if value_tag is None:
        return None

    # strptime rejects leading/trailing whitespace, so normalise first
    release_date = value_tag.text.strip()
    if release_date == 'TBD':
        return str(release_date)

    word_count = len(release_date.split())
    if word_count == 3:
        return datetime.datetime.strptime(release_date, '%B %d, %Y').date()
    if word_count == 1 and release_date != 'N/A':
        return datetime.datetime.strptime(release_date, '%Y').date()
    return None

### Runtime

In [6]:
def get_movie_runtime(soup):
    """
    Return the runtime of a movie, in total minutes, from its soup object.

    Finds the text matching 'Runtime' and reads its next sibling's text,
    e.g. '2 hrs. 22 min.' -> 142. A value with only hours or only minutes
    is accepted. Returns None when the label or sibling is missing, the
    value is empty or 'N/A', or no hours/minutes figure can be found.
    """
    label = soup.find(text = re.compile('Runtime'))
    if not label:
        return None
    value_tag = label.findNextSibling()
    if value_tag is None:
        return None
    runtime = value_tag.text
    if not runtime or runtime == 'N/A':
        return None

    # Match each unit by name rather than by position so a missing unit
    # neither raises nor shifts the other figure.
    hours = re.search(r'(\d+)\s*hr', runtime)
    minutes = re.search(r'(\d+)\s*min', runtime)
    if hours is None and minutes is None:
        return None

    total = 0
    if hours is not None:
        total += int(hours.group(1)) * 60
    if minutes is not None:
        total += int(minutes.group(1))
    return total

### Budget

In [7]:
def get_movie_budget(soup):
    """
    Return the production budget of a movie, in dollars, from its soup object.

    Finds the text matching 'Production Budget' and reads its next sibling's
    text. The first whitespace-separated token is the figure:
      * with a decimal point ('$1.5 million')  -> read as millions
      * with thousands commas ('$1,500,000')   -> read as literal dollars
      * otherwise ('$150 million')             -> read as whole millions
    Returns None when the label or sibling is missing or the value is empty
    or 'N/A'.
    """
    label = soup.find(text = re.compile('Production Budget'))
    if not label:
        return None
    value_tag = label.findNextSibling()
    if value_tag is None:
        return None
    budget = value_tag.text
    if not budget or budget == 'N/A':
        return None

    figure = budget.split()[0].replace('$', '')
    if '.' in figure:
        whole, _, fraction = figure.replace(',', '').partition('.')
        # Pad the fractional part to six digits so that any number of
        # decimals scales to millions correctly (12.25 -> 12,250,000).
        return int(whole + fraction.ljust(6, '0')[:6])
    if ',' in figure:
        # keep every comma-separated group, not just the first two
        return int(figure.replace(',', ''))
    return int(figure + '000000')

### Rating

In [8]:
def get_movie_rating(soup):
    """
    Return the rating of a movie from its soup object.

    Finds the text matching 'Rating' and returns the text of its next
    sibling as a str. Returns None when the label, the sibling or the
    sibling's text is missing.
    """
    label = soup.find(text = re.compile('Rating'))
    if not label:
        return None
    value_tag = label.findNextSibling()
    # the label can be the last node in its parent, leaving no sibling
    if value_tag is None or not value_tag.text:
        return None
    return str(value_tag.text)

### Foreign Total Gross

In [9]:
def get_movie_foreign_gross(soup):
    """
    Return the foreign total gross of a movie as an integer.

    Looks up every element with width="35%" and reads the second one.
    Returns None when fewer than two such elements exist or when the
    second one reads 'n/a'; otherwise strips '$' and ',' and converts
    the remainder to an int.
    """
    cells = soup.find_all(width="35%")
    if not cells or len(cells) <= 1:
        return None

    amount = cells[1].text.strip()
    if amount == 'n/a':
        return None
    return int(amount.replace('$', '').replace(',', ''))

### Genre

In [10]:
def get_movie_genre(soup):
    """
    Return the genre of a movie from its soup object.

    Finds the text matching 'Genre: ' and returns the text of its next
    sibling as a str. Returns None when the label, the sibling or the
    sibling's text is missing.
    """
    label = soup.find(text = re.compile("Genre: "))
    if not label:
        return None
    value_tag = label.findNextSibling()
    # the label can be the last node in its parent, leaving no sibling
    if value_tag is None or not value_tag.text:
        return None
    return str(value_tag.text)

### Distributor

In [11]:
def get_movie_distributor(soup):
    """
    Return the distributor of a movie from its soup object.

    Finds the text matching 'Distributor' and returns the text of its next
    sibling as a str. Returns None when the label, the sibling or the
    sibling's text is missing.
    """
    label = soup.find(text = re.compile("Distributor"))
    if not label:
        return None
    value_tag = label.findNextSibling()
    # the label can be the last node in its parent, leaving no sibling
    if value_tag is None or not value_tag.text:
        return None
    return str(value_tag.text)

### Actors

In [12]:
def get_movie_actors(soup):
    """
    Return the actors listed on a movie page as a de-duplicated list.

    For every anchor whose href starts with '/people/chart/?view=Actor',
    the enclosing <font> element is located and every second entry of its
    contents (indices 0, 2, 4, ...) is taken as an actor. Each entry is
    stored as str(entry.text) when possible, else str(entry), else the raw
    entry. Anchors without a <font> ancestor are skipped. The order of the
    returned list is not defined because it is produced from a set.
    """
    actor_list = []

    for a in soup.find_all('a', href=True):
        if not a['href'].startswith('/people/chart/?view=Actor'):
            continue
        section = a.find_parent('font')
        if section is None:
            continue
        for i in range(0, len(section), 2):
            entry = section.contents[i]
            # "except Exception" keeps the original best-effort fallbacks
            # without also swallowing KeyboardInterrupt/SystemExit.
            try:
                actor_list.append(str(entry.text))
            except Exception:
                try:
                    actor_list.append(str(entry))
                except Exception:
                    actor_list.append(entry)
    return list(set(actor_list))

### Build soup document and build index of movie pages from A-Z

In [427]:
# creates a soup object containing the href for all pages A-Z & NUM
#movie_pages_az_level = build_soup_page("http://www.boxofficemojo.com/movies")

# builds a list of each url ending for NUM and A-Z movie pages
#movie_pages_az_urls = build_top_index(movie_pages_az_level, 'alphabetical', "http://www.boxofficemojo.com/movies/")

### Build sub level pages for each letter of title, also pickled (top_pages.pkl)

In [None]:
# Builds total urls for all pages of movies
# Builds total urls for all pages of movies: start from the A-Z index pages
# and add every paginated sub-page they link to.
# NOTE(review): movie_pages_az_urls is only assigned in the commented-out
# lines of the cell above; it must already exist in the session.
total_urls = movie_pages_az_urls[:]

for movie_page in movie_pages_az_urls:
    # build_soup_page only downloads when use_url is set; passing the URL
    # positionally would parse the URL text itself as markup.
    top_level_soup = build_soup_page(url=movie_page, use_url=True)
    sub_level = build_sub_index(top_level_soup, '/movies/', 'page', "http://www.boxofficemojo.com")
    for sub_level_url in sub_level:
        # links containing 'id' point at single movies, not at index pages
        if 'id' not in sub_level_url:
            total_urls.append(sub_level_url)

### Pickle the main movie urls and filter into ones that have a foreign page and aren't foreign movies

In [22]:
# Load the pickled mapping from 'page_data.pkl'; later cells iterate it as
# (url, page html) pairs and index it by movie-page URL.
jar_of_pickles = eat_pickles('page_data.pkl')

In [None]:
# Keep the URL of every cached movie page that contains an element whose
# href matches 'page=intl'.
# (A further "Genre is not 'Foreign'" filter was drafted here but is disabled.)
intl_link = re.compile('page=intl')
foreign_pages = []

for url, html in jar_of_pickles.iteritems():
    if build_soup_page(html).find(attrs={'href': intl_link}) is None:
        continue
    foreign_pages.append(url)

### List of urls in foreign_pages are movies that aren't a foreign genre and have a foreign box office record

In [None]:
# Turn each movie URL into its foreign-box-office URL by inserting
# 'page=intl&' right after the '?' that starts the query string.
foreign_urls = []
for page_url in foreign_pages:
    url_parts = str(page_url).split('?')
    foreign_urls.append(url_parts[0] + '?page=intl&' + url_parts[1])

In [None]:
# Reload the list from 'foreign_urls.pkl'; this rebinds foreign_urls and
# replaces whatever the previous cell computed.
foreign_urls = eat_pickles('foreign_urls.pkl')

In [522]:
# Load the pickled mapping from 'foreign_url_pages.pkl'; later cells iterate
# it as (foreign url, page html) pairs.
foreign_url_pages = eat_pickles('foreign_url_pages.pkl')

In [523]:
len(foreign_url_pages)

7529

#### Deleting key that has odd characters in title

In [23]:
# Drop the one page whose title contains characters the title helper cannot
# handle. pop() with a default keeps this cell safe to re-run: a plain `del`
# raises KeyError once the key is already gone.
jar_of_pickles.pop('http://www.boxofficemojo.com/movies/?id=lecombatdanslile.htm', None)

### Foreign Gross for U.K.

#### Loop through pages that have foreign box office revenue and build a list of dictionaries with the url as the key and a dictionary as the value.  The values contain a dictionary with keys as countries and values as gross.

In [None]:
# One {country: gross} dictionary per foreign page, in the iteration order
# of foreign_url_pages.
foreign_movie_data = []

for url, html in foreign_url_pages.iteritems():
    soup = build_soup_page(page=html)
    gross_by_country = {}
    for a in soup.find_all('a', href=True):
        if not a['href'].startswith('/movies/?page=intl&country'):
            continue
        row = a.find_parent('tr')
        if row is None:
            continue
        try:
            gross = int(''.join(row.contents[10].text.replace('$', '').split(',')))
        except (IndexError, AttributeError, ValueError):
            # No usable gross figure in this row. Skip the country as well,
            # so every country stays paired with its own gross instead of
            # shifting onto the next row's value.
            continue
        gross_by_country[str(row.contents[0].text)] = gross
    foreign_movie_data.append(gross_by_country)

#### Build a dictionary with the foreign urls as keys and the values as a dictionary of movie country and gross

# Pair each foreign URL with its {country: gross} dictionary by position.
# NOTE(review): this relies on foreign_url_pages.keys() yielding the same
# order as the iteritems() loop that built foreign_movie_data.
foreign_movie_dict = dict(zip(foreign_url_pages.keys(), foreign_movie_data))

In [524]:
# Reload the mapping from 'foreign_movie_dict.pkl'; this rebinds
# foreign_movie_dict and replaces the value built above.
foreign_movie_dict = eat_pickles('foreign_movie_dict.pkl')

In [525]:
len(foreign_movie_dict)

7529

#### Build a dictionary with url as keys and U.K. gross as values

In [130]:
# For movies whose only entry is 'FOREIGN TOTAL', relabel that entry as
# 'United Kingdom' when the page text mentions the United Kingdom.
# The key is checked first because a single entry under any other name
# would make pop('FOREIGN TOTAL') raise KeyError.
for k, v in foreign_movie_dict.iteritems():
    if len(v) == 1 and 'FOREIGN TOTAL' in v:
        soup = build_soup_page(foreign_url_pages[k])
        if soup.find(text = re.compile('United Kingdom')):
            v['United Kingdom'] = v.pop('FOREIGN TOTAL')

In [132]:
# Map each foreign URL to its 'United Kingdom' gross, skipping movies that
# have no such entry.
uk_movie_gross = dict(
    (movie_url, gross_by_country['United Kingdom'])
    for movie_url, gross_by_country in foreign_movie_dict.iteritems()
    if 'United Kingdom' in gross_by_country
)

### Wikipedia British Actors

In [15]:
# Download the Wikipedia list of British actors and actresses and parse it
# into a soup object (use_url=True makes build_soup_page fetch the URL).
brit_actors = build_soup_page(url='http://en.wikipedia.org/wiki/List_of_British_actors_and_actresses', use_url=True)

In [16]:
# Collect the link text of every '/wiki/' anchor on the page.
# Text that cannot be converted with str() (non-ASCII names on Python 2) is
# kept separately in unicode_list.
brit_actors_list = []
unicode_list = []

for a in brit_actors.find_all('a', href=True):
    if a['href'].startswith('/wiki/'):
        try:
            brit_actors_list.append(str(a.text))
        except UnicodeEncodeError:
            # only the encoding failure is expected here; anything else
            # should surface instead of being silently filed away
            unicode_list.append(a.text)

In [17]:
# Drop the first 11 and the last 283 collected link texts.
# NOTE(review): presumably these are navigation/footer links rather than
# actor names; the offsets are tied to the page as it was when scraped and
# this cell must only be run once per scrape -- confirm before re-running.
brit_actors_list = brit_actors_list[11:-283]

In [18]:
# ASCII spellings of names to add to the list by hand.
# Each name needs its own trailing comma: adjacent string literals are
# concatenated, which previously merged 'Sinead Moynihan' and 'Sian Phillips'
# into a single bogus entry.
# NOTE(review): 'Llyr Lfans' was corrected to 'Llyr Ifans' on the assumption
# that it was a typo -- confirm against the source list.
to_append = [
    'Chloe Annett',
    'Noel Coward',
    'Ciaran Hinds',
    'Llyr Ifans',
    'Zoe Lister',
    'Sinead Moynihan',
    'Sian Phillips',
    'Zoe Wanamaker',
]

brit_actors_list.extend(to_append)

### Build a data structure of url, title, domestic gross, release date, actor, and uk total gross

In [20]:
# Reload the url -> U.K. gross mapping from 'uk_movie_gross_final.pkl';
# this rebinds uk_movie_gross.
uk_movie_gross = eat_pickles('uk_movie_gross_final.pkl')

In [24]:
# Build parallel lists (one entry per movie in uk_movie_gross, in its
# iteration order) holding the fields that make up each output row.
title = []
domestic_total_gross = []
release_date = []
#budget = []
#rating = []
#foreign_gross = []
uk_total_gross = []
BOM_url = []
actor_list = []

for url in uk_movie_gross.iterkeys():
    # Remove characters 37-46 of the foreign URL to get the main movie URL.
    # Assumes every key starts with 'http://www.boxofficemojo.com/movies/?'
    # (37 characters) followed by the 10-character 'page=intl&' -- TODO confirm.
    main_page = url[:37] + url[47:]
    # raises KeyError if the main page is not (or no longer) in jar_of_pickles
    main_html = jar_of_pickles[main_page]
    soup = build_soup_page(page=main_html)
    #Title
    title.append(get_movie_title(soup))
    #Domestic Total gross
    domestic_total_gross.append(get_movie_gross(soup))
    #Release Date
    release_date.append(get_movie_release_date(soup))
    #U.K. Total Gross
    uk_total_gross.append(uk_movie_gross[url])
    #Box office mojo URL
    BOM_url.append(main_page)
    #Actor lists
    actor_list.append(get_movie_actors(soup))

In [104]:
# Second pass over the same movies to collect genre and budget.
# `title` is rebuilt here as well, replacing the list from the previous cell.
# NOTE(review): the lists line up with the previous cell's lists only if
# uk_movie_gross iterates in the same order both times.
genre = []
budget = []
title = []

for url in uk_movie_gross.iterkeys():
    # same foreign-URL -> main-URL slicing as in the previous cell
    main_page = url[:37] + url[47:]
    main_html = jar_of_pickles[main_page]
    soup = build_soup_page(page=main_html)
    #Genre
    genre.append(get_movie_genre(soup))
    #Budget
    budget.append(get_movie_budget(soup))
    #Title
    title.append(get_movie_title(soup))

In [25]:
#final_3 = zip(BOM_url, title, domestic_total_gross, release_date, uk_total_gross, actor_list, genre)

In [155]:
# Build {'Title': title, 'Genres': genres}.
# NOTE(review): `genres` is created by a cell that appears further down in
# this file, so this cell depends on that one having been run first.
title_genre = dict(zip(['Title', 'Genres'], [title, genres]))

In [157]:
# Save the title/genres dictionary to 'title_genre.pkl'.
store_pickles('title_genre.pkl',title_genre)

In [158]:
#genres

In [143]:
# Labels that collapse to their own leading keyword.
# 'Musical' lives here so it is tested before the shorter 'Music' prefix,
# which would otherwise capture every musical.
_SIMPLE_GENRES = (
    'Action', 'Adventure', 'Animation', 'Comedy', 'Concert', 'Documentary',
    'Drama', 'Horror', 'IMAX', 'Musical', 'Romance', 'Thriller', 'Unknown',
)

# Labels whose second part is the genre we keep:
# (prefix, separator to split on, value used when there is no second part)
_COMPOUND_GENRES = (
    ('Crime', ' ', 'Crime'),
    ('Family', ' ', 'Family'),
    ('Fantasy', ' ', 'Fantasy'),
    ('Foreign', '/', 'Foreign'),
    ('Historical', ' ', 'Epic'),
    ('Music', ' ', 'Drama'),
    ('Period', ' ', 'Period'),
    ('Romantic', ' ', 'Romance'),
    ('Sci-Fi', ' ', 'Sci-Fi'),
    ('Sports', ' ', 'Sports'),
    ('War', ' ', 'War'),
    ('Western', ' ', 'Western'),
)


def _simplify_genre(label):
    """Return the single genre kept for `label`, or None if no prefix matches."""
    for keyword in _SIMPLE_GENRES:
        if label.startswith(keyword):
            return keyword
    for prefix, separator, fallback in _COMPOUND_GENRES:
        if label.startswith(prefix):
            parts = label.split(separator)
            return parts[1].strip() if len(parts) > 1 else fallback
    return None


# Reduce every genre label in genre_copy to a single genre.
# The loop variable is deliberately not called `genre`, so the `genre` list
# built earlier is not replaced by a string.
# NOTE(review): labels that match no known prefix are skipped, which leaves
# `genres` shorter than genre_copy -- confirm that this is intended.
genres = []
for genre_label in genre_copy:
    simplified = _simplify_genre(genre_label)
    if simplified is not None:
        genres.append(simplified)

In [90]:
# Distinct values found in `genre`.
genre_set = set(genre)

In [159]:
#genre_set

In [62]:
#headers = ["Genre_Title", "Genre", "Budget"]
# Pair each genre with the budget at the same position.
genre_budget = zip(genre, budget)

In [54]:
# Save to 'genre_budget.pkl'.
# NOTE(review): genre_budget_dict_2 is not defined in any cell shown here;
# it must come from session state -- confirm it is the intended object.
store_pickles('genre_budget.pkl', genre_budget_dict_2)

In [63]:
# Create a list of the movies from a tuple of the movies in final_3
# Convert every tuple in final_3 into a list so fields can be appended later.
# NOTE(review): the only assignment to final_3 in this file is commented
# out, so it must already exist in the session.
final_5 = [list(el) for el in final_3]
#backup = final_4[:]

In [74]:
# Append the genre and then the budget found at the same position in
# genre_budget to every movie row.
for i, movie_row in enumerate(final_5):
    genre_and_budget = genre_budget[i]
    movie_row.append(genre_and_budget[0])
    movie_row.append(genre_and_budget[1])

In [80]:
#final_5
#genre_budget

### Loop through list of movies and gather titles to get country info

In [30]:
# Second field (index 1) of every movie row, taken in sorted row order.
country_titles = [el[1] for el in sorted(final_5)]

### Loop through actor list and find movies that don't have a list of actors

In [31]:
# Positions in actor_list whose list of actors is empty.
indices = [i for i, actors in enumerate(actor_list) if len(actors) == 0]

#### Create a list of movie titles to scrape for additional actors

In [32]:
# Titles found at the positions recorded in `indices`.
titles_to_scrape = [title[index] for index in indices]

In [33]:
len(titles_to_scrape)

466

### Connects to the OMDB api to try to pull any actors for the movies that didn't have them.  Also finds the country of the movie

In [38]:
#actor_list = []
#actor_list = []
country_list = []
omdb_error_url = [] # initialize
base_url = 'http://www.omdbapi.com/?t='

# Ask OMDb for each title and record the list of countries it reports.
# (Iterate titles_to_scrape instead of country_titles, and read the 'Actors'
# field instead of 'Country', to fetch missing actors the same way.)
for movie_title in country_titles:
    # quote_plus turns spaces into '+' as before, and also escapes characters
    # such as '&' or '#' that would otherwise cut the title short in the
    # query string. A dedicated name is used so the `title` list built
    # earlier is not overwritten with a string.
    query_title = urllib.quote_plus(movie_title)
    url_4 = base_url + query_title + '&y=&plot=short&r=json'

    try:
        page_4 = urllib2.urlopen(url_4).read()
        json_data = json.loads(page_4) # converts API json response to dict
        countries = json_data['Country'].split(',')
        country_list.append([str(name).strip() for name in countries])
    except Exception:
        # Best effort: whatever went wrong for this title, record an empty
        # list so country_list stays aligned with country_titles.
        country_list.append([])

### Pair each movie title with its countries and append the countries to final_5

In [39]:
# Pair every title with the country list fetched for it (same position).
add_country_to_final_4 = zip(country_titles, country_list)

In [68]:
# Append the country list to every movie row that contains the title.
# sorted() returns a new outer list but the same row objects, so the
# append still modifies the rows held by final_5.
# NOTE(review): `el[0] in movie` matches the title against every field of
# the row, and a title shared by several rows (or listed more than once)
# gets its country list appended once per match -- confirm that titles
# are unique.
for el in add_country_to_final_4:
    for movie in sorted(final_5):
        if el[0] in movie:
            movie.append(el[1])

### Pair each scraped title with its actors, then loop through final_5 and fill in the actors for movies that have none

In [35]:
# Pair every title that needed scraping with an actor list by position.
# NOTE(review): this assumes actor_list holds the actors fetched for
# titles_to_scrape (see the commented-out actor code in the OMDb cell), not
# the per-movie list built from the Box Office Mojo pages -- confirm.
add_to_final_4 = zip(titles_to_scrape,actor_list)

In [67]:
# For every (title, actors) pair, extend the actor list (field 5) of each
# row that contains the title and currently has no actors.
# NOTE(review): `el[0] in movie` compares the title with every field of the
# row, not only the title field.
for el in add_to_final_4:
    for movie in final_5:
        if el[0] in movie and len(movie[5]) == 0:
            movie[5].extend(el[1])

### Loops through final_5 and appends a 1 or 0 to each movie depending on whether it has a British actor

In [78]:
# Append 1 to a movie row when one of its actors (field 5) is in the British
# actors list, otherwise 0. Every row gets exactly one flag: the decision is
# computed directly instead of inspecting the row's last field, which would
# skip the flag whenever that field already happened to equal 0 or 1.
# NOTE(review): as before, movies with a single listed actor are always
# flagged 0 (the check requires more than one actor) -- confirm this is intended.
brit_actor_names = set(brit_actors_list)

for movie in final_5:
    cast = movie[5]
    has_brit_actor = len(cast) > 1 and any(actor in brit_actor_names for actor in cast)
    movie.append(1 if has_brit_actor else 0)

### Creates a csv file of my data

In [81]:
# Write one CSV row per movie to 'moviedata2015.csv', reporting progress.
# On Python 2 the csv module expects a file opened in binary mode; text mode
# can produce an extra carriage return per row on some platforms.
with open('moviedata2015.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile)

    for i, movie in enumerate(final_5):
        csvwriter.writerow(movie)
        print('row %i written' % i)
    print('success')

row 0 written
row 1 written
row 2 written
row 3 written
row 4 written
row 5 written
row 6 written
row 7 written
row 8 written
row 9 written
row 10 written
row 11 written
row 12 written
row 13 written
row 14 written
row 15 written
row 16 written
row 17 written
row 18 written
row 19 written
row 20 written
row 21 written
row 22 written
row 23 written
row 24 written
row 25 written
row 26 written
row 27 written
row 28 written
row 29 written
row 30 written
row 31 written
row 32 written
row 33 written
row 34 written
row 35 written
row 36 written
row 37 written
row 38 written
row 39 written
row 40 written
row 41 written
row 42 written
row 43 written
row 44 written
row 45 written
row 46 written
row 47 written
row 48 written
row 49 written
row 50 written
row 51 written
row 52 written
row 53 written
row 54 written
row 55 written
row 56 written
row 57 written
row 58 written
row 59 written
row 60 written
row 61 written
row 62 written
row 63 written
row 64 written
row 65 written
row 66 written
row 6