In [1]:
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
import re
import requests
from bs4 import BeautifulSoup
import pickle
import sys
%matplotlib inline  

In [2]:
def get_html(url, timeout=30):
    """
    Download one web page and parse it into a BeautifulSoup tree.

    input: url - address of the page to fetch
           timeout - seconds to wait for the server; without a timeout requests
                     waits indefinitely, so one stalled connection would hang
                     the whole scrape
    output: BeautifulSoup object built from the response text with the "lxml" parser
    Note: the HTTP status code is not checked; an error page is parsed like any other.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

In [3]:
def get_ranking_page(url):
    """
    input: url of one single webpage that shows ranking of a certain genre
    output: pandas df of that webpage with 9 unnamed columns. The flat list of
            cell texts (each preceded by the id parsed from the cell's link,
            when the cell has such a link) is cut into rows of 9 values.
    Raises: IndexError if the page has fewer than 4 tables, ValueError if the
            collected values do not fill whole rows of 9.
    Limitation: Works on boxofficemojo.com
    """
    cells_per_row = 9
    page = get_html(url)
    movie_list = []
    id_pattern = r".*?id=(?P<id>.*).htm"
    # the chart is read from the 4th table; its first 6 cells are skipped
    # (presumably the header row - confirm against the live page)
    for element in page.find_all("table")[3].find_all("td")[6:]:
        text_temp = element.text
        if text_temp == "TOTAL (All Movies):":
            # summary row: everything from here on is not a movie
            break
        # a cell contributes an id only when it holds a link whose href matches
        # id_pattern; any other cell contributes just its text
        anchor = element.find("a")
        href = anchor.get("href") if anchor is not None else None
        id_match = re.search(id_pattern, href) if href else None
        if id_match is not None:
            movie_list.append(id_match.group("id"))
        movie_list.append(text_temp)
    row_ct = len(movie_list) // cells_per_row
    return pd.DataFrame(np.array(movie_list).reshape([row_ct, cells_per_row]))

In [4]:
def get_ranking_pages(genre, total_pages):
    """
    Scrape the first `total_pages` ranking pages of one genre.

    input: genre - genre id used in the chart url (e.g. "crime")
           total_pages - how many chart pages to fetch, starting at page 1
    output: one pandas df of all ranking webpages of a certain genre, with a
            fresh 0..n-1 index, the 9 named chart columns and a "genre" column.
            With total_pages == 0 an empty df with those columns is returned.
    Limitation: Works on boxofficemojo.com
    """
    col_names = ["Rank", "movieid", "Title", "Studio", "life_gross", "life_theater",
                 "open_gross", "open_theater", "release_date"]
    # collect the pages first and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and copied the whole frame on every call
    pages = []
    for num in range(total_pages):
        genre_url = r"http://www.boxofficemojo.com/genres/chart/?view=main&sort=gross&order=DESC&pagenum=" + str(num + 1) + r"&id=" + str(genre) + r".htm"
        pages.append(get_ranking_page(genre_url))
    if pages:
        movie_main = pd.concat(pages, ignore_index=True)
        movie_main.columns = col_names
    else:
        movie_main = pd.DataFrame(columns=col_names)
    movie_main["genre"] = genre
    return movie_main

In [5]:
def get_movie_metadata(corpus, movieid=None):
    """
    Pull runtime, MPAA rating and production budget out of a parsed movie page.

    input: corpus - parsed movie page (object returned by get_html)
           movieid - value for the "movieid" column; when omitted, the
                     module-level variable `movieid` is used, which is what
                     calls passing only the corpus have always relied on
    output: one-row pandas df with columns movieid, runtime, mpaa, budget;
            all values pass through a numpy array of strings
    Raises: AttributeError when the cell text does not match
            "Runtime: ...MPAA Rating: ...Production Budget: $...",
            KeyError when movieid is omitted and no global `movieid` exists
    Limitation: Works on boxofficemojo.com
    """
    if movieid is None:
        # backward compatibility with callers that only pass the corpus
        movieid = globals()["movieid"]
    # the metadata is read from the 6th cell of the first table
    movie_meta_text = corpus.find_all("table")[0].find_all("td")[5].getText()
    movie_pattern = r".*Runtime: (?P<runtime>.*)MPAA Rating: (?P<mpaa>.*)Production Budget: \$(?P<budget>.*)"
    meta_match = re.search(movie_pattern, movie_meta_text)
    data = np.array((movieid,) + meta_match.groups()).reshape(1, 4)
    col_names = ["movieid", "runtime", "mpaa", "budget"]
    return pd.DataFrame(data, columns=col_names)

In [6]:
def get_gross(corpus, movieid=None):
    """
    Read three gross figures from the "mp_box_content" boxes of a movie page.

    input: corpus - parsed movie page (object returned by get_html)
           movieid - value for the "movieid" column; when omitted, the
                     module-level variable `movieid` is used, which is what
                     calls passing only the corpus have always relied on
    output: one-row pandas df with columns life_frn_gross, avg_open_gross,
            pct_open_gross, movieid. pct_open_gross is the page percentage
            divided by 100; it is stored as text like the other two values
            because all three go through one numpy array.
    Raises: IndexError / AttributeError when the page does not have the
            expected boxes or text, KeyError when movieid is omitted and no
            global `movieid` exists
    Limitation: Works on boxofficemojo.com
    """
    if movieid is None:
        # backward compatibility with callers that only pass the corpus
        movieid = globals()["movieid"]
    # look the boxes up once instead of re-scanning the page for every figure
    boxes = corpus.find_all("div", {"class": "mp_box_content"})

    # first box, 2nd row, 2nd cell: everything after the last "$"
    frn_text = boxes[0].find_all("tr")[1].find_all("td")[1].getText()
    life_frn_gross = re.search(r".*\$(?P<life_frn_gross>.*)$", frn_text).group("life_frn_gross")

    # second box, 2nd row: the amount between the last "$" and "average)"
    avg_text = boxes[1].find_all("tr")[1].getText()
    avg_open_gross = re.search(r".*\$(?P<avg_open_gross>.*)average\)$", avg_text).group("avg_open_gross")

    # second box, 3rd row, 2nd cell: leading non-digits dropped, trailing "%" removed
    pct_text = boxes[1].find_all("tr")[2].find_all("td")[1].getText()
    pct_match = re.search(r"([^0-9])*(?P<pct_open_gross>.*)%$", pct_text)
    pct_open_gross = float(pct_match.group("pct_open_gross")) / 100

    # np.array coerces the float to text next to the two strings; kept as is
    # so the stored values do not change
    data = np.array([life_frn_gross, avg_open_gross, pct_open_gross]).reshape(1, 3)
    col_names = ["life_frn_gross", "avg_open_gross", "pct_open_gross"]
    movie_gross = pd.DataFrame(data, columns=col_names)
    movie_gross["movieid"] = movieid
    return movie_gross

In [7]:
def get_players(corpus, movieid=None):
    """
    List the people linked in the third "mp_box_content" box of a movie page.

    Cells whose text contains ":" are role labels; every link found in the
    following cells is recorded under the most recent label.

    input: corpus - parsed movie page (object returned by get_html)
           movieid - value for the "movieid" column; when omitted, the
                     module-level variable `movieid` is used, which is what
                     calls passing only the corpus have always relied on
    output: pandas df with columns role, playerid, player, movieid and a
            0..n-1 index, one row per link (empty when there are no links)
    Raises: ValueError when a link appears before any role label,
            AttributeError when a link does not match the expected
            'id=<playerid>.htm"><player></a>' markup,
            KeyError when movieid is omitted and no global `movieid` exists
    Limitation: Works on boxofficemojo.com
    """
    if movieid is None:
        # backward compatibility with callers that only pass the corpus
        movieid = globals()["movieid"]
    cells = corpus.find_all("div", {"class": "mp_box_content"})[2].find_all("td")
    link_pattern = r'id=(?P<playerid>.*).htm">(?P<player>.*)<\/a>$'
    rows = []
    role = None
    for cell in cells:
        text = cell.getText()
        if re.search(":", text):
            role = text
            continue
        for atag in cell.find_all("a"):
            if role is None:
                raise ValueError("player link found before any role label")
            # the pattern is matched against the anchor's markup, not its text
            link = re.search(link_pattern, str(atag))
            rows.append((role, link.group("playerid"), link.group("player")))
    # build the frame once; DataFrame.append was removed in pandas 2.0
    movie_player = pd.DataFrame(rows, columns=["role", "playerid", "player"])
    movie_player["movieid"] = movieid
    return movie_player

In [8]:
def get_movie(movieid):
    """
    Download one movie page and run the three page parsers on it.

    input: movieid - id used in the movie url
    output: tuple (meta_df, gross_df, players_df); an entry is None when its
            parser raised, so one unparseable section does not lose the others
    Note: only Exception subclasses are swallowed, so Ctrl-C (KeyboardInterrupt)
          still stops a long scrape. The parsers are called with the corpus
          only and read the module-level variable `movieid`.
    Limitation: Works on boxofficemojo.com
    """
    movie_url = r"http://www.boxofficemojo.com/movies/?id=" + movieid + ".htm"
    corpus = get_html(movie_url)
    parsed = []
    #TODO refine html search
    for parser in (get_movie_metadata, get_gross, get_players):
        try:
            parsed.append(parser(corpus))
        except Exception:
            parsed.append(None)
    return tuple(parsed)

In [9]:
def get_player_metadata(role, playerid):
    """
    Scrape one person's chart page: their listed movies and their gross summary.

    input: role - value of the "view" url parameter (e.g. "Director")
           playerid - person id used in the url
    output: tuple (player_meta_df, player_gross_df)
            player_meta_df - one row per listed movie, 9 chart columns plus
                             role and playerid
            player_gross_df - one row with plr_life_mv_num, plr_life_gross,
                              plr_avg_gross, plr_open_mv_num, plr_avg_open_gross
                              plus role and playerid; the five figures are all
                              None when the summary text does not match
    Raises: IndexError when the page lacks the expected elements, ValueError
            when the collected cells do not fill whole rows of 9
    Limitation: Works on boxofficemojo.com
    """
    cells_per_row = 9
    player_url = r"http://www.boxofficemojo.com/people/chart/?view=" + role + r"&id=" + playerid + r".htm"
    player_corpus = get_html(player_url)
    common_tree = player_corpus.find_all("div", {'id': 'body'})[0]

    #past movies
    player_meta_li = []
    id_pattern = r".*?id=(?P<id>.*).htm"
    hit_stop_marker = False
    # rows nested inside the 2nd <tr> of the body; the first nested row is skipped
    for row in common_tree.find_all("tr")[1].find_all("tr")[1:]:
        for element in row.find_all("td"):
            text = element.getText()
            if text == "Title (click to view)":
                # nothing from this cell onwards is collected
                hit_stop_marker = True
                break
            # a cell contributes an id only when it holds a link whose href
            # matches id_pattern; any other cell contributes just its text
            anchor = element.find("a")
            href = anchor.get("href") if anchor is not None else None
            id_match = re.search(id_pattern, href) if href else None
            if id_match is not None:
                player_meta_li.append(id_match.group("id"))
            player_meta_li.append(text)
        if hit_stop_marker:
            break
    row_ct = len(player_meta_li) // cells_per_row
    player_meta_df = pd.DataFrame(np.array(player_meta_li).reshape([row_ct, cells_per_row]))
    player_meta_df.columns = ["release_date", "movieid", "Title", "Studio", "life_gross", "life_theater",
                              "open_gross", "open_theater", "Rank"]
    player_meta_df["role"] = role
    player_meta_df["playerid"] = playerid

    #gross
    text = common_tree.find_all("table")[1].find("table").find_next_siblings()[3].getText()
    pattern = r".*\((?P<plr_life_mv_num>[0-9]*)\).*\$(?P<plr_life_gross>[0-9,]*)Average: \$(?P<plr_avg_gross>[0-9,]*).*\((?P<plr_open_mv_num>[0-9]*)\).*\$(?P<plr_avg_open_gross>[0-9,]*)"
    gross_cols = ["plr_life_mv_num", "plr_life_gross", "plr_avg_gross", "plr_open_mv_num", "plr_avg_open_gross"]
    #TODO refine html search. Most records are only missing plr_open_mv_num and plr_avg_open_gross
    gross_match = re.search(pattern, text)
    gross_values = gross_match.groups() if gross_match is not None else (None,) * len(gross_cols)
    player_gross_df = pd.DataFrame([gross_values], columns=gross_cols)
    player_gross_df["role"] = role
    player_gross_df["playerid"] = playerid
    return (player_meta_df, player_gross_df)

In [10]:
# Genres to scrape, as (genre id, number of chart pages) pairs.
# Selection: genres were sorted by gross, then by number of movies
# (comedysequel excluded), and these five had similar values.
GENRE_LIST = [
    (genre_id, 2)
    for genre_id in ("summerdrama",
                     "supernaturalhorror",
                     "crime",
                     "caper",
                     "r-ratedcomedy")
]

# import movie list

In [17]:
# One ranking frame per (genre id, page count) entry, stacked once with a fresh
# 0..n-1 index. DataFrame.append was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.
movie_main = pd.concat(
    [get_ranking_pages(genre_id, page_ct) for genre_id, page_ct in GENRE_LIST],
    ignore_index=True,
)

In [20]:
#movie_main.tail()

# import movie data

In [None]:
#toy corpus
#movieid = movie_main["movieid"][5]
#movie_meta = pd.DataFrame()
#movie_url = r"http://www.boxofficemojo.com/movies/?id=" + movieid + ".htm"
#corpus = get_html(movie_url)

In [21]:
def _stack_frames(frames):
    """Concatenate the non-None frames under a fresh 0..n-1 index (empty df if none)."""
    kept = [frame for frame in frames if frame is not None]
    return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


# Scrape every movie first, then build each table once; DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every call.
# The loop variable must stay a module-level name called `movieid`:
# get_movie_metadata, get_gross and get_players read it as a global.
movie_tups = []
for movieid in movie_main["movieid"]:
    movie_tups.append(get_movie(movieid))

# get_movie returns (meta, gross, players); a part that failed to parse is None
meta_df = _stack_frames(tup[0] for tup in movie_tups)
gross_df = _stack_frames(tup[1] for tup in movie_tups)
players_df = _stack_frames(tup[2] for tup in movie_tups)

In [27]:
#clean player role: keep each label up to its last "r" (e.g. "Writers:" -> "Writer")
#players_df[players_df['role'] == 'Writers:']
# the replacement must be a raw string: "\g" is not a valid escape sequence in a
# normal string literal and triggers a warning on current Python versions
players_df['role'] = players_df['role'].map(lambda x: re.sub(r"(.*r).*", r"\g<1>", x))

In [29]:
#meta_df.tail()
#gross_df.tail()
#players_df.tail()
#players_df['role'].value_counts()

# import player data

In [None]:
#toy player param
#role = 'Producer'
#playerid = 'barrymendel'

In [30]:
# unique (role, playerid) pairs, taken from the first two columns of players_df
players_set = set(players_df.iloc[:, :2].itertuples(index=False, name=None))

In [32]:
# Fetch every (role, playerid) pair first, then build each table once;
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call.
player_tups = [get_player_metadata(role, playerid) for role, playerid in players_set]

# get_player_metadata returns (movie list frame, gross summary frame)
player_meta_df = (pd.concat([tup[0] for tup in player_tups], ignore_index=True)
                  if player_tups else pd.DataFrame())
player_gross_df = (pd.concat([tup[1] for tup in player_tups], ignore_index=True)
                   if player_tups else pd.DataFrame())

In [36]:
#players_set
#player_meta_df.tail()
#player_gross_df.tail()

# pickle

In [38]:
# names (alphabetical, no leading underscore) of every top-level variable
# whose value is exactly a pandas DataFrame
df_li = [
    name
    for name in sorted(globals())
    if not name.startswith('_') and type(globals()[name]) is pd.DataFrame
]

In [46]:
#df_li

In [47]:
# Write every frame named in df_li to one file as consecutive, unnamed pickles,
# in df_li order; a reader has to call pickle.load once per frame in that order.
with open('my_data.pkl', 'wb') as picklefile:
    for frame_name in df_li:
        pickle.dump(globals()[frame_name], picklefile)