## Import all needed modules

In [None]:
import sys
import subprocess

# The scrape below runs for a long time; on macOS start `caffeinate` in the
# background so the machine does not go to sleep part-way through.
if 'darwin' in sys.platform:
    print("Running 'caffeinate' on MacOSX to prevent the system from sleeping")
    subprocess.Popen('caffeinate')

In [None]:
import requests 
import string
from bs4 import BeautifulSoup  
import logging  
logging.basicConfig(level=logging.DEBUG)
import dateutil.parser
import time
import re
from collections import OrderedDict
import pandas as pd
from datetime import datetime
import numpy as np

## Get movie urls from boxofficemojo.com

In [None]:
def get_all_movies():
    """Return all the movie urls from boxofficemojo.com in a list.

    Walks the alphabetical index ("NUM" for titles starting with a digit or
    special character, then A-Z), requesting pages 1-19 of every letter.
    From each page it takes the ``href`` of the first link in every table
    row after the header row.

    Returns:
        list of link strings in first-seen order, without duplicates.

    A page that cannot be downloaded or parsed is logged (with traceback)
    and skipped; it never aborts the whole crawl.
    """
    index = ["NUM"] + list(string.ascii_uppercase)

    movies_list = []
    # Mirror of movies_list used only for O(1) duplicate checks.
    seen = set()

    for letter in index:
        for num in range(1, 20):
            url = ("http://www.boxofficemojo.com/movies/alphabetical.htm?"
                   "letter=" + letter + "&page=" + str(num))
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.text, "lxml")
                rows = soup.find(id="body").find("table").find("table").find_all(
                    "table")[1].find_all("tr")

                # rows[0] is the column-header row, not a movie
                for row in rows[1:]:
                    link = row.td.font.a['href']
                    if link not in seen:
                        seen.add(link)
                        movies_list.append(link)
            except Exception:
                # Pages past the last real page of a letter do not have the
                # expected table layout; log and move on.
                logging.exception("Could not collect movie links from %s", url)

    return movies_list

#### To get the URLs of the movies, call the function defined above

In [None]:
all_movies = get_all_movies()

## Define certain useful functions

In [None]:
def get_movie_value(soup, field_name):
    '''Look up the value displayed next to a label in boxofficemojo HTML.

    Finds the first text node in ``soup`` matching the regular expression
    ``field_name`` and returns the ``.text`` of that node's next sibling.
    Returns None when the label is not found or has no next sibling.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    # On most pages the value sits in the element right after the label text
    value_node = label.findNextSibling()
    return value_node.text if value_node else None


def get_movie_value_next(soup, field_name):
    '''Return the text of the element following a label in boxofficemojo HTML.

    ``field_name`` is compiled as a regular expression and matched against
    the text nodes of ``soup``. The ``.text`` of the first match's next
    sibling is returned; None is returned if there is no match or no
    sibling.
    '''
    pattern = re.compile(field_name)
    match = soup.find(text=pattern)
    if match:
        neighbour = match.findNextSibling()
        if neighbour:
            return neighbour.text
    return None



#def get_movie_value_next_next(soup, field_name):
#    '''Grab a value from boxofficemojo HTML
#    
#    Takes a string attribute of a movie on the page and
#    returns the string in the next sibling object
#    (the value for that attribute)
#    or None if nothing is found.
#    '''
#    obj = soup.find(text=re.compile(field_name))
#    
#    if obj:
#        return obj.next.next.text
#    else:
#        return None

def get_movie_value_next_next(soup, field_name):
    '''Grab a value located two parse steps after a label in boxofficemojo HTML.

    First replaces every <br> tag in ``soup`` with a newline string (this
    modifies ``soup`` in place), then finds the first text node matching the
    regular expression ``field_name`` and returns ``node.next.next.text``.
    Returns None when no text node matches.
    '''
    # Turn line breaks into "\n" so multi-line values keep their separators
    line_breaks = soup.findAll('br')
    for tag in line_breaks:
        tag.replace_with('\n')

    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    return label.next.next.text

def money_to_int(moneystring):
    """Convert a dollar amount such as '$1,234,567' to an int.

    Dollar signs and thousands separators are removed before conversion;
    anything that is still not an integer literal raises ValueError.
    """
    digits = moneystring
    for symbol in ('$', ','):
        digits = digits.replace(symbol, '')
    return int(digits)


def money_to_int_2(string):
    """Convert a budget string such as '$40 million' or '$27,000' to an int.

    Args:
        string: the raw text, or None when the field was not found.

    Returns:
        The amount as an int. ``np.nan`` is returned when ``string`` is None
        or when what remains after stripping '$', ',' and spaces is not a
        number (for example 'N/A').
    """
    if string is None:
        return np.nan

    cleaned = string.replace('$', '').replace(',', '').replace(' ', '')
    multiplier = 1
    if "million" in cleaned:
        cleaned = cleaned.replace('million', '')
        multiplier = 1000000

    try:
        # round() guards against float artefacts such as 2299999.9999
        return int(round(float(cleaned) * multiplier))
    except (ValueError, OverflowError):
        return np.nan

def money_to_int_3(moneystring):
    """Convert a dollar string that may contain non-breaking spaces to an int.

    Removes '\\xa0', '$' and ',' and converts the remainder with int().
    Returns float('NaN') when the remainder is not an integer literal.
    A None argument raises AttributeError, which the caller handles.
    """
    cleaned = moneystring.replace('\xa0', '').replace('$', '').replace(',', '')
    try:
        return int(cleaned)
    except ValueError:
        return float('NaN')

def runtime_to_minutes(runtimestring):
    """Convert a runtime such as '1 hrs. 30 min.' to a number of minutes.

    The string is split on whitespace; the first token is read as hours and
    the third as minutes. Returns None when the string does not have that
    shape (too few tokens or non-numeric tokens).
    """
    parts = runtimestring.split()
    try:
        return int(parts[0]) * 60 + int(parts[2])
    except (IndexError, ValueError):
        # Only the expected parse failures are swallowed; anything else
        # (e.g. KeyboardInterrupt) propagates.
        return None

def strip_and_return_int(s):
    """Remove '$' and ',' from ``s`` and return the result as an int."""
    return int(s.replace('$', '').replace(',', ''))

def to_date(datestring):
    """Parse ``datestring`` with dateutil's parser and return the result."""
    return dateutil.parser.parse(datestring)

## Define functions to get movie information

In [None]:
def movie_title(soup):
    """Return the movie title taken from the page's <title> element.

    Everything from the first '(' onward is dropped and surrounding
    whitespace is stripped.
    """
    page_title = soup.find('title').text
    before_paren, _, _ = page_title.partition('(')
    return before_paren.strip()
    
def movie_producers(soup):
    """Return the list of producers on a movie page, or NaN if unavailable.

    The text following the 'Producer' label is read with
    get_movie_value_next. Wherever a lowercase letter or parenthesis is
    directly followed by an uppercase letter, ", " is inserted (presumably
    because adjacent names run together in the page text), and the result
    is split on ",". Names after the first therefore keep a leading space.
    """
    try:
        producer = get_movie_value_next(soup, 'Producer')
        # Raw strings: '\g' is not a valid escape in a normal string literal
        producers = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', producer).split(",")
    except (TypeError, AttributeError):
        # producer is None when the label is missing
        producers = float('NaN')

    return producers
        
def movie_directors(soup):
    """Return the list of directors on a movie page, or NaN if unavailable.

    The text following the 'Director' label is read with
    get_movie_value_next. Wherever a lowercase letter or parenthesis is
    directly followed by an uppercase letter, ", " is inserted, and the
    result is split on ",". Names after the first keep a leading space.
    """
    try:
        director = get_movie_value_next(soup, 'Director')
        # Raw strings: '\g' is not a valid escape in a normal string literal
        director = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', director).split(",")
    except (TypeError, AttributeError):
        # Same failure handling as movie_producers: missing label -> NaN
        director = float('NaN')

    return director
    

def movie_actors(soup): # still doesn't deal well with McPersons, DeVitos, DaSouzas, etc.
    """Return the list of actor names on a movie page, or None if unavailable.

    The text found via get_movie_value_next_next for the 'Actor' label has
    '*', ' (Cameo)' and '(Voice)' removed and newlines turned into ', ';
    it is then split on ', '.

    None is returned when the label is missing or the page structure around
    it is not what get_movie_value_next_next expects.
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor') # try get_movie_value_next if it doesnt work
        if actor is None:
            return None
        actors = (actor.replace('*', '')
                       .replace(' (Cameo)', '')
                       .replace('(Voice)', '')
                       .replace('\n', ', '))
        return actors.split(", ")
    except (TypeError, AttributeError):
        # AttributeError: the node two steps after the label does not exist
        # or has no .text
        return None

def movie_genre(soup):
    """Return the genre text from a movie page.

    The label pattern 'Genre[^a-z]' requires the character after "Genre"
    not to be a lowercase letter (so e.g. "Genres" is not matched).
    Returns None when get_movie_value finds nothing, and NaN if the lookup
    itself fails.
    """
    try:
        genre = get_movie_value(soup, 'Genre[^a-z]')
    except Exception:
        # Best-effort field: a lookup failure must not stop the scrape,
        # but KeyboardInterrupt/SystemExit still propagate.
        genre = float('NaN')
    return genre

def movie_rating(soup):
    """Return the text next to the 'MPAA Rating' label on a movie page.

    Returns None when get_movie_value finds nothing, and NaN if the lookup
    itself fails.
    """
    try:
        rating = get_movie_value(soup, 'MPAA Rating')
    except Exception:
        # Best-effort field: a lookup failure must not stop the scrape,
        # but KeyboardInterrupt/SystemExit still propagate.
        rating = float('NaN')
    return rating

#def release_date(soup):
#    try:
#        raw_release_date = get_movie_value(soup,'Release Date')
#        if raw_release_date != 'TBD' and raw_release_date != 'N/A' and raw_release_date[0].isnumeric() :
#            release_date = to_date(raw_release_date)
#        else:
#            release_date = float('NaN')
#    except AttributeError:
#        release_date = float('NaN')
#    
#    return release_date

def release_date(soup):
    """Return the release date of a movie page as parsed by to_date, or NaN.

    NaN is returned when:
      * the 'Release Date' label is missing,
      * the value is 'TBD' or 'N/A',
      * the value starts with a season name (Spring/Summer/Fall/Winter),
      * the value is empty or cannot be parsed as a date.
    """
    not_a_date = ('TBD', 'N/A')
    seasons = ('Spring', 'Summer', 'Fall', 'Winter')
    try:
        raw_release_date = get_movie_value(soup, 'Release Date')
        first_word = raw_release_date.split()[0]
        if raw_release_date in not_a_date or first_word in seasons:
            return float('NaN')
        return to_date(raw_release_date)
    except (AttributeError, IndexError, ValueError, OverflowError):
        # AttributeError: label missing (value is None)
        # IndexError: value is empty/whitespace only
        # ValueError/OverflowError: the date parser rejected the text
        return float('NaN')
    
def domestic_gross(soup):
    """Return the domestic total gross of a movie page as an int, or NaN.

    NaN is returned when the 'Domestic Total' label is missing or its value
    is not a dollar amount that money_to_int can convert.
    """
    try:
        raw_domestic_total_gross = get_movie_value(soup, 'Domestic Total')
        domestic_total_gross = money_to_int(raw_domestic_total_gross)
    except (AttributeError, ValueError):
        # AttributeError: label missing (value is None)
        # ValueError: value present but not numeric
        domestic_total_gross = float('NaN')

    return domestic_total_gross
    
def opening_weekend_gross(soup):
    """Return the opening weekend gross of a movie page.

    The text after the 'Opening\\xa0Weekend:' label (note the non-breaking
    space) is passed to money_to_int_3 and that function's result is
    returned. NaN is returned if an AttributeError occurs, e.g. because the
    label is missing.
    """
    try:
        raw_value = get_movie_value_next(soup, 'Opening\xa0Weekend:')
        return money_to_int_3(raw_value)
    except AttributeError:
        return float('NaN')
    
def production_budget(soup):
    """Return the production budget of a movie page.

    The text after the 'Production Budget' label is converted with
    money_to_int_2. NaN is returned if an AttributeError occurs.
    """
    try:
        raw_budget = get_movie_value(soup, 'Production Budget')
        return money_to_int_2(raw_budget)
    except AttributeError:
        return float('NaN')
    
def runtime(soup):
    """Return the runtime of a movie page in minutes.

    The text after the 'Runtime' label is converted with
    runtime_to_minutes. NaN is returned if an AttributeError occurs, e.g.
    because the label is missing.
    """
    try:
        return runtime_to_minutes(get_movie_value(soup, 'Runtime'))
    except AttributeError:
        return float('NaN')
    
def widest_release(soup):
    """Return the widest release figure of a movie page as an int, or NaN.

    Takes the first whitespace-separated token of the text found via
    get_movie_value_next_next for the 'Widest' label and converts it with
    strip_and_return_int. NaN is returned when the label is missing, the
    text is empty, or the token is not numeric.
    """
    try:
        raw_widest = get_movie_value_next_next(soup, 'Widest')
        return strip_and_return_int(raw_widest.split()[0])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: label/value missing
        # IndexError: value is empty
        # ValueError: first token is not a number
        return float('NaN')

def distributor(soup):
    """Return the distributor name from a movie page, or NaN if unavailable.

    Looks inside the element with id "body" for the first text node
    matching "Distributor" and returns the ``.text`` of its next sibling.
    NaN is returned when the body element, the label, or the sibling is
    missing, so the result is always either a string or NaN.
    """
    missing = float('NaN')
    try:
        body = soup.find(id="body")
        if body is None:
            return missing
        label = body.find(text=re.compile("Distributor"))
        if label is None:
            return missing
        value_node = label.findNextSibling()
        if value_node is None:
            return missing
        return value_node.text
    except LookupError:
        return missing


## MOVIE SCRAPER

In [None]:
def movie_scraper(all_movies):
    """Download every movie page in ``all_movies`` and collect its details.

    Args:
        all_movies: sequence of link strings; each is appended to
            "http://www.boxofficemojo.com/" to form the page URL.

    The sequence is processed in about ten blocks, sleeping 10 seconds
    before each block and 0.1 seconds after every request.

    Returns:
        OrderedDict mapping movie title to a 13-item list, in this order:
        link, genre, release date, distributor, runtime, rating,
        production budget, domestic gross, actors, opening weekend gross,
        widest release, producers, directors.
        Two movies with the same title share a key; the later one wins.

    If a request raises a requests exception, the error is printed and the
    data gathered up to that point is returned.
    """
    total = len(all_movies)
    # About ten blocks. The step is at least 1 so short lists work, and the
    # final boundary is `total` (slice ends are exclusive) so the last movie
    # is included.
    block_size = max(1, total // 10)
    movie_blocks = list(range(0, total, block_size))
    movie_blocks.append(total)

    print("The length of urls_chunks, minus one is %s" % str(len(movie_blocks)-1))

    movie_data_list = OrderedDict()
    count = 0
    try:
        for block_index in range(len(movie_blocks) - 1):

            print ("URL chunk ... %s of %s in 10 seconds ... \n" % (str(block_index), str(len(movie_blocks) - 1)))

            # Pause between blocks to go easy on the server
            time.sleep(10)

            start = movie_blocks[block_index]
            stop = movie_blocks[block_index + 1]
            print ("Fetching URLs in movie_list location: %s to %s \n" % (start, stop))
            for movie in all_movies[start:stop]:
                count += 1
                url = "http://www.boxofficemojo.com/" + movie
                response = requests.get(url)
                time.sleep(0.1)
                soup = BeautifulSoup(response.text, "lxml")
                print(count, movie)
                movie_data_list[movie_title(soup)] = [
                    movie, movie_genre(soup), release_date(soup),
                    distributor(soup), runtime(soup),
                    movie_rating(soup), production_budget(soup),
                    domestic_gross(soup), movie_actors(soup),
                    opening_weekend_gross(soup), widest_release(soup),
                    movie_producers(soup), movie_directors(soup),
                ]

            print("continuing in 10 seconds ...")
    except requests.exceptions.RequestException as e:
        # Keep what has been scraped so far; the caller can resume from a
        # later offset in all_movies.
        print(e)

    return movie_data_list


## SCRAPING the data!

In [None]:
dict_1 = movie_scraper(all_movies)

In [None]:
df = pd.DataFrame(dict_1)

In [None]:
df.to_pickle('001.pkl')

In [None]:
dict_2 = movie_scraper(all_movies[1522:])

In [None]:
df2 = pd.DataFrame(dict_2)

In [None]:
df2.to_pickle('002.pkl')

In [None]:
dict_3 = movie_scraper(all_movies[7200:])

In [None]:
df3 = pd.DataFrame(dict_3)

In [None]:
df3.to_pickle('003.pkl')

In [None]:
dict_4 = movie_scraper(all_movies[10284:])

In [None]:
df4 = pd.DataFrame(dict_4)

In [None]:
df4.to_pickle('004.pkl')

In [None]:
dict_5 = movie_scraper(all_movies[15372:])

In [None]:
df5 = pd.DataFrame(dict_5)

In [None]:
df5.to_pickle('005.pkl')

In [None]:
dict_6 = movie_scraper(all_movies[15881:])

In [None]:
df6 = pd.DataFrame(dict_6)

In [None]:
df6.to_pickle('006.pkl')

## Transpose the DataFrames

In [None]:
# Each scraped dict maps title -> list of fields, so the DataFrames built from
# them have one column per movie; flip them to one row per movie.
df, df2, df3, df4, df5, df6 = (
    frame.T for frame in (df, df2, df3, df4, df5, df6)
)

In [None]:
# Name the 13 scraped fields on EVERY frame before concatenating. Naming only
# `df` would leave df2..df6 with integer column labels, and pd.concat would
# then produce the union of both label sets instead of 13 aligned columns.
df.columns = df2.columns = df3.columns = df4.columns = df5.columns = df6.columns = [
    'movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
    'budget', 'domestic gross', 'actors', '1st weekend gross',
    'widest release', 'producers', 'directors',
]

In [None]:
df_all = pd.concat([df, df2, df3, df4, df5, df6])

In [None]:
df_all.shape
df_all.columns = ['movie', 'genre','release date', 'distributor', 'runtime', 'rating', 'budget', 'domestic gross', 'actors', '1st weekend gross', 'widest release', 'producers', 'directors']

In [None]:
df_all.head()

In [None]:
df_all['title'] = df_all.index

In [None]:
df_all.head()

In [None]:
df_all.to_pickle('all_movies_20161003.pkl')

In [None]:
DF = df_all.sort_values('release date')

In [None]:
DF.index = range(len(DF))

Oldest movie in data set is from 1919.

In [None]:
DF['release date'][0]

Select movies with release dates in 1990 and beyond.

In [None]:
DF_90s = DF[DF['release date'] > datetime(1989, 12, 31, 0, 0)]

In [None]:
DF_90s.to_pickle('movies_from_the_90s.pkl')

# Work: DATA EXPLORATION

In [1]:
import pandas as pd 

In [2]:
DF_90s = pd.read_pickle('movies_from_the_90s.pkl')

In [3]:
DF_90s['actors_no_nan'] = DF_90s['actors'].fillna('')

In [4]:
def strip_end(s):
    """Remove one trailing role marker from an actor name.

    The markers "(narrator)", "(Narrator)", "(Himself)" and "(Herself)" are
    checked in that order; the first one ``s`` ends with is cut off. At most
    one marker is removed and a space left before it is kept.
    """
    for marker in ("(narrator)", "(Narrator)", "(Himself)", "(Herself)"):
        if s.endswith(marker):
            return s[:-len(marker)]
    return s
# test on: s = "the narrator is (narrator)"

In [5]:
def strip_one_space(s):
    """Drop a single space from the end of ``s``, then one from its start.

    Only one space is removed per side; any further spaces are kept.
    """
    trimmed = s[:-1] if s.endswith(" ") else s
    return trimmed[1:] if trimmed.startswith(" ") else trimmed

In [6]:
def unnest(df, col, reset_index=False):
    """Unnest cells of a column that have lists in them.

    Every cell of ``df[col]`` is converted with ``list()`` and each element
    gets its own row; the other columns are repeated for every element.
    Rows whose cell is empty disappear from the result (inner merge).

    Args:
        df: input DataFrame (not modified).
        col: name of the column holding the iterables.
        reset_index: when True the result gets a fresh 0..n-1 index;
            otherwise each row keeps the index label of its source row.

    Returns:
        A new DataFrame with ``col`` holding one element per row.
    """
    import pandas as pd

    # (source row label, element) pairs. Series.items() replaces
    # iteritems(), which pandas 2.0 removed.
    pairs = [(label, element)
             for label, cell in df[col].apply(list).items()
             for element in cell]
    col_flat = pd.DataFrame(pairs, columns=['I', col]).set_index('I')

    # Keyword form: the positional axis argument of drop() is gone in pandas 2.0
    df = df.drop(columns=col)
    df = df.merge(col_flat, left_index=True, right_index=True)
    if reset_index:
        df = df.reset_index(drop=True)
    return df

In [7]:
expected = unnest(DF_90s, 'actors_no_nan')

#### Strip space and things like "(narrator)" at the end of the names

In [8]:
expected['actors_no_nan_2'] = expected['actors_no_nan'].apply(strip_end).apply(strip_one_space)

In [9]:
expected.shape

(27392, 16)

#### Add a `count` column equal to 1 so we can count how many movies each actor has made (in total and by genre)

In [10]:
expected['count'] = 1

In [11]:
expected.shape

(27392, 17)

In [12]:
# movies with SLJ: value counts of distributors
# expected[expected['actors_no_nan_2'] == 'Samuel L. Jackson']['distributor'].value_counts()

In [13]:
expected.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'actors', '1st weekend gross',
       'widest release', 'producers', 'directors', 'title', 'actors_no_nan',
       'actors_no_nan_2', 'count'],
      dtype='object')

In [14]:
# These columns are not used in the analysis below; remove them in place.
expected.drop(
    ['1st weekend gross', 'actors_no_nan', 'producers', 'directors'],
    axis=1,
    inplace=True,
)

In [15]:
expected.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'actors', 'widest release', 'title',
       'actors_no_nan_2', 'count'],
      dtype='object')

#### Genre functions

In [17]:
def comedy(s):
    """Return 1 if 'Comedy' occurs in ``s``, otherwise 0."""
    return int('Comedy' in s)

def drama(s):
    """Return 1 if 'Drama' occurs in ``s``, otherwise 0."""
    return 1 if 'Drama' in s else 0

def animation(s):
    """Return 1 if 'Animation' occurs in ``s``, otherwise 0."""
    return int('Animation' in s)

def thriller(s):
    """Return 1 if 'Thriller' occurs in ``s``, otherwise 0."""
    return 1 if 'Thriller' in s else 0

def unknown(s):
    """Return 1 if 'Unknown' occurs in ``s``, otherwise 0."""
    return int('Unknown' in s)

def action(s):
    """Return 1 if 'Action' occurs in ``s``, otherwise 0."""
    return 1 if 'Action' in s else 0

def horror(s):
    """Return 1 if 'Horror' occurs in ``s``, otherwise 0."""
    return int('Horror' in s)

def romance(s):
    """Return 1 if 'Romance' occurs in ``s``, otherwise 0."""
    return 1 if 'Romance' in s else 0

def fantasy(s):
    """Return 1 if 'Fantasy' occurs in ``s``, otherwise 0."""
    return int('Fantasy' in s)

def family(s):
    """Return 1 if 'Family' occurs in ``s``, otherwise 0."""
    return 1 if 'Family' in s else 0

def sports(s):
    """Return 1 if 'Sports' occurs in ``s``, otherwise 0."""
    return int('Sports' in s)

def foreign(s):
    """Return 1 if 'Foreign' occurs in ``s``, otherwise 0."""
    return 1 if 'Foreign' in s else 0

def music(s):
    """Return 1 if 'Music' or 'Concert' occurs in ``s``, otherwise 0."""
    return int('Music' in s or 'Concert' in s)

def western(s):
    """Return 1 if 'Western' occurs in ``s``, otherwise 0."""
    return 1 if 'Western' in s else 0

def crime(s):
    """Return 1 if 'Crime' occurs in ``s``, otherwise 0."""
    return int('Crime' in s)
    
def sci_fi(s):
    """Return 1 if 'Sci-Fi' occurs in ``s``, otherwise 0."""
    return 1 if 'Sci-Fi' in s else 0

def war(s):
    """Return 1 if 'War' occurs in ``s``, otherwise 0."""
    return int('War' in s)
    
def documentary(s):
    """Return 1 if 'Documentary' occurs in ``s``, otherwise 0."""
    return 1 if 'Documentary' in s else 0


In [18]:
# Add one 0/1 indicator column per genre, derived from the 'genre' text.
# The pairs are listed in the order the columns should appear.
genre_flaggers = [
    ('genre_comedy', comedy),
    ('genre_drama', drama),
    ('genre_animation', animation),
    ('genre_thriller', thriller),
    ('genre_unknown', unknown),
    ('genre_action', action),
    ('genre_horror', horror),
    ('genre_romance', romance),
    ('genre_fantasy', fantasy),
    ('genre_family', family),
    ('genre_sports', sports),
    ('genre_foreign', foreign),
    ('genre_music', music),
    ('genre_western', western),
    ('genre_crime', crime),
    ('genre_sci_fi', sci_fi),
    ('genre_war', war),
    ('genre_documentary', documentary),
]
for column_name, flagger in genre_flaggers:
    expected[column_name] = expected['genre'].apply(flagger)

In [19]:
expected.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'actors', 'widest release', 'title',
       'actors_no_nan_2', 'count', 'genre_comedy', 'genre_drama',
       'genre_animation', 'genre_thriller', 'genre_unknown', 'genre_action',
       'genre_horror', 'genre_romance', 'genre_fantasy', 'genre_family',
       'genre_sports', 'genre_foreign', 'genre_music', 'genre_western',
       'genre_crime', 'genre_sci_fi', 'genre_war', 'genre_documentary'],
      dtype='object')

# FIX THIS; 6 PM tuesday 

In [None]:
#def SLJ(a):
#    for i in a:
#        if 'Samuel L. Jackson' in i:  
#            return 1
#        else:
#            return 0

In [21]:
def SLJ(a):
    """Return 1 if Samuel L. Jackson appears in ``a``, otherwise 0.

    ``a`` is tested with ``in`` against the plain name and against the name
    followed by ' (Narrator)', ' (narrator)' or ' (Himself)'. For a list of
    names that means an exact element match; for a string, a substring
    match.
    """
    credited_as = (
        'Samuel L. Jackson',
        'Samuel L. Jackson (Narrator)',
        'Samuel L. Jackson (narrator)',
        'Samuel L. Jackson (Himself)',
    )
    return 1 if any(name in a for name in credited_as) else 0

In [22]:
expected['SLJ']  = expected['actors'].apply(SLJ)

In [23]:
expected.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'actors', 'widest release', 'title',
       'actors_no_nan_2', 'count', 'genre_comedy', 'genre_drama',
       'genre_animation', 'genre_thriller', 'genre_unknown', 'genre_action',
       'genre_horror', 'genre_romance', 'genre_fantasy', 'genre_family',
       'genre_sports', 'genre_foreign', 'genre_music', 'genre_western',
       'genre_crime', 'genre_sci_fi', 'genre_war', 'genre_documentary', 'SLJ'],
      dtype='object')

In [24]:
del expected['actors']

In [25]:
expected.shape

(27392, 31)

In [26]:
expected = expected.drop_duplicates()

In [27]:
expected.shape

(23050, 31)

In [28]:
expected.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'widest release', 'title',
       'actors_no_nan_2', 'count', 'genre_comedy', 'genre_drama',
       'genre_animation', 'genre_thriller', 'genre_unknown', 'genre_action',
       'genre_horror', 'genre_romance', 'genre_fantasy', 'genre_family',
       'genre_sports', 'genre_foreign', 'genre_music', 'genre_western',
       'genre_crime', 'genre_sci_fi', 'genre_war', 'genre_documentary', 'SLJ'],
      dtype='object')

In [29]:
#expected_action = expected[expected['genre_action'] == 1]

#expected_action.shape

#df_converted_action = expected_action.convert_objects(convert_dates = True, convert_numeric = True)

In [30]:
# Coerce object columns to dates / numbers where possible.
# NOTE(review): DataFrame.convert_objects is deprecated and is no longer
# available in pandas >= 1.0, so this cell only runs on old pandas. Porting it
# needs care: the groupby output below shows a numeric 'title' column, i.e. the
# coercion also hit titles that look like numbers, which is why 'title' is
# copied back from `expected` further down.
df_converted = expected.convert_objects(convert_dates = True, convert_numeric = True)

  if __name__ == '__main__':


In [31]:
#df_SLJ_action_effect = df_converted_action.groupby('SLJ')
#df_SLJ_action_effect.mean()

In [32]:
#df_SLJ_action_effect.std()

In [33]:
df_SLJ_effect = df_converted.groupby('SLJ') #groupby object

In [34]:
df_SLJ_effect.mean()

Unnamed: 0_level_0,runtime,budget,domestic gross,widest release,title,count,genre_comedy,genre_drama,genre_animation,genre_thriller,...,genre_fantasy,genre_family,genre_sports,genre_foreign,genre_music,genre_western,genre_crime,genre_sci_fi,genre_war,genre_documentary
SLJ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,107.92653,58920190.0,48588990.0,1714.538978,468.232558,1.0,0.336899,0.308298,0.049777,0.112904,...,0.032006,0.033597,0.018302,0.010212,0.015826,0.009195,0.056143,0.04235,0.011892,0.005349
1,119.958435,108020800.0,122702600.0,2463.868486,1163.8,1.0,0.13986,0.172494,0.0,0.191142,...,0.04662,0.0,0.013986,0.0,0.006993,0.04662,0.109557,0.04662,0.0,0.002331


In [35]:
df_SLJ_effect.std()

Unnamed: 0_level_0,runtime,budget,domestic gross,widest release,title,count,genre_comedy,genre_drama,genre_animation,genre_thriller,...,genre_fantasy,genre_family,genre_sports,genre_foreign,genre_music,genre_western,genre_crime,genre_sci_fi,genre_war,genre_documentary
SLJ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,18.571052,52420410.0,76992740.0,1385.671953,752.699739,0.0,0.472661,0.4618,0.217488,0.316482,...,0.176019,0.180194,0.134043,0.100538,0.124805,0.095451,0.230202,0.201391,0.108401,0.072943
1,20.747448,72199030.0,154661200.0,1408.437333,546.0478,0.0,0.347247,0.37825,0.0,0.39366,...,0.21107,0.0,0.11757,0.0,0.083429,0.21107,0.312702,0.21107,0.0,0.04828


In [36]:
sum(df_converted['budget'].value_counts())

11244

In [37]:
#sum(df_converted_action['budget'].value_counts())

In [38]:
df_converted['budget'].value_counts()

40000000.0     528
30000000.0     490
20000000.0     445
60000000.0     398
35000000.0     381
25000000.0     362
50000000.0     358
80000000.0     298
100000000.0    279
150000000.0    270
15000000.0     255
70000000.0     233
75000000.0     231
65000000.0     185
12000000.0     170
45000000.0     166
10000000.0     165
85000000.0     161
200000000.0    151
90000000.0     150
55000000.0     145
110000000.0    135
7000000.0      123
22000000.0     121
18000000.0     121
28000000.0     117
17000000.0     115
6000000.0      113
5000000.0      113
8000000.0      112
              ... 
86000000.0       2
850000.0         2
33100000.0       2
10800000.0       2
2200000.0        2
17700000.0       2
9500000.0        2
27000.0          2
6400000.0        2
8300000.0        2
123000000.0      2
19100000.0       2
2700000.0        2
11500000.0       1
250000.0         1
1300000.0        1
100000.0         1
10200000.0       1
127500000.0      1
3400000.0        1
11800000.0       1
40000.0     

In [39]:
#df_converted_action['budget'].value_counts()

In [40]:
df_converted.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'widest release', 'title',
       'actors_no_nan_2', 'count', 'genre_comedy', 'genre_drama',
       'genre_animation', 'genre_thriller', 'genre_unknown', 'genre_action',
       'genre_horror', 'genre_romance', 'genre_fantasy', 'genre_family',
       'genre_sports', 'genre_foreign', 'genre_music', 'genre_western',
       'genre_crime', 'genre_sci_fi', 'genre_war', 'genre_documentary', 'SLJ'],
      dtype='object')

In [41]:
#df_converted_action.columns

In [42]:
df_converted['Year'] = df_converted['release date'].map(lambda x: x.year)
df_converted['Month'] = df_converted['release date'].map(lambda x: x.month)

In [43]:
#df_converted_action['Year'] = df_converted_action['release date'].map(lambda x: x.year)
#df_converted_action['Month'] = df_converted_action['release date'].map(lambda x: x.month)

In [44]:
df_converted.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'widest release', 'title',
       'actors_no_nan_2', 'count', 'genre_comedy', 'genre_drama',
       'genre_animation', 'genre_thriller', 'genre_unknown', 'genre_action',
       'genre_horror', 'genre_romance', 'genre_fantasy', 'genre_family',
       'genre_sports', 'genre_foreign', 'genre_music', 'genre_western',
       'genre_crime', 'genre_sci_fi', 'genre_war', 'genre_documentary', 'SLJ',
       'Year', 'Month'],
      dtype='object')

In [45]:
#df_converted_action.columns

In [46]:
print (sum((df_converted['budget'].value_counts())))
print (len((df_converted['budget'].value_counts())))

11244
230


In [47]:
#print (sum((df_converted_action['budget'].value_counts())))
#print (len((df_converted_action['budget'].value_counts())))

# i am here

In [48]:
df_converted['rating'].value_counts()

R                10896
PG-13             7648
PG                2744
Unrated            891
Not Yet Rated      383
G                  326
Unknown            123
NC-17               39
Name: rating, dtype: int64

In [49]:
def rating_R(s):
    """Return 1 if the rating is exactly 'R', otherwise 0.

    Exact comparison (ignoring surrounding whitespace) is used because a
    substring test for 'R' also matches 'Not Yet Rated'. Non-string input
    (e.g. NaN) yields 0.
    """
    return int(isinstance(s, str) and s.strip() == 'R')

def rating_PG_13(s):
    """Return 1 if 'PG-13' occurs in ``s``, otherwise 0."""
    return int('PG-13' in s)

def rating_PG(s):
    """Return 1 if the rating is exactly 'PG', otherwise 0.

    Exact comparison (ignoring surrounding whitespace) is used because a
    substring test for 'PG' also matches 'PG-13'. Non-string input
    (e.g. NaN) yields 0.
    """
    return int(isinstance(s, str) and s.strip() == 'PG')
    
def unrated(s):
    """Return 1 if 'Unrated' occurs in ``s``, otherwise 0."""
    return 1 if 'Unrated' in s else 0

def not_yet_rated(s):
    """Return 1 if 'Not Yet Rated' occurs in ``s``, otherwise 0."""
    return int('Not Yet Rated' in s)

def rating_G(s):
    """Return 1 if the rating is exactly 'G', otherwise 0.

    Exact comparison (ignoring surrounding whitespace) is used because a
    substring test for 'G' also matches 'PG' and 'PG-13'. Non-string input
    (e.g. NaN) yields 0.
    """
    return int(isinstance(s, str) and s.strip() == 'G')

def unknown(s):
    """Return 1 if 'Unknown' occurs in ``s``, otherwise 0."""
    return 1 if 'Unknown' in s else 0

def rating_NC_17(s):
    """Return 1 if 'NC-17' occurs in ``s``, otherwise 0."""
    return int('NC-17' in s)

In [50]:
df_converted.shape

(23050, 33)

In [51]:
#df_converted_action.shape

In [52]:
df_converted.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'widest release', 'title',
       'actors_no_nan_2', 'count', 'genre_comedy', 'genre_drama',
       'genre_animation', 'genre_thriller', 'genre_unknown', 'genre_action',
       'genre_horror', 'genre_romance', 'genre_fantasy', 'genre_family',
       'genre_sports', 'genre_foreign', 'genre_music', 'genre_western',
       'genre_crime', 'genre_sci_fi', 'genre_war', 'genre_documentary', 'SLJ',
       'Year', 'Month'],
      dtype='object')

In [53]:
df_converted['title'] = expected['title']

In [54]:
#df_converted_action['title'] = expected_action['title']

In [55]:
df_converted.shape

(23050, 33)

In [56]:
df_converted.columns

Index(['movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
       'budget', 'domestic gross', 'widest release', 'title',
       'actors_no_nan_2', 'count', 'genre_comedy', 'genre_drama',
       'genre_animation', 'genre_thriller', 'genre_unknown', 'genre_action',
       'genre_horror', 'genre_romance', 'genre_fantasy', 'genre_family',
       'genre_sports', 'genre_foreign', 'genre_music', 'genre_western',
       'genre_crime', 'genre_sci_fi', 'genre_war', 'genre_documentary', 'SLJ',
       'Year', 'Month'],
      dtype='object')

In [95]:
df_action = df_converted[df_converted['genre_action'] == 1]
df_comedy = df_converted[df_converted['genre_comedy'] == 1]

In [98]:
df_action.groupby('distributor').mean().sort_values(by = 'domestic gross', ascending = False).reset_index()

Unnamed: 0,distributor,runtime,budget,domestic gross,widest release,count,genre_comedy,genre_drama,genre_animation,genre_thriller,...,genre_foreign,genre_music,genre_western,genre_crime,genre_sci_fi,genre_war,genre_documentary,SLJ,Year,Month
0,Paramount (DreamWorks),135.236842,1.524737e+08,2.747012e+08,3911.763158,1.0,0.315789,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.684211,0.0,0.0,0.000000,2008.842105,6.868421
1,Buena Vista,125.323529,1.555179e+08,2.226884e+08,3296.184874,1.0,0.037313,0.089552,0.0,0.052239,...,0.000000,0.0,0.0,0.007463,0.089552,0.0,0.0,0.171642,2007.716418,6.589552
2,Warner Bros. (New Line),114.000000,1.100000e+08,1.551908e+08,3812.000000,1.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,2015.000000,5.000000
3,Paramount,117.432099,1.130959e+08,1.330382e+08,3383.629630,1.0,0.073359,0.030888,0.0,0.104247,...,0.000000,0.0,0.0,0.065637,0.077220,0.0,0.0,0.250965,2007.193050,5.880309
4,Fox,114.450794,9.800542e+07,1.228495e+08,3241.704762,1.0,0.085890,0.021472,0.0,0.092025,...,0.000000,0.0,0.0,0.015337,0.134969,0.0,0.0,0.033742,2007.996933,5.711656
5,Sony / Columbia,115.072368,1.080573e+08,1.173655e+08,3250.519737,1.0,0.319620,0.075949,0.0,0.075949,...,0.000000,0.0,0.0,0.022152,0.148734,0.0,0.0,0.082278,2007.905063,6.218354
6,Lionsgate,111.538462,7.437748e+07,1.129583e+08,3090.128205,1.0,0.146226,0.094340,0.0,0.070755,...,0.000000,0.0,0.0,0.037736,0.000000,0.0,0.0,0.018868,2012.316038,7.438679
7,Warner Bros.,120.282776,1.219899e+08,1.093998e+08,3096.606684,1.0,0.159145,0.064133,0.0,0.171021,...,0.000000,0.0,0.0,0.000000,0.130641,0.0,0.0,0.045131,2005.992874,6.007126
8,Universal,116.040134,9.193220e+07,9.766187e+07,2917.953177,1.0,0.204403,0.050314,0.0,0.220126,...,0.003145,0.0,0.0,0.040881,0.053459,0.0,0.0,0.000000,2007.270440,6.276730
9,MGM,113.659091,8.978947e+07,7.323382e+07,2569.340909,1.0,0.090909,0.000000,0.0,0.204545,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,1997.863636,8.477273


In [99]:
df_comedy.groupby('distributor').mean().sort_values(by = 'domestic gross', ascending = False).reset_index()

Unnamed: 0,distributor,runtime,budget,domestic gross,widest release,count,genre_comedy,genre_drama,genre_animation,genre_thriller,...,genre_foreign,genre_music,genre_western,genre_crime,genre_sci_fi,genre_war,genre_documentary,SLJ,Year,Month
0,Warner Bros. (New Line),109.952703,4.540000e+07,7.897466e+07,3295.722973,1.0,1.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,2011.993671,5.797468
1,Paramount (DreamWorks),104.773585,6.267442e+07,7.801847e+07,3049.547170,1.0,1.0,0.094340,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,2008.207547,5.094340
2,Sony / Columbia,103.599653,6.294343e+07,7.057584e+07,2660.568458,1.0,1.0,0.117040,0.0,0.010327,...,0.0,0.0,0.000000,0.013769,0.044750,0.0,0.0,0.017212,2005.287435,7.015491
3,Universal,106.399449,4.629194e+07,6.696931e+07,2577.530303,1.0,1.0,0.069767,0.0,0.000000,...,0.0,0.0,0.009576,0.016416,0.020520,0.0,0.0,0.009576,2005.619699,6.324213
4,Sony (Revolution),99.417722,5.280435e+07,6.530877e+07,2898.962025,1.0,1.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.063291,0.000000,0.0,0.0,0.000000,2003.658228,6.392405
5,Dimension Films,98.720000,5.277273e+07,6.421941e+07,2919.680000,1.0,1.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.120000,0.000000,0.0,0.0,0.000000,2002.640000,8.280000
6,Fox,101.091255,4.133624e+07,5.632193e+07,2615.954373,1.0,1.0,0.083636,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.021818,0.0,0.0,0.005455,2005.645455,7.274545
7,Warner Bros.,104.815182,4.968159e+07,5.631534e+07,2564.938944,1.0,1.0,0.102439,0.0,0.014634,...,0.0,0.0,0.014634,0.112195,0.029268,0.0,0.0,0.004878,2004.573984,7.367480
8,Weinstein / Dimension,86.225806,3.114286e+07,5.359066e+07,2315.925926,1.0,1.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,2010.125000,5.187500
9,Lionsgate/Summit,116.000000,8.400000e+07,5.326256e+07,3016.000000,1.0,1.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.307692,2014.230769,7.307692


In [57]:
df_sum_actor = df_converted.groupby('actors_no_nan_2', sort = False).sum()

In [59]:
#df_sum_action_actor = df_converted_action.groupby('actors_no_nan_2', sort = False).sum()

In [60]:
df_sum_actor = df_sum_actor.reset_index()

In [61]:
#df_sum_action_actor = df_sum_action_actor.reset_index()

In [62]:
df_sum_actor.sort_values(by = 'count', ascending = False)

Unnamed: 0,index,actors_no_nan_2,runtime,budget,domestic gross,widest release,count,genre_comedy,genre_drama,genre_animation,...,genre_foreign,genre_music,genre_western,genre_crime,genre_sci_fi,genre_war,genre_documentary,SLJ,Year,Month
101,101,Jr.,11673.0,3.534000e+09,6.345186e+09,191055.0,106,36,35,4,...,1,2,1,9,2,1,0,5,212343,693
37,37,Samuel L. Jackson,9813.0,3.408500e+09,5.742738e+09,164325.0,91,16,20,3,...,0,1,2,12,3,0,1,88,182244,590
81,81,Bruce Willis,6598.0,2.349300e+09,3.188203e+09,134052.0,61,18,10,3,...,0,0,0,9,5,1,0,3,122133,440
53,53,Stanley Tucci,6278.0,2.108500e+09,3.511587e+09,99736.0,59,23,17,3,...,0,1,0,6,1,0,0,2,118187,455
21,21,John Leguizamo,5901.0,1.376000e+09,2.156342e+09,97938.0,59,17,16,6,...,0,1,0,6,0,0,0,0,118216,383
13,13,Robert DeNiro,6422.0,1.505400e+09,2.581228e+09,112687.0,58,22,20,1,...,0,0,0,14,0,0,0,2,116206,464
55,55,Julianne Moore,6199.0,1.288500e+09,1.983399e+09,71290.0,57,19,20,0,...,0,1,0,2,5,0,0,0,114220,404
128,128,Christopher Walken,5644.0,1.110200e+09,1.609224e+09,74856.0,56,26,9,1,...,0,3,0,13,1,0,0,2,112093,401
42,42,Willem Dafoe,5726.0,1.507200e+09,2.101273e+09,64019.0,54,7,13,2,...,0,1,0,6,2,0,1,2,108227,371
383,383,Paul Giamatti,6071.0,1.430000e+09,2.452910e+09,89874.0,54,23,19,5,...,0,2,0,2,4,1,0,1,108328,378


In [63]:
#df_sum_action_actor.sort_values(by = 'count', ascending = False)

In [64]:
# "Jr." shows up as its own "actor" in the table above (row label 101) --
# presumably a fragment left over from splitting a name such as
# "Robert Downey, Jr." on commas (TODO confirm).  Remove it by name rather than
# by a hard-coded row label: the label depends on the order of the groupby
# output, and drop(101) raises KeyError when this cell is run a second time.
df_sum_actor = df_sum_actor[df_sum_actor['actors_no_nan_2'] != 'Jr.']

In [65]:
#df_sum_action_actor = df_sum_action_actor.drop(17)

In [66]:
df_sum_actor.shape

(2555, 28)

In [67]:
#df_sum_action_actor.shape

In [68]:
df_actors_by_count = df_sum_actor.sort_values('count', ascending = False)

df_actors_by_life_gross = df_sum_actor.sort_values('domestic gross', ascending = False)

In [69]:
#df_action_actors_by_count = df_sum_action_actor.sort_values('count', ascending = False)
#df_action_actors_by_life_gross = df_sum_action_actor.sort_values('domestic gross', ascending = False)

In [70]:
df_actors_by_count = df_actors_by_count.reset_index()


In [72]:
#df_action_actors_by_count = df_action_actors_by_count.reset_index()

In [73]:
#df_actors_by_life_gross = df_actors_by_life_gross.reset_index()

ValueError: cannot insert level_0, already exists

In [None]:
#df_action_actors_by_life_gross = df_action_actors_by_life_gross.reset_index()

In [75]:
del df_actors_by_count['index']

In [76]:
del df_actors_by_life_gross['index']

In [77]:
#del df_action_actors_by_count['index']

In [78]:
#del df_action_actors_by_life_gross['index']

In [79]:
#df_action_actors_by_count.head(10)

In [80]:
#df_action_actors_by_life_gross.head(10)

In [82]:
del df_actors_by_count['level_0']

In [83]:
df_actors_by_count.head(10)

Unnamed: 0,actors_no_nan_2,runtime,budget,domestic gross,widest release,count,genre_comedy,genre_drama,genre_animation,genre_thriller,...,genre_foreign,genre_music,genre_western,genre_crime,genre_sci_fi,genre_war,genre_documentary,SLJ,Year,Month
0,Samuel L. Jackson,9813.0,3408500000.0,5742738000.0,164325.0,91,16,20,3,21,...,0,1,2,12,3,0,1,88,182244,590
1,Bruce Willis,6598.0,2349300000.0,3188203000.0,134052.0,61,18,10,3,13,...,0,0,0,9,5,1,0,3,122133,440
2,Stanley Tucci,6278.0,2108500000.0,3511587000.0,99736.0,59,23,17,3,7,...,0,1,0,6,1,0,0,2,118187,455
3,John Leguizamo,5901.0,1376000000.0,2156342000.0,97938.0,59,17,16,6,12,...,0,1,0,6,0,0,0,0,118216,383
4,Robert DeNiro,6422.0,1505400000.0,2581228000.0,112687.0,58,22,20,1,12,...,0,0,0,14,0,0,0,2,116206,464
5,Julianne Moore,6199.0,1288500000.0,1983399000.0,71290.0,57,19,20,0,9,...,0,1,0,2,5,0,0,0,114220,404
6,Christopher Walken,5644.0,1110200000.0,1609224000.0,74856.0,56,26,9,1,1,...,0,3,0,13,1,0,0,2,112093,401
7,Paul Giamatti,6071.0,1430000000.0,2452910000.0,89874.0,54,23,19,5,4,...,0,2,0,2,4,1,0,1,108328,378
8,Willem Dafoe,5726.0,1507200000.0,2101273000.0,64019.0,54,7,13,2,12,...,0,1,0,6,2,0,1,2,108227,371
9,William H. Macy,5569.0,858000000.0,1489120000.0,64331.0,53,17,21,3,11,...,0,1,0,7,1,0,0,0,106111,409


In [85]:
del df_actors_by_life_gross['level_0']

In [86]:
df_actors_by_life_gross.head(10)

Unnamed: 0,actors_no_nan_2,runtime,budget,domestic gross,widest release,count,genre_comedy,genre_drama,genre_animation,genre_thriller,...,genre_foreign,genre_music,genre_western,genre_crime,genre_sci_fi,genre_war,genre_documentary,SLJ,Year,Month
0,Samuel L. Jackson,9813.0,3408500000.0,5742738000.0,164325.0,91,16,20,3,21,...,0,1,2,12,3,0,1,88,182244,590
1,Robert Downey,5411.0,2051000000.0,3949538000.0,84894.0,48,16,12,1,4,...,1,1,0,2,0,0,0,4,96164,339
2,Elizabeth Banks,4366.0,1656100000.0,3791448000.0,84591.0,41,17,10,1,2,...,0,1,0,0,2,0,0,1,82376,257
3,Stanley Tucci,6278.0,2108500000.0,3511587000.0,99736.0,59,23,17,3,7,...,0,1,0,6,1,0,0,2,118187,455
4,Alan Rickman,4048.0,1637000000.0,3457901000.0,62169.0,33,7,6,0,3,...,0,2,1,1,1,0,0,0,66107,268
5,Johnny Depp,5340.0,2996500000.0,3360845000.0,98434.0,49,9,10,2,5,...,0,3,2,4,1,1,2,0,98247,337
6,Warwick Davis,1849.0,1535000000.0,3353207000.0,46702.0,14,1,1,0,0,...,0,1,0,0,2,0,0,0,28080,99
7,Owen Wilson,5029.0,2163000000.0,3350915000.0,113306.0,49,33,2,4,1,...,0,0,2,3,1,0,0,0,98314,369
8,Stellan Skarsgard,5284.0,2083000000.0,3341433000.0,59294.0,44,1,11,0,5,...,3,2,0,0,0,0,0,4,88181,306
9,Morgan Freeman,5532.0,2063000000.0,3324335000.0,111593.0,52,13,13,1,11,...,0,0,1,4,4,0,1,0,104312,336


In [None]:
df_sum_actor.columns

In [None]:
df_sum_actor[df_sum_actor['SLJ'] >= 1]

In [None]:
df_sum_action_SLJ = df_sum_action_actor[df_sum_action_actor['SLJ'] >= 1]

In [None]:
df_sum_action_no_SLJ = df_sum_action_actor[df_sum_action_actor['SLJ'] == 0]

In [None]:
df_sum_SLJ = df_sum_actor[df_sum_actor['SLJ'] >= 1]

In [None]:
df_sum_no_SLJ = df_sum_actor[df_sum_actor['SLJ'] == 0]

In [None]:
%matplotlib inline
# distrib of sum of grosses of actors who are not in movies with SLJ
df_sum_no_SLJ['domestic gross'].hist(bins = 20) 
# distrib of sum of grosses of actors who are in at least 1 movie with SLJ
df_sum_SLJ['domestic gross'].hist(bins = 20);



In [None]:
%matplotlib inline
# distrib of sum of grosses of actors who are not in movies with SLJ
df_sum_action_no_SLJ['domestic gross'].hist(bins = 20) 
# distrib of sum of grosses of actors who are in at least 1 movie with SLJ
df_sum_action_SLJ['domestic gross'].hist(bins = 20);

In [None]:
len(df_sum_action_no_SLJ['domestic gross'])

In [None]:
len(df_sum_action_SLJ['domestic gross'])

In [None]:
len(df_sum_no_SLJ['domestic gross'])

In [None]:
len(df_sum_SLJ['domestic gross'])

In [None]:

# Mean per-actor domestic gross for actors without / with at least one SLJ film.
mean_no_SLJ, mean_SLJ = (
    frame['domestic gross'].mean() for frame in (df_sum_no_SLJ, df_sum_SLJ)
)

# Mean multiplied by the number of rows in each group.
# NOTE(review): this equals the column total only if 'domestic gross' has no
# NaN values, because mean() skips NaN while the row count does not.
no_SLJ = mean_no_SLJ * len(df_sum_no_SLJ)
all_SLJ = mean_SLJ * len(df_sum_SLJ)


In [None]:
mean_action_no_SLJ = df_sum_action_no_SLJ['domestic gross'].mean()
mean_action_SLJ = df_sum_action_SLJ['domestic gross'].mean()

action_no_SLJ = df_sum_action_no_SLJ['domestic gross'].mean()*len(df_sum_action_no_SLJ['domestic gross'])
action_all_SLJ = df_sum_action_SLJ['domestic gross'].mean()*len(df_sum_action_SLJ['domestic gross'])


In [None]:
### Movies with Samuel L. Jackson: Domestic gross, 1990 onwards

In [None]:
mean_SLJ, mean_no_SLJ, mean_SLJ/mean_no_SLJ


In [None]:
### action movies w SLJ
mean_action_SLJ, mean_action_no_SLJ, mean_action_SLJ/mean_action_no_SLJ

In [None]:
df_sum_SLJ.columns

In [None]:
import matplotlib.pyplot as plt
#plt.scatter(df_sum_actor['count'], df_sum_actor['domestic gross'], c = 'y', edgecolors = 'y')
plt.scatter(df_sum_no_SLJ['count'], df_sum_no_SLJ['domestic gross'], c = 'b', edgecolors = 'b')
plt.scatter(df_sum_SLJ['count'], df_sum_SLJ['domestic gross'], c = 'r', edgecolors = 'r')

In [None]:
import matplotlib.pyplot as plt
#plt.scatter(df_sum_actor['count'], df_sum_actor['domestic gross'], c = 'y', edgecolors = 'y')
plt.scatter(df_sum_action_no_SLJ['count'], df_sum_action_no_SLJ['domestic gross'], c = 'b', edgecolors = 'b')
plt.scatter(df_sum_action_SLJ['count'], df_sum_action_SLJ['domestic gross'], c = 'r', edgecolors = 'r')

In [None]:
plt.scatter(df_sum_actor['count'], df_sum_actor['widest release'], edgecolor = 'b')
plt.scatter(df_sum_SLJ['count'], df_sum_SLJ['widest release'], c = 'y', edgecolor = 'y')

In [None]:
plt.scatter(df_sum_action_actor['count'], df_sum_action_actor['widest release'], edgecolor = 'b')
plt.scatter(df_sum_action_SLJ['count'], df_sum_action_SLJ['widest release'], c = 'y', edgecolor = 'y')

In [None]:
# Domestic gross per film for every actor.
df_sum_actor['money_per_movie'] = df_sum_actor['domestic gross'] / df_sum_actor['count']
# df_sum_SLJ was produced by boolean-filtering df_sum_actor, so writing a column
# into it in place triggers pandas' SettingWithCopyWarning.  assign() returns a
# new frame with the extra column, which avoids the ambiguity.
df_sum_SLJ = df_sum_SLJ.assign(
    money_per_movie=df_sum_SLJ['domestic gross'] / df_sum_SLJ['count']
)

In [None]:
plt.scatter(df_sum_actor['count'], df_sum_actor['money_per_movie'], edgecolor = 'b')
plt.scatter(df_sum_SLJ['count'], df_sum_SLJ['money_per_movie'], c = 'y', edgecolor = 'y')

In [None]:
#plt.scatter(df_sum_action_actor['count'], df_sum_action_actor['money_per_movie'], edgecolor = 'b')
#plt.scatter(df_sum_action_SLJ['count'], df_sum_action_SLJ['money_per_movie'], c = 'y', edgecolor = 'y')

In [None]:
df_sum_actor.shape

In [None]:
df_sum_actor.sort_values('domestic gross', ascending = False)

In [None]:
df_sum_SLJ.sort_values('domestic gross', ascending = False)

In [None]:
df_sum_SLJ.shape

In [None]:
df_sum_no_SLJ.shape

In [None]:
df_sum_actor.sort_values(by = 'widest release' , ascending = False)

In [None]:
df_year = df_converted.groupby('Year').mean().reset_index()


In [None]:
df_year.columns

In [None]:
plt.plot(df_year['Year'], df_year['budget'])


In [None]:
plt.plot(df_year['Year'], df_year['Month'])


In [None]:
plt.plot(df_year['Year'], df_year['domestic gross'])


In [None]:
plt.plot(df_year['budget'], df_year['domestic gross'])

In [None]:
df_widest = df_sum_actor.sort_values(by = 'widest release' , ascending = False)

In [None]:
df_actors_by_count = df_sum_actor.sort_values('count', ascending = False)

df_actors_by_life_gross = df_sum_actor.sort_values('domestic gross', ascending = False)

In [None]:
df_actors_by_life_gross.head(10)

In [None]:
df_action_actors_by_life_gross = df_sum_action_actor.sort_values('domestic gross', ascending = False)

In [None]:
df_action_actors_by_life_gross.head(10)

In [None]:
df_sum_actor['money_per_theater'] = df_sum_actor['domestic gross']/df_sum_actor['widest release']

In [None]:
df_sum_actor.sort_values(by = 'money_per_theater', ascending = False)

In [None]:
plt.scatter(df_sum_actor['count'], df_sum_actor['money_per_theater'])

In [None]:
df_sum_actor.head()

In [None]:
df_sum_actor['avg_cost_movie'] = df_sum_actor['budget']/df_sum_actor['count']
df_sum_actor['avg_dom_gross_movie'] = df_sum_actor['domestic gross']/df_sum_actor['count']


In [None]:
plt.scatter(df_sum_actor['avg_cost_movie'], df_sum_actor['avg_dom_gross_movie'])

In [None]:
df_sum_actor['avg_cost_movie'].hist(bins = 100)

In [None]:
df_sum_actor['avg_dom_gross_movie'].hist(bins = 100)

In [None]:
df_sum_actor[df_sum_actor['avg_dom_gross_movie'] >= 100000000]['avg_dom_gross_movie'].hist(bins = 20)

In [None]:
df_sum_actor['avg_cost_movie'].hist(bins = 50)
df_sum_actor[df_sum_actor['avg_dom_gross_movie'] >= 100000000]['avg_cost_movie'].hist(bins = 50)

In [None]:
# All actors, in red.
plt.scatter(
    df_sum_actor['avg_cost_movie'],
    df_sum_actor['avg_dom_gross_movie'],
    c = 'r',
)
# Overlay (default colour) only the actors whose films average at least
# $100M domestic gross.
plt.scatter(
    df_sum_actor.loc[df_sum_actor['avg_dom_gross_movie'] >= 100000000, 'avg_cost_movie'],
    df_sum_actor.loc[df_sum_actor['avg_dom_gross_movie'] >= 100000000, 'avg_dom_gross_movie'],
)
plt.xlabel('Avg cost movie')
plt.ylabel('Avg domestic gross movie')

In [None]:
df_sum_actor['domestic gross'].hist(bins= 100)

In [None]:
df_sum_actor

In [None]:
df_sum_actor.sort_values(by = 'money_per_movie' , ascending = False)

In [None]:
df_sum_actor.dropna().sort_values(by = 'count', ascending = False)

In [None]:
# /movies/?id=punchline.htm

In [None]:
df.to_pickle('movies_1_v2.pkl')

In [None]:
df.shape

In [None]:
dict_2 = movie_scraper(all_movies[11924:])

In [None]:
df2 = pd.DataFrame(dict_2)

In [None]:
# Flip the frame so that each scraped movie becomes one row.
df2 = df2.T
df2.columns = [
    'movie', 'genre', 'release date', 'distributor', 'runtime', 'rating',
    'budget', 'domestic gross', 'actors', '1st weekend gross',
    'widest release', 'producers', 'directors',
]
# Keep the old index (the dict keys) as a regular 'title' column, then number
# the rows 0..n-1.
df2 = df2.assign(title=df2.index).reset_index(drop=True)

In [None]:
df2.to_pickle('movies_2.pkl')


## Miscellaneous

In [None]:
def get_genres(soup):
    """Return the genres listed on a boxofficemojo.com movie page as a list.

    Two page layouts are tried in order:

    1. a "Genres" label whose parent is followed by a table with one genre
       per row (the first row is a header and is skipped);
    2. a single "Genre" label followed by a sibling element holding the genre.

    Returns ``["N/A"]`` when neither layout is found.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Genres"))
        rows = label.find_parent().find_next_sibling().find_all("tr")
        return [row.td.font.a.text for row in rows[1:]]
    except (AttributeError, LookupError):
        # A missing element makes find() return None, so the next attribute
        # access raises AttributeError; fall through to the single-genre layout.
        pass
    try:
        label = soup.find(id="body").find(text=re.compile("Genre"))
        return [label.find_next_sibling().text]
    except (AttributeError, LookupError):
        return ["N/A"]


def get_title(soup):
    """Return the movie title taken from the page's <title> tag.

    Everything from the last "(" onwards is dropped and surrounding whitespace
    is stripped.  Returns "N/A" when the page has no <title> tag.
    """
    try:
        return soup.find("title").text.rsplit("(", 1)[0].strip()
    except (AttributeError, LookupError):
        # find() returns None for a missing tag, so .text raises AttributeError.
        return "N/A"


def get_release_date(soup):
    """Return the release date of a boxofficemojo.com movie page as a datetime.

    The value next to the "Release Date" label is parsed with the format
    "%B %d, %Y" (e.g. "June 5, 2015").  Returns "N/A" when the label is missing
    or the value is not in that format.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Release Date"))
        raw_date = label.find_next_sibling().text
        return datetime.strptime(raw_date, "%B %d, %Y")
    except (AttributeError, LookupError, ValueError):
        # AttributeError: an element was not found (find() returned None).
        # ValueError: the text is not a full date.
        return "N/A"


def get_distributor(soup):
    """Return the distributor text from a boxofficemojo.com movie page.

    Reads the element that follows the "Distributor" label inside the element
    with id "body".  Returns "N/A" when the label or its value is missing.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Distributor"))
        return label.find_next_sibling().text
    except (AttributeError, LookupError):
        # find() returns None for a missing element -> AttributeError.
        return "N/A"


def get_rating(soup):
    """Return the MPAA rating text from a boxofficemojo.com movie page.

    Reads the element that follows the "MPAA Rating" label inside the element
    with id "body".  Returns "N/A" when the label or its value is missing.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("MPAA Rating"))
        return label.find_next_sibling().text
    except (AttributeError, LookupError):
        # find() returns None for a missing element -> AttributeError.
        return "N/A"


def get_runtime(soup):
    """Return the runtime of a movie in minutes as an int.

    The value next to the "Runtime" label is expected to look like
    "1 hrs. 49 min."; either part may be absent ("45 min.").  Returns "N/A"
    when the label is missing or the text contains neither hours nor minutes.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Runtime"))
        runtime = label.find_next_sibling().text
    except (AttributeError, LookupError):
        # find() returns None for a missing element -> AttributeError.
        return "N/A"

    hours = re.search(r"(\d+)\s*hrs?\.", runtime)
    minutes = re.search(r"(\d+)\s*min", runtime)
    if hours is None and minutes is None:
        return "N/A"

    total = 0
    if hours is not None:
        total += int(hours.group(1)) * 60
    if minutes is not None:
        total += int(minutes.group(1))
    return total


def get_budget(soup):
    """Return the production budget of a movie in dollars as an int.

    The value next to the "Production Budget" label may be written in millions
    ("$45 million", "$62.5 million") or as a plain amount ("$500,000").
    Returns "N/A" when the page says "N/A", the label is missing, or the
    amount cannot be parsed.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Production Budget"))
        raw_budget = label.find_next_sibling().text.strip()
    except (AttributeError, LookupError):
        # find() returns None for a missing element -> AttributeError.
        return "N/A"

    if raw_budget == "N/A":
        return raw_budget

    amount = raw_budget.replace("$", "").replace(",", "")
    multiplier = 1
    if "million" in amount:
        amount = amount.split("million")[0]
        multiplier = 1000000
    try:
        # float() accepts fractional millions; round() removes float noise
        # before the result is turned back into an int.
        return int(round(float(amount) * multiplier))
    except ValueError:
        return "N/A"


def get_domestic_gross(soup):
    """Return the domestic gross of a movie in dollars as an int.

    Two page layouts are tried in order:

    1. a "Domestic Total Gross: " label followed by a sibling holding the value;
    2. a "Domestic:" label whose parent is followed by the value.

    Returns "N/A" when neither layout yields a parseable amount.
    """
    def _to_int(text):
        # "$1,234,567" -> 1234567
        return int(text.replace("$", "").replace(",", "").strip())

    try:
        label = soup.find(id="body").find(text=re.compile("Domestic Total Gross: "))
        return _to_int(label.find_next_sibling().text)
    except (AttributeError, LookupError, ValueError):
        # Label missing (find() returned None) or value not numeric: try the
        # alternative layout.
        pass
    try:
        label = soup.find(id="body").find(text=re.compile("Domestic:"))
        return _to_int(label.find_parent().find_next_sibling().text)
    except (AttributeError, LookupError, ValueError):
        return "N/A"
        
        
def get_movie_value_next(soup, field_name):
    '''Look up a labelled value in boxofficemojo HTML.

    Finds the first text node matching the regular expression `field_name`
    and returns the `.text` of the object reached through that node's
    `.next` attribute.  Returns None when no text node matches.
    '''
    label = soup.find(text=re.compile(field_name))
    return label.next.text if label else None

def get_actors(soup):
    """Return the actor names found next to the "Actor" label as a list.

    The raw text is split by inserting a comma wherever a lower-case letter or
    a parenthesis is directly followed by an upper-case letter; the characters
    "(", "*" and "'" are then removed.  Names are stripped of surrounding
    whitespace and empty fragments are dropped.  Returns an empty list when
    the page has no "Actor" field.

    NOTE(review): the case-change rule also splits names with an internal
    capital (e.g. "McGregor" becomes "Mc" and "Gregor").
    """
    actor = get_movie_value_next(soup, 'Actor')
    if actor is None:
        return []
    # Raw strings: "\g" is not a valid escape in a normal string literal.
    separated = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
    cleaned = re.sub("[(*']", '', separated)
    return [name.strip() for name in cleaned.split(",") if name.strip()]


def money_to_int(moneystring):
    """Convert a dollar string such as "$1,234,567" to an int.

    Dollar signs and thousands separators are removed before conversion.
    """
    digits = moneystring.translate(str.maketrans('', '', '$,'))
    return int(digits)

def get_opening_weekend(soup):
    """Return the opening-weekend gross in dollars as an int.

    Returns "N/A" (the same sentinel the other get_* helpers use) when the
    page has no "Opening Weekend" field or its value is not a dollar amount.
    """
    raw_gross = get_movie_value_next(soup, 'Opening Weekend')
    if raw_gross is None:
        return "N/A"
    try:
        return money_to_int(raw_gross)
    except ValueError:
        return "N/A"
    
    
def scrape_movie_data(movie_list, start=0, end=20000):
    """Scrape boxofficemojo.com pages and return a dict keyed by movie title.

    Only the entries of `movie_list` at positions ``start <= i < end`` are
    fetched.  Each value is a list holding, in order: genres (a list), release
    date, distributor, runtime, MPAA rating, budget, domestic gross, actors
    and opening-weekend gross.

    A movie whose page cannot be fetched or parsed is logged and skipped.
    Movies that share a title overwrite each other because the title is the
    dict key.
    """
    movie_data_list = {}
    for position, movie in enumerate(movie_list):
        if position >= end:
            break
        if position < start:
            continue

        url = "http://www.boxofficemojo.com/" + movie
        try:
            response = requests.get(url)
            time.sleep(0.1)  # short pause between requests
            soup = BeautifulSoup(response.text, "lxml")
            print(movie)
            movie_data_list[get_title(soup)] = [
                get_genres(soup), get_release_date(soup),
                get_distributor(soup), get_runtime(soup),
                get_rating(soup), get_budget(soup),
                get_domestic_gross(soup), get_actors(soup),
                get_opening_weekend(soup),
            ]
        except Exception:
            # Keep the run going, but leave a trace of what failed instead of
            # silently dropping the movie.
            logging.exception("Failed to scrape %s", url)

    return movie_data_list


#def main():
#    pass

#if __name__ == "__main__":
#    main()

In [None]:
movie_data_subset = scrape_movie_data(all_movies_subset, start=0, end=20000)

In [None]:
movie_data_subset

In [None]:
df = pd.DataFrame(movie_data_subset)

In [None]:
df = df.transpose()

In [None]:
# Collect the movie links from the first "A" page of the alphabetical index.
movies_list_A = []
url = 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&page=1'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
rows = soup.find(id = 'body').find('table').find('table').find_all('table')[1].find_all("tr")
# skip index row
for row in rows[1:]:
    link = row.td.font.a['href']
    # don't add duplicates (check the list that is being built here)
    if link not in movies_list_A:
        movies_list_A.append(link)

In [None]:
def movie_actors(soup):
    """Return the actors found next to the "Actor" label as a list of strings.

    A comma is inserted wherever a lower-case letter or a parenthesis is
    directly followed by an upper-case letter.  If the result contains "*" it
    is split on "*"; otherwise "(", "*" and "'" are removed and the text is
    split on ",".  Returns NaN when the lookup yields no text (re.sub raises
    TypeError on None).
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor')
        # Raw strings: "\g" is not a valid escape in a normal string literal.
        v = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
        if "*" in v:
            actors = v.split('*')
        else:
            actors = re.sub("[(*']", '', v)
            actors = actors.split(",")
    except TypeError:
        actors = float('NaN')
    return actors

In [None]:
def get_movie_value_next_next(soup, field_name):
    '''Look up a labelled value in boxofficemojo HTML, two steps ahead.

    Finds the first text node matching the regular expression `field_name`,
    follows `.next` twice from it and returns the `.text` of the object
    reached.  Returns None when no text node matches.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    return label.next.next.text

In [None]:
def get_movie_value_next(soup, field_name):
    '''Look up a labelled value in boxofficemojo HTML.

    Finds the first text node matching the regular expression `field_name`
    and returns the text of its next sibling (this works for most of the
    values).  Returns None when the label is not found or has no next sibling.
    '''
    label = soup.find(text=re.compile(field_name))
    sibling = label.findNextSibling() if label else None
    return sibling.text if sibling else None

In [None]:
def release_date(soup):
    """Return the parsed release date of a movie page, or NaN.

    NaN is returned when the raw value is 'TBD' or 'N/A', when its first word
    is a season name (no exact date), or when an AttributeError is raised
    while reading or converting the value.
    """
    seasons = ('Spring', 'Summer', 'Fall', 'Winter')
    try:
        raw_release_date = get_movie_value(soup,'Release Date')
        if (raw_release_date == 'TBD' or raw_release_date == 'N/A'
                or raw_release_date.split()[0] in seasons):
            return float('NaN')
        return to_date(raw_release_date)
    except AttributeError:
        return float('NaN')

In [None]:
!pwd

In [None]:
def get_actors(soup):
    """Return the actor names found next to the "Actor" label as a list.

    A comma is inserted wherever a lower-case letter or a parenthesis is
    directly followed by an upper-case letter, the resulting "(, Voice)"
    markers are removed, and the characters "(", "*" and "'" are dropped.
    Names are stripped of surrounding whitespace and empty fragments are
    discarded.  Returns an empty list when the page has no "Actor" field.
    """
    actor = get_movie_value_next(soup, 'Actor')
    if actor is None:
        return []
    # Raw strings: "\g" is not a valid escape in a normal string literal.
    separated = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
    separated = separated.replace('(, Voice)', '')
    cleaned = re.sub("[(*']", '', separated)
    return [name.strip() for name in cleaned.split(",") if name.strip()]

In [None]:
def movie_actors(soup):
    """Return the actors found next to the "Actor" label as a list of strings.

    A comma is inserted wherever a lower-case letter or a parenthesis is
    directly followed by an upper-case letter.  If the result contains "*" it
    is split on "*"; otherwise "(", "*" and "'" are removed and the text is
    split on ",".  Returns NaN when the lookup yields no text (re.sub raises
    TypeError on None).
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor')
        # Raw strings: "\g" is not a valid escape in a normal string literal.
        v = re.sub(r'([a-z()])([A-Z])', r'\g<1>, \g<2>', actor)
        if "*" in v:
            actors = v.split('*')
        else:
            actors = re.sub("[(*']", '', v)
            actors = actors.split(",")
    except TypeError:
        actors = float('NaN')
    return actors

In [None]:
#url = 'http://www.boxofficemojo.com/movies/?id=disney2016.htm'
url = 'http://www.boxofficemojo.com/movies/?id=ghostbusters2016.htm'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

In [None]:
get_movie_value_next_next(soup, 'Actor')

In [None]:
actor = get_movie_value_next_next(soup,'Actor')

In [None]:
actor

In [None]:
def movie_actors(soup): # still doesn't deal well with McPeople
    """Return the actors found next to the "Actor" label as a list of strings.

    "*" markers are removed, a comma is inserted wherever a lower-case letter
    or a parenthesis is directly followed by an upper-case letter, the
    resulting " (,Voice)" / " (,Cameo)" markers are dropped, "(", "*" and "'"
    are removed, and the text is split on ",".  Returns NaN when the page has
    no "Actor" field.
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor') # try get_movie_value_next if it doesnt work
        actor = actor.replace('*', '')
        # Raw strings: "\g" is not a valid escape in a normal string literal.
        v = re.sub(r'([a-z()])([A-Z])', r'\g<1>,\g<2>', actor)
        v = v.replace(' (,Voice)', '')
        v = v.replace(' (,Cameo)', '')
        actors = re.sub("[(*']", '', v)
        actors = actors.split(',')
    except (TypeError, AttributeError):
        # A missing field gives None, and None.replace raises AttributeError
        # (not TypeError), so both must be caught to reach the NaN fallback.
        actors = float('NaN')
    return actors

In [None]:
# startswith()
# Mc, De, van, Mac, Du, Le

In [None]:
print (movie_actors(soup))

In [None]:
actor = get_movie_value_next_next(soup, 'Actor')

In [None]:
actor = actor.replace('*','')

In [None]:
actor

In [None]:
s = actor.replace('(Cameo)','')

In [None]:
v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)

In [None]:
v

In [None]:
#url = 'http://www.boxofficemojo.com/movies/?id=disney2016.htm'
#url = 'http://www.boxofficemojo.com/movies/?id=ghostbusters2016.htm'
#url = "http://www.boxofficemojo.com/movies/?id=13goingon30.htm"
url = "http://www.boxofficemojo.com/movies/?id=9dot99.htm"
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

In [None]:
def get_movie_value_next_next(soup, field_name):
    '''Look up a labelled value in boxofficemojo HTML, two steps ahead.

    Every <br> in `soup` is first replaced, in place, by a newline string.
    Then the first text node matching the regular expression `field_name` is
    located, `.next` is followed twice from it, and the `.text` of the object
    reached is returned.  Returns None when no text node matches.
    '''
    # Modifies the soup that was passed in.
    for line_break in soup.findAll('br'):
        line_break.replace_with('\n')

    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    return label.next.next.text

In [None]:
actor = get_movie_value_next_next(soup, 'Actor')

In [None]:
actor.replace('*','').replace(' (Cameo)','').replace('(Voice)','').replace('\n',', ')

In [None]:
def movie_actors(soup):
    """Return the actors found next to the "Actor" label as a list of strings.

    "*", " (Cameo)" and "(Voice)" markers are removed, line breaks are turned
    into ", " and the text is split on ", ".  Returns None when the lookup
    yields no value (or the literal text "None"), or when a TypeError is
    raised during the lookup or clean-up.

    NOTE(review): the earlier remark about "McPersons, DeVitos, DaSouzas"
    looks inherited from the regex-based versions; this version no longer
    splits on capital letters.
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor') # try get_movie_value_next if it doesnt work
        if str(actor) == 'None':
            return None
        cleaned = (actor.replace('*', '').replace(' (Cameo)', '')
                   .replace('(Voice)', '').replace('\n', ', '))
        return cleaned.split(", ")
    except TypeError:
        # Nothing has been built at this point, so there is no list to return.
        return None

In [None]:
def movie_actors(soup):
    """Return the actors found next to the "Actor" label as a list of strings.

    "*", " (Cameo)" and "(Voice)" markers are removed, line breaks are turned
    into ", " and the text is split on ", ".  Returns None when the lookup
    yields no value (or the literal text "None"), or when a TypeError is
    raised during the lookup or clean-up.
    """
    try:
        actor = get_movie_value_next_next(soup, 'Actor') # try get_movie_value_next if it doesnt work
        if str(actor) == 'None':
            return None
        cleaned = (actor.replace('*', '').replace(' (Cameo)', '')
                   .replace('(Voice)', '').replace('\n', ', '))
        return cleaned.split(", ")
    except TypeError:
        # Nothing has been built at this point, so there is no list to return.
        return None

In [None]:
print(movie_actors(soup))

In [None]:
opening_weekend_gross(soup)

In [None]:
#fix this here. ARGH!
#def money_to_int_2(moneystring):
#    if moneystring != 'N/A':
#        moneystring = moneystring.replace('$', '').replace('.','').replace(',','').replace(' million', '000000')
#        return int(moneystring)

#def money_to_int_2(moneystring):
#    if moneystring != 'N/A':
#        moneystring = moneystring.replace('$','').replace(',','')
#        if 'million' in moneystring:
#            moneystring.replace('million','')
#            return float(moneystring)*1000000
#        else:
#            return float(moneystring)

In [None]:
#def get_movie_value_next_next(soup, field_name):
#    '''Grab a value from boxofficemojo HTML
#    
#    Takes a string attribute of a movie on the page and
#    returns the string in the next sibling object
#    (the value for that attribute)
#    or None if nothing is found.
#    '''
#    obj = soup.find(text=re.compile(field_name))
#    
#    if obj:
#        return obj.next.next.text
#    else:
#        return None

def get_movie_value_next_next(soup, field_name):
    '''Look up a labelled value in boxofficemojo HTML, two steps ahead.

    Side effect: every <br> in `soup` is replaced in place by a newline
    string before the search.

    Returns the `.text` of the object reached by following `.next` twice from
    the first text node that matches the regular expression `field_name`, or
    None when no text node matches.
    '''
    for br_tag in soup.findAll('br'):
        br_tag.replace_with('\n')

    match = soup.find(text=re.compile(field_name))
    return match.next.next.text if match else None

In [None]:
#def movie_actors(soup):
#    try:
#        actor = get_movie_value_next(soup,'Actor')
#        v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
#        actors = re.sub('[(*\']','', v)
#        actors = actors.split(",") 
#    except TypeError:
#        actors = float('NaN')
#
#    return actors

#def movie_actors(soup):
#    try:
#        actor = get_movie_value_next(soup,'Actor')
#        if '*' in actor:
#            actors =  actor.split('*')
#        else:
#            v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
#            actors = re.sub('[(*\']','', v)
#            actors = actors.split(",") 
#    except TypeError:
#        actors = float('NaN')
#    return actors

#def movie_actors(soup): # still doesn't deal well with McPersons, DeVitos, DaSouzas, etc. 
#    try:
#        actor = get_movie_value_next_next(soup,'Actor') # try get_movie_value_next if it doesnt work 
#        v = re.sub('([a-z()])([A-Z])', '\g<1>, \g<2>', actor)
#        v = v.replace('(, Voice)','')
#        v = v.replace('(, Cameo)', '')
#        if "*" in v:
#            actors = v.split('*')
#        else:
#            actors = re.sub('[(*\']','', v)
#            actors = actors.split(",")
#
#    except TypeError:
#        actors = float('NaN')
#    return actors

#def movie_actors(soup): # still doesn't deal well with McPersons, DeVitos, DaSouzas, etc. 
#    try:
#        actor = get_movie_value_next_next(soup,'Actor') # try get_movie_value_next if it doesnt work 
#        actors = actor.replace('*','').replace(' (Cameo)','').replace('(Voice)','').replace('\n',', ')
#        #v = v.replace('(Voice)','')
#        #v = v.replace('(Cameo)', '')
#        #v = v.replace('\n','')
#        #if "*" in v:
#        #    actors = v.split('*')
#        #else:
#        #    actors = re.sub('[(*\']','', v)
#        actors = actors.split(", ")
#
#    except TypeError:
#        actors = float('NaN')
#    return actors
