The code in this notebook reads in a list of dictionaries that each contain:
- Movie Title
- Studio
- Opening
- Budget
- Earliest Release Date
- MPAA Rating
- Runtime
- Genres
- Domestic Gross
- International Gross

The data are packaged into a pandas data frame and linear regression models were generated to evaluate influence of these features on Domestic Gross earnings.





In [53]:
import requests
import pickle
import pandas as pd
import numpy as np
import matplotlib as plt

In [54]:
# Read in the list of dictionaries with movie data
with open('movies_data_raw.pickle','rb') as read_file:
    movies_raw = pickle.load(read_file)

In [55]:
# convert to a data frame
movies_df = pd.DataFrame(movies_raw)
movies_df

Unnamed: 0,Title,Studio,Opening,Budget,Release,Rating,Runtime,Genre,Domestic,International
0,Toy Story 4,Walt Disney Studios Motion Pictures,"$120,908,065","$200,000,000","June 20, 2019",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$434,038,008","$639,356,585"
1,The Lion King,Walt Disney Studios Motion Pictures,"$1,586,753","$45,000,000","June 15, 1994",G,1 hr 28 min,"[Adventure, Animation, Drama, Family, Musical]","$422,783,777","$545,728,028"
2,Toy Story 3,Walt Disney Studios Motion Pictures,"$110,307,189","$200,000,000","June 16, 2010",na,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$415,004,880","$651,964,823"
3,Finding Nemo,Walt Disney Studios Motion Pictures,"$70,251,710","$94,000,000","May 30, 2003",na,1 hr 40 min,"[Adventure, Animation, Comedy, Family]","$380,843,261","$559,506,933"
4,"Monsters, Inc.",Walt Disney Studios Motion Pictures,"$62,577,067","$115,000,000","November 2, 2001",G,1 hr 32 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$289,916,256","$289,064,814"
...,...,...,...,...,...,...,...,...,...,...
3358,Three Days of the Condor,Paramount Pictures,na,na,"September 24, 1975",na,1 hr 57 min,"[Mystery, Thriller]","$27,476,252","$27,476,252"
3359,Friday,New Line Cinema,"$6,589,341","$3,500,000","April 28, 1995",R,1 hr 31 min,"[Comedy, Drama]","$27,467,564","$748,354"
3360,The Brothers,Screen Gems,"$10,302,846","$6,000,000","March 23, 2001",R,1 hr 46 min,"[Comedy, Drama]","$27,457,409","$500,782"
3361,Midsommar,A24,"$6,560,030","$9,000,000","July 3, 2019",R,2 hr 28 min,"[Drama, Horror, Mystery, Thriller]","$27,426,361","$20,476,738"


In [56]:
# A lot of movies don't have MPAA rating listed
movies_df['Rating'].value_counts()

PG-13       932
na          859
R           817
PG          657
G            97
Approved      1
Name: Rating, dtype: int64

In [57]:
# replace values that don't correspond to MPAA ratings with rating from previous row
for index, row in movies_df.iterrows():
    if row['Rating'] == 'G' or row['Rating'] == 'PG' or row['Rating'] == 'PG-13' or row['Rating'] == 'R':
        prev_rating = row['Rating']
        pass
    else:
        row['Rating'] = prev_rating

In [58]:
movies_df

Unnamed: 0,Title,Studio,Opening,Budget,Release,Rating,Runtime,Genre,Domestic,International
0,Toy Story 4,Walt Disney Studios Motion Pictures,"$120,908,065","$200,000,000","June 20, 2019",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$434,038,008","$639,356,585"
1,The Lion King,Walt Disney Studios Motion Pictures,"$1,586,753","$45,000,000","June 15, 1994",G,1 hr 28 min,"[Adventure, Animation, Drama, Family, Musical]","$422,783,777","$545,728,028"
2,Toy Story 3,Walt Disney Studios Motion Pictures,"$110,307,189","$200,000,000","June 16, 2010",G,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$415,004,880","$651,964,823"
3,Finding Nemo,Walt Disney Studios Motion Pictures,"$70,251,710","$94,000,000","May 30, 2003",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family]","$380,843,261","$559,506,933"
4,"Monsters, Inc.",Walt Disney Studios Motion Pictures,"$62,577,067","$115,000,000","November 2, 2001",G,1 hr 32 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$289,916,256","$289,064,814"
...,...,...,...,...,...,...,...,...,...,...
3358,Three Days of the Condor,Paramount Pictures,na,na,"September 24, 1975",R,1 hr 57 min,"[Mystery, Thriller]","$27,476,252","$27,476,252"
3359,Friday,New Line Cinema,"$6,589,341","$3,500,000","April 28, 1995",R,1 hr 31 min,"[Comedy, Drama]","$27,467,564","$748,354"
3360,The Brothers,Screen Gems,"$10,302,846","$6,000,000","March 23, 2001",R,1 hr 46 min,"[Comedy, Drama]","$27,457,409","$500,782"
3361,Midsommar,A24,"$6,560,030","$9,000,000","July 3, 2019",R,2 hr 28 min,"[Drama, Horror, Mystery, Thriller]","$27,426,361","$20,476,738"


In [60]:
# In some cases, international earnings were not defined
# and the scraping algorithm reported international earnings as equal to domestic earnings
# For cases where International Gross == Domestic Gross, replace International Gross with na
for index, row in movies_df.iterrows():
    if row['Domestic'] == row['International']:
        row['International'] = 'na'
    else:
        pass
movies_df

Unnamed: 0,Title,Studio,Opening,Budget,Release,Rating,Runtime,Genre,Domestic,International
0,Toy Story 4,Walt Disney Studios Motion Pictures,"$120,908,065","$200,000,000","June 20, 2019",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$434,038,008","$639,356,585"
1,The Lion King,Walt Disney Studios Motion Pictures,"$1,586,753","$45,000,000","June 15, 1994",G,1 hr 28 min,"[Adventure, Animation, Drama, Family, Musical]","$422,783,777","$545,728,028"
2,Toy Story 3,Walt Disney Studios Motion Pictures,"$110,307,189","$200,000,000","June 16, 2010",G,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$415,004,880","$651,964,823"
3,Finding Nemo,Walt Disney Studios Motion Pictures,"$70,251,710","$94,000,000","May 30, 2003",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family]","$380,843,261","$559,506,933"
4,"Monsters, Inc.",Walt Disney Studios Motion Pictures,"$62,577,067","$115,000,000","November 2, 2001",G,1 hr 32 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$289,916,256","$289,064,814"
...,...,...,...,...,...,...,...,...,...,...
3358,Three Days of the Condor,Paramount Pictures,na,na,"September 24, 1975",R,1 hr 57 min,"[Mystery, Thriller]","$27,476,252",na
3359,Friday,New Line Cinema,"$6,589,341","$3,500,000","April 28, 1995",R,1 hr 31 min,"[Comedy, Drama]","$27,467,564","$748,354"
3360,The Brothers,Screen Gems,"$10,302,846","$6,000,000","March 23, 2001",R,1 hr 46 min,"[Comedy, Drama]","$27,457,409","$500,782"
3361,Midsommar,A24,"$6,560,030","$9,000,000","July 3, 2019",R,2 hr 28 min,"[Drama, Horror, Mystery, Thriller]","$27,426,361","$20,476,738"


In [61]:
# Parse genres out into separate columns
genre_df = pd.get_dummies(movies_df['Genre'].apply(pd.Series).stack()).sum(level=0)
movies_df = pd.concat([movies_df, genre_df], axis=1)

# Drop Genre column
df_combined.drop(columns=['DATE','TIME'], inplace=True, axis=1)
movies_df

Unnamed: 0,Title,Studio,Opening,Budget,Release,Rating,Runtime,Genre,Domestic,International,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Toy Story 4,Walt Disney Studios Motion Pictures,"$120,908,065","$200,000,000","June 20, 2019",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$434,038,008","$639,356,585",...,0,0,0,0,0,0,0,0,0,0
1,The Lion King,Walt Disney Studios Motion Pictures,"$1,586,753","$45,000,000","June 15, 1994",G,1 hr 28 min,"[Adventure, Animation, Drama, Family, Musical]","$422,783,777","$545,728,028",...,1,0,0,0,0,0,0,0,0,0
2,Toy Story 3,Walt Disney Studios Motion Pictures,"$110,307,189","$200,000,000","June 16, 2010",G,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$415,004,880","$651,964,823",...,0,0,0,0,0,0,0,0,0,0
3,Finding Nemo,Walt Disney Studios Motion Pictures,"$70,251,710","$94,000,000","May 30, 2003",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family]","$380,843,261","$559,506,933",...,0,0,0,0,0,0,0,0,0,0
4,"Monsters, Inc.",Walt Disney Studios Motion Pictures,"$62,577,067","$115,000,000","November 2, 2001",G,1 hr 32 min,"[Adventure, Animation, Comedy, Family, Fantasy]","$289,916,256","$289,064,814",...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3358,Three Days of the Condor,Paramount Pictures,na,na,"September 24, 1975",R,1 hr 57 min,"[Mystery, Thriller]","$27,476,252",na,...,0,1,0,0,0,0,0,1,0,0
3359,Friday,New Line Cinema,"$6,589,341","$3,500,000","April 28, 1995",R,1 hr 31 min,"[Comedy, Drama]","$27,467,564","$748,354",...,0,0,0,0,0,0,0,0,0,0
3360,The Brothers,Screen Gems,"$10,302,846","$6,000,000","March 23, 2001",R,1 hr 46 min,"[Comedy, Drama]","$27,457,409","$500,782",...,0,0,0,0,0,0,0,0,0,0
3361,Midsommar,A24,"$6,560,030","$9,000,000","July 3, 2019",R,2 hr 28 min,"[Drama, Horror, Mystery, Thriller]","$27,426,361","$20,476,738",...,0,1,0,0,0,0,0,1,0,0


In [78]:
# Convert Release date to datetime object
movies_df['Date'] = pd.to_datetime(movies_df['Release'], errors='coerce')

In [None]:
# Set Studio to dummy variable

# First get a list of studios that account for 90% of movies in the df
studio_counts = movies_df['Studio'].value_counts().reset_index()
top_studios = []
count = 0
i=0
while count < 0.9*len(movies_df):
    top_studios.append(studio_counts['index'].iloc[i])
    count+= studio_counts['Studio'].iloc[i]
    i+=1

    





