In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import gzip
import sqlite3
import csv
import numpy as np
import seaborn as sns

In [3]:
# Dataset 1: 'bom.movie_gross' from Box Office Mojo.
# The 'title' column is exposed as 'movie', the key used for the later merges.
df1 = (
    pd.read_csv('data/zippedData/bom.movie_gross.csv.gz')
    .rename(columns={'title': 'movie'})
)

In [4]:
# IMDb title basics; 'primary_title' is exposed as 'movie' so it can be
# used as the merge key alongside the other datasets.
df4 = (
    pd.read_csv('data/zippedData/imdb.title.basics.csv.gz')
    .rename(columns={'primary_title': 'movie'})
)

In [5]:
# Movie budgets dataset ('tn.movie_budgets').
df11 = pd.read_csv('data/zippedData/tn.movie_budgets.csv.gz')

# The three money columns arrive as text: drop the first character
# (presumably the currency sign -- matches the '$'-free values shown below),
# strip the thousands separators and store the amounts as 64-bit integers.
for money_col in ('worldwide_gross', 'domestic_gross', 'production_budget'):
    df11[money_col] = (
        df11[money_col].str[1:].str.replace(',', '').astype(np.int64)
    )

# Keep only films with a budget of at least 1,000,000 and with a recorded
# (non-zero) domestic and worldwide gross.
keep_rows = (
    (df11.production_budget >= 1000000)
    & (df11.domestic_gross >= 1)
    & (df11.worldwide_gross >= 1)
)
df11 = df11[keep_rows]

# Display only: the result is not assigned, so 'movie' stays a regular column.
df11.set_index('movie')

Unnamed: 0_level_0,id,release_date,production_budget,domestic_gross,worldwide_gross
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,1,"Dec 18, 2009",425000000,760507625,2776345279
Pirates of the Caribbean: On Stranger Tides,2,"May 20, 2011",410600000,241063875,1045663875
Dark Phoenix,3,"Jun 7, 2019",350000000,42762350,149762350
Avengers: Age of Ultron,4,"May 1, 2015",330600000,459005868,1403013963
Star Wars Ep. VIII: The Last Jedi,5,"Dec 15, 2017",317000000,620181382,1316721747
...,...,...,...,...,...
Sparkler,95,"Mar 19, 1999",1000000,5494,5494
The Helixâ¦Loaded,96,"Mar 18, 2005",1000000,3700,3700
Childless,97,"May 15, 2015",1000000,1036,1036
In Her Line of Fire,98,"Apr 21, 2006",1000000,884,884


In [6]:
# Join the three datasets on the movie title, then discard the columns that
# are not needed downstream ('domestic_gross_y' is the duplicate gross column
# contributed by df1; the df11 version survives as 'domestic_gross_x').
unused_columns = ['tconst', 'original_title', 'start_year', 'id', 'domestic_gross_y']
budgets_with_gross = df11.merge(df1, on='movie')
BigFrame = budgets_with_gross.merge(df4, on='movie').drop(columns=unused_columns)

In [7]:
# Inspect column dtypes and non-null counts of the freshly merged frame.
BigFrame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1533 entries, 0 to 1532
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   release_date       1533 non-null   object 
 1   movie              1533 non-null   object 
 2   production_budget  1533 non-null   int64  
 3   domestic_gross_x   1533 non-null   int64  
 4   worldwide_gross    1533 non-null   int64  
 5   studio             1533 non-null   object 
 6   foreign_gross      1345 non-null   object 
 7   year               1533 non-null   int64  
 8   runtime_minutes    1442 non-null   float64
 9   genres             1510 non-null   object 
dtypes: float64(1), int64(4), object(5)
memory usage: 131.7+ KB


In [8]:
# Keep the first row for each movie title and index the frame by title.
BigFrame = BigFrame.drop_duplicates(subset='movie')
BigFrame = BigFrame.set_index('movie')

# Release month = first three characters of 'release_date' (e.g. 'Dec').
BigFrame['month'] = BigFrame['release_date'].str.slice(0, 3)

In [9]:
# Re-check dtypes and non-null counts after de-duplicating and adding 'month'.
BigFrame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1133 entries, Pirates of the Caribbean: On Stranger Tides to Camp X-Ray
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   release_date       1133 non-null   object 
 1   production_budget  1133 non-null   int64  
 2   domestic_gross_x   1133 non-null   int64  
 3   worldwide_gross    1133 non-null   int64  
 4   studio             1133 non-null   object 
 5   foreign_gross      994 non-null    object 
 6   year               1133 non-null   int64  
 7   runtime_minutes    1117 non-null   float64
 8   genres             1128 non-null   object 
 9   month              1133 non-null   object 
dtypes: float64(1), int64(4), object(5)
memory usage: 97.4+ KB


In [10]:
# Discard every row that still has a missing value in any column.
# One dropna() is sufficient; a second pass over the result removes nothing.
BigFrame = BigFrame.dropna()

In [11]:
# Confirm that no column has missing values after dropna().
BigFrame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 977 entries, Pirates of the Caribbean: On Stranger Tides to Patti Cake$
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   release_date       977 non-null    object 
 1   production_budget  977 non-null    int64  
 2   domestic_gross_x   977 non-null    int64  
 3   worldwide_gross    977 non-null    int64  
 4   studio             977 non-null    object 
 5   foreign_gross      977 non-null    object 
 6   year               977 non-null    int64  
 7   runtime_minutes    977 non-null    float64
 8   genres             977 non-null    object 
 9   month              977 non-null    object 
dtypes: float64(1), int64(4), object(5)
memory usage: 84.0+ KB


In [9]:
# Month abbreviations in calendar order (Jan-Dec); used to order months on
# plots and as the category order for the 'month' column.
months = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()

In [10]:
# One sub-frame per release year, 2013 through 2018.
df_2013, df_2014, df_2015, df_2016, df_2017, df_2018 = (
    BigFrame[BigFrame['year'] == release_year]
    for release_year in range(2013, 2019)
)

In [11]:
# Keep only the columns needed for the 2013 monthly averages.
df_2013 = df_2013[['worldwide_gross', 'month', 'production_budget']]
df_2013 = df_2013.dropna(subset=['month'])

# One Series per calendar month (Jan..Dec, following `months`) holding the mean
# worldwide gross and mean production budget of that month's 2013 releases,
# truncated to int64.
# numeric_only=True keeps the text 'month' column out of the reduction; without
# it pandas >= 2.0 raises TypeError instead of silently dropping the column.
# NOTE: a month with no releases yields NaN, which astype(np.int64) rejects.
(df_jan2013, df_feb2013, df_mar2013, df_apr2013, df_may2013, df_jun2013,
 df_jul2013, df_aug2013, df_sep2013, df_oct2013, df_nov2013, df_dec2013) = (
    df_2013[df_2013['month'] == month_abbr].mean(numeric_only=True).astype(np.int64)
    for month_abbr in months
)

In [12]:
# Keep only the columns needed for the 2014 monthly averages.
df_2014 = df_2014[['worldwide_gross', 'month', 'production_budget']]
df_2014 = df_2014.dropna(subset=['month'])

# One Series per calendar month (Jan..Dec, following `months`) holding the mean
# worldwide gross and mean production budget of that month's 2014 releases,
# truncated to int64.
# numeric_only=True keeps the text 'month' column out of the reduction; without
# it pandas >= 2.0 raises TypeError instead of silently dropping the column.
# NOTE: a month with no releases yields NaN, which astype(np.int64) rejects.
(df_jan2014, df_feb2014, df_mar2014, df_apr2014, df_may2014, df_jun2014,
 df_jul2014, df_aug2014, df_sep2014, df_oct2014, df_nov2014, df_dec2014) = (
    df_2014[df_2014['month'] == month_abbr].mean(numeric_only=True).astype(np.int64)
    for month_abbr in months
)

In [13]:
# Keep only the columns needed for the 2015 monthly averages.
df_2015 = df_2015[['worldwide_gross', 'month', 'production_budget']]
df_2015 = df_2015.dropna(subset=['month'])

# One Series per calendar month (Jan..Dec, following `months`) holding the mean
# worldwide gross and mean production budget of that month's 2015 releases,
# truncated to int64.
# numeric_only=True keeps the text 'month' column out of the reduction; without
# it pandas >= 2.0 raises TypeError instead of silently dropping the column.
# NOTE: a month with no releases yields NaN, which astype(np.int64) rejects.
(df_jan2015, df_feb2015, df_mar2015, df_apr2015, df_may2015, df_jun2015,
 df_jul2015, df_aug2015, df_sep2015, df_oct2015, df_nov2015, df_dec2015) = (
    df_2015[df_2015['month'] == month_abbr].mean(numeric_only=True).astype(np.int64)
    for month_abbr in months
)

In [14]:
# Keep only the columns needed for the 2016 monthly averages.
df_2016 = df_2016[['worldwide_gross', 'month', 'production_budget']]
df_2016 = df_2016.dropna(subset=['month'])

# One Series per calendar month (Jan..Dec, following `months`) holding the mean
# worldwide gross and mean production budget of that month's 2016 releases,
# truncated to int64.
# numeric_only=True keeps the text 'month' column out of the reduction; without
# it pandas >= 2.0 raises TypeError instead of silently dropping the column.
# NOTE: a month with no releases yields NaN, which astype(np.int64) rejects.
(df_jan2016, df_feb2016, df_mar2016, df_apr2016, df_may2016, df_jun2016,
 df_jul2016, df_aug2016, df_sep2016, df_oct2016, df_nov2016, df_dec2016) = (
    df_2016[df_2016['month'] == month_abbr].mean(numeric_only=True).astype(np.int64)
    for month_abbr in months
)

In [15]:
# Keep only the columns needed for the 2017 monthly averages.
df_2017 = df_2017[['worldwide_gross', 'month', 'production_budget']]
df_2017 = df_2017.dropna(subset=['month'])

# One Series per calendar month (Jan..Dec, following `months`) holding the mean
# worldwide gross and mean production budget of that month's 2017 releases,
# truncated to int64.
# numeric_only=True keeps the text 'month' column out of the reduction; without
# it pandas >= 2.0 raises TypeError instead of silently dropping the column.
# NOTE: a month with no releases yields NaN, which astype(np.int64) rejects.
(df_jan2017, df_feb2017, df_mar2017, df_apr2017, df_may2017, df_jun2017,
 df_jul2017, df_aug2017, df_sep2017, df_oct2017, df_nov2017, df_dec2017) = (
    df_2017[df_2017['month'] == month_abbr].mean(numeric_only=True).astype(np.int64)
    for month_abbr in months
)

In [16]:
# Keep only the columns needed for the 2018 monthly averages.
df_2018 = df_2018[['worldwide_gross', 'month', 'production_budget']]
df_2018 = df_2018.dropna(subset=['month'])

# One Series per calendar month (Jan..Dec, following `months`) holding the mean
# worldwide gross and mean production budget of that month's 2018 releases,
# truncated to int64.
# numeric_only=True keeps the text 'month' column out of the reduction; without
# it pandas >= 2.0 raises TypeError instead of silently dropping the column.
# NOTE: a month with no releases yields NaN, which astype(np.int64) rejects.
(df_jan2018, df_feb2018, df_mar2018, df_apr2018, df_may2018, df_jun2018,
 df_jul2018, df_aug2018, df_sep2018, df_oct2018, df_nov2018, df_dec2018) = (
    df_2018[df_2018['month'] == month_abbr].mean(numeric_only=True).astype(np.int64)
    for month_abbr in months
)

In [17]:
# Per-year tuples of the twelve monthly summary Series, in Jan..Dec order.
_monthly_2013 = (df_jan2013, df_feb2013, df_mar2013, df_apr2013, df_may2013, df_jun2013,
                 df_jul2013, df_aug2013, df_sep2013, df_oct2013, df_nov2013, df_dec2013)
_monthly_2014 = (df_jan2014, df_feb2014, df_mar2014, df_apr2014, df_may2014, df_jun2014,
                 df_jul2014, df_aug2014, df_sep2014, df_oct2014, df_nov2014, df_dec2014)
_monthly_2015 = (df_jan2015, df_feb2015, df_mar2015, df_apr2015, df_may2015, df_jun2015,
                 df_jul2015, df_aug2015, df_sep2015, df_oct2015, df_nov2015, df_dec2015)
_monthly_2016 = (df_jan2016, df_feb2016, df_mar2016, df_apr2016, df_may2016, df_jun2016,
                 df_jul2016, df_aug2016, df_sep2016, df_oct2016, df_nov2016, df_dec2016)
_monthly_2017 = (df_jan2017, df_feb2017, df_mar2017, df_apr2017, df_may2017, df_jun2017,
                 df_jul2017, df_aug2017, df_sep2017, df_oct2017, df_nov2017, df_dec2017)
_monthly_2018 = (df_jan2018, df_feb2018, df_mar2018, df_apr2018, df_may2018, df_jun2018,
                 df_jul2018, df_aug2018, df_sep2018, df_oct2018, df_nov2018, df_dec2018)

# mean_<year>        -> mean worldwide gross per month (Jan..Dec)
# mean_produc<year>  -> mean production budget per month (Jan..Dec)
# Values are looked up by column label rather than by position: integer keys on
# a label-indexed Series are deprecated in recent pandas, and label access does
# not depend on the order of the summary's entries.
mean_2013 = [monthly['worldwide_gross'] for monthly in _monthly_2013]
mean_produc2013 = [monthly['production_budget'] for monthly in _monthly_2013]
mean_2014 = [monthly['worldwide_gross'] for monthly in _monthly_2014]
mean_produc2014 = [monthly['production_budget'] for monthly in _monthly_2014]
mean_2015 = [monthly['worldwide_gross'] for monthly in _monthly_2015]
mean_produc2015 = [monthly['production_budget'] for monthly in _monthly_2015]
mean_2016 = [monthly['worldwide_gross'] for monthly in _monthly_2016]
mean_produc2016 = [monthly['production_budget'] for monthly in _monthly_2016]
mean_2017 = [monthly['worldwide_gross'] for monthly in _monthly_2017]
mean_produc2017 = [monthly['production_budget'] for monthly in _monthly_2017]
mean_2018 = [monthly['worldwide_gross'] for monthly in _monthly_2018]
mean_produc2018 = [monthly['production_budget'] for monthly in _monthly_2018]

In [None]:
# Matplotlib >= 3.6 ships the seaborn style sheets under a 'seaborn-v0_8-'
# prefix; fall back to the legacy name on older installations.
_bright_style = 'seaborn-v0_8-bright'
plt.style.use(_bright_style if _bright_style in plt.style.available else 'seaborn-bright')

# 2 x 3 grid: one panel per year, each comparing the monthly mean worldwide
# gross with the monthly mean production budget.
fig, ax = plt.subplots(2, 3, figsize = (18, 10))
x = months
yearly_series = [
    (2013, mean_2013, mean_produc2013),
    (2014, mean_2014, mean_produc2014),
    (2015, mean_2015, mean_produc2015),
    (2016, mean_2016, mean_produc2016),
    (2017, mean_2017, mean_produc2017),
    # Fixed: the 2018 panel previously plotted the 2017 budget series.
    (2018, mean_2018, mean_produc2018),
]
# ax.flat walks the grid row by row, matching the year order above.
for panel, (panel_year, gross_means, budget_means) in zip(ax.flat, yearly_series):
    panel.plot(x, gross_means, label='Mean worldwide gross')
    panel.plot(x, budget_means, label='Mean production budget')
    panel.set_title(str(panel_year))
    panel.legend()

In [None]:
# Slim copy of BigFrame holding only the budget, gross and release-timing columns.
df_complete = BigFrame.loc[:, ['production_budget', 'worldwide_gross', 'year', 'month']]

# Single-row tally frame: one integer column per month, every count starting at 0.
column_names = months
a = np.zeros((1, 12), dtype=int)
df_months = pd.DataFrame(data=a, columns=column_names)

In [None]:
# Convert BigFrame's 'month' column to an ordered categorical (Jan..Dec).
# The original bound df_months_values to BigFrame itself before assigning the
# column, so this conversion has always been applied to BigFrame in place;
# that side effect is kept, but is now explicit.
BigFrame['month'] = pd.Categorical(BigFrame['month'], categories=months, ordered=True)

# Month label of every movie, sorted in calendar order.
df_months_values = list(BigFrame['month'].sort_values())

# Number of releases per month, computed in one vectorized pass and written as
# a fresh single-row frame. Rebuilding df_months (instead of incrementing the
# existing counters row by row) keeps the totals correct when the cell is re-run.
releases_per_month = BigFrame['month'].value_counts().reindex(column_names, fill_value=0)
df_months = pd.DataFrame([releases_per_month.to_numpy()], columns=column_names)

# Get the values from this cell and copy them below *here*

list(df_months.value_counts().index)

In [None]:
# plt, sns and %matplotlib inline are already set up in the notebook's first
# cell, so they are not repeated here.

# Matplotlib >= 3.6 ships the seaborn style sheets under a 'seaborn-v0_8-'
# prefix; fall back to the legacy name on older installations.
_bright_style = 'seaborn-v0_8-bright'
plt.style.use(_bright_style if _bright_style in plt.style.available else 'seaborn-bright')

fig, ax = plt.subplots(figsize = (10, 10))
x = months
# Releases per month, Jan..Dec, computed from BigFrame instead of pasted in.
# For the data shown in this notebook this reproduces
# [69, 62, 81, 67, 69, 82, 84, 77, 83, 93, 107, 103].
y = BigFrame['month'].value_counts().reindex(months, fill_value=0).tolist()
# seaborn >= 0.12 only accepts x / y as keyword arguments.
sns.barplot(x=x, y=y, ax=ax);
ax.set_title('Total number of movie releases by month')
ax.set_xlabel('Data from 2001 - 2020')
ax.set_ylabel('Number of movies released');

In [None]:
# The best window to release our blockbuster action/drama/suspense film is the holiday season.
# The distribution above shows that other big-name production companies also release most of
# their movies late in the year (October through December have the highest release counts).