# Genres that yield highest net profit

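This notebook loads the project's movie datasets, inspects a combined table of TMDB film details and budget/gross figures, and explores gross revenue, production budgets, and genres to identify which genres yield the highest net profit.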

## import the data

In [1]:
#necessary imports

import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
#render floats in dataframe output with exactly two decimal places (e.g. 33.53)

pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
#convert the tab-separated files to gzipped csv so the csv glob below picks them up

tsv_file = './zippedData/rt.movie_info.tsv.gz'
tsv_file_r = './zippedData/rt.reviews.tsv.gz'


def tsv_to_csv(tsv_path, csv_path, **read_kwargs):
    """Read a tab-separated file and write it back out as csv.

    Parameters
    ----------
    tsv_path : str
        Path of the tab-separated input file.
    csv_path : str
        Path of the csv file to write.
    **read_kwargs
        Extra keyword arguments forwarded to ``pd.read_table`` (e.g. ``encoding``).

    Returns
    -------
    pandas.DataFrame
        The dataframe that was read from ``tsv_path``.
    """
    frame = pd.read_table(tsv_path, **read_kwargs)  # read_table already defaults to sep='\t'
    # index=False: do not write the RangeIndex, otherwise it comes back as a
    # spurious 'Unnamed: 0' column when the csv is read again
    frame.to_csv(csv_path, index=False)
    return frame


convert = tsv_to_csv(tsv_file, './zippedData/rt.movie_info.csv.gz')

# NOTE(review): 'unicode_escape' also interprets backslash sequences inside the text;
# presumably a workaround for a non-UTF-8 file - confirm whether 'latin-1' was intended
convert_r = tsv_to_csv(tsv_file_r, './zippedData/rt.reviews.csv.gz', encoding='unicode_escape')

In [4]:
#collect the path of every gzipped csv in zippedData
#glob returns matches in arbitrary order, so sort for a stable listing and load order

datafiles = sorted(glob("./zippedData/*.csv.gz"))
datafiles

['./zippedData\\bom.movie_gross.csv.gz',
 './zippedData\\imdb.name.basics.csv.gz',
 './zippedData\\imdb.title.akas.csv.gz',
 './zippedData\\imdb.title.basics.csv.gz',
 './zippedData\\imdb.title.crew.csv.gz',
 './zippedData\\imdb.title.principals.csv.gz',
 './zippedData\\rt.movie_info.csv.gz',
 './zippedData\\rt.reviews.csv.gz',
 './zippedData\\tmdb.movies.csv.gz',
 './zippedData\\tn.movie_budgets.csv.gz']

In [5]:
def _dataset_key(path):
    """Turn a data file path into a dictionary key.

    Takes the base file name, strips the '.csv' and '.gz' suffixes and replaces
    the remaining dots with underscores,
    e.g. 'bom.movie_gross.csv.gz' -> 'bom_movie_gross'.
    """
    key = os.path.basename(path)
    for old, new in ((".csv", ""), (".gz", ""), (".", "_")):
        key = key.replace(old, new)
    return key


#load every listed csv into a dataframe, keyed by its cleaned file name
datafiles_dict = {_dataset_key(path): pd.read_csv(path) for path in datafiles}

In [6]:
#list the cleaned dataset names that were loaded into datafiles_dict
print(datafiles_dict.keys())

dict_keys(['bom_movie_gross', 'imdb_name_basics', 'imdb_title_akas', 'imdb_title_basics', 'imdb_title_crew', 'imdb_title_principals', 'rt_movie_info', 'rt_reviews', 'tmdb_movies', 'tn_movie_budgets'])


In [7]:
#load the combined movie table from csv into a dataframe
# NOTE(review): tmdb_mglb.csv is not written by any cell above - presumably it is
# exported by a separate notebook; confirm it exists before running this one

tmdb_mglb_path = 'tmdb_mglb.csv'
tmdb_mglb_df = pd.read_csv(tmdb_mglb_path)

tmdb_mglb_df.head() #preview the first 5 rows

Unnamed: 0.1,Unnamed: 0,id,original_title,popularity,release_date_x,title,vote_average,vote_count,Fantasy,Drama,...,Science Fiction,Horror,original_language,release_date_y,movie,production_budget,domestic_gross,worldwide_gross,domesticgross_v_production,total_net
0,0,12444,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,...,0.0,0.0,English,,,,,,,
1,1,10191,How to Train Your Dragon,28.73,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,...,0.0,0.0,English,"Mar 26, 2010",How to Train Your Dragon,165000000.0,217581232.0,494870992.0,52581232.0,329870992.0
2,2,10138,Iron Man 2,28.52,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,...,1.0,0.0,English,"May 7, 2010",Iron Man 2,170000000.0,312433331.0,621156389.0,142433331.0,451156389.0
3,3,862,Toy Story,28.0,1995-11-22,Toy Story,7.9,10174,0.0,0.0,...,0.0,0.0,English,"Nov 22, 1995",Toy Story,30000000.0,191796233.0,364545516.0,161796233.0,334545516.0
4,4,27205,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,...,1.0,0.0,English,"Jul 16, 2010",Inception,160000000.0,292576195.0,835524642.0,132576195.0,675524642.0


In [8]:
type(tmdb_mglb_df) #confirm the csv was loaded as a pandas DataFrame

pandas.core.frame.DataFrame

In [9]:
tmdb_mglb_df.shape #(number of rows, number of columns)

(26597, 35)

In [10]:
tmdb_mglb_df.info() #per-column non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26597 entries, 0 to 26596
Data columns (total 35 columns):
Unnamed: 0                    26597 non-null int64
id                            26597 non-null int64
original_title                26597 non-null object
popularity                    26597 non-null float64
release_date_x                26597 non-null object
title                         26597 non-null object
vote_average                  26597 non-null float64
vote_count                    26597 non-null int64
Fantasy                       26597 non-null float64
Drama                         26597 non-null float64
War                           26597 non-null float64
Mystery                       26597 non-null float64
Comedy                        26597 non-null float64
Romance                       26597 non-null float64
Family                        26597 non-null float64
Music                         26597 non-null float64
Documentary                   26597 non-null float64

In [11]:
tmdb_mglb_df.isna().sum() #number of NaN values in each column

Unnamed: 0                        0
id                                0
original_title                    0
popularity                        0
release_date_x                    0
title                             0
vote_average                      0
vote_count                        0
Fantasy                           0
Drama                             0
War                               0
Mystery                           0
Comedy                            0
Romance                           0
Family                            0
Music                             0
Documentary                       0
Animation                         0
Western                           0
Action                            0
Crime                             0
Thriller                          0
TV Movie                          0
Adventure                         0
History                           0
Science Fiction                   0
Horror                            0
original_language           

In [12]:
#drop the leftover csv index column and the two title columns that are not used below
#errors='ignore' keeps this cell safe to re-run: the frame is overwritten in place,
#so on a second run the columns are already gone and a plain drop would raise KeyError
tmdb_mglb_df = tmdb_mglb_df.drop(columns=['Unnamed: 0', 'movie', 'title'], errors='ignore')

In [13]:
tmdb_mglb_df.head() #preview the first 5 rows after dropping the columns

Unnamed: 0,id,original_title,popularity,release_date_x,vote_average,vote_count,Fantasy,Drama,War,Mystery,...,History,Science Fiction,Horror,original_language,release_date_y,production_budget,domestic_gross,worldwide_gross,domesticgross_v_production,total_net
0,12444,Harry Potter and the Deathly Hallows: Part 1,33.53,2010-11-19,7.7,10788,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,English,,,,,,
1,10191,How to Train Your Dragon,28.73,2010-03-26,7.7,7610,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,English,"Mar 26, 2010",165000000.0,217581232.0,494870992.0,52581232.0,329870992.0
2,10138,Iron Man 2,28.52,2010-05-07,6.8,12368,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,English,"May 7, 2010",170000000.0,312433331.0,621156389.0,142433331.0,451156389.0
3,862,Toy Story,28.0,1995-11-22,7.9,10174,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,English,"Nov 22, 1995",30000000.0,191796233.0,364545516.0,161796233.0,334545516.0
4,27205,Inception,27.92,2010-07-16,8.3,22186,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,English,"Jul 16, 2010",160000000.0,292576195.0,835524642.0,132576195.0,675524642.0


## Overall, what are the top 5 grossing films?

In [14]:
#rank every film by worldwide gross, highest first

sortbytotalgross = tmdb_mglb_df.sort_values('worldwide_gross', ascending=False)

In [15]:
#the first 5 rows are the 5 highest worldwide-grossing films
sortbytotalgross.head()

Unnamed: 0,id,original_title,popularity,release_date_x,vote_average,vote_count,Fantasy,Drama,War,Mystery,...,History,Science Fiction,Horror,original_language,release_date_y,production_budget,domestic_gross,worldwide_gross,domesticgross_v_production,total_net
6,19995,Avatar,26.53,2009-12-18,7.4,18676,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,English,"Dec 18, 2009",425000000.0,760507625.0,2776345279.0,335507625.0,2351345279.0
23887,299536,Avengers: Infinity War,80.77,2018-04-27,8.3,13948,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,English,"Apr 27, 2018",300000000.0,678815482.0,2048134200.0,378815482.0,1748134200.0
14236,135397,Jurassic World,20.71,2015-06-12,6.6,14056,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,English,"Jun 12, 2015",215000000.0,652270625.0,1648854864.0,437270625.0,1433854864.0
14239,168259,Furious 7,20.4,2015-04-03,7.3,6538,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,English,"Apr 3, 2015",190000000.0,353007020.0,1518722794.0,163007020.0,1328722794.0
5196,24428,The Avengers,50.29,2012-05-04,7.6,19673,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,English,"May 4, 2012",225000000.0,623279547.0,1517935897.0,398279547.0,1292935897.0


***Findings***: Ranked by worldwide gross revenue (which includes domestic sales), the top 5 grossing films are:
1) Avatar - 2,776,345,279
2) Avengers: Infinity War - 2,048,134,200
3) Jurassic World - 1,648,854,864
4) Furious 7 - 1,518,722,794
5) The Avengers - 1,517,935,897

In [None]:
#visual of top 5 grossing

### If available, what are the genres of those top 5 grossing films? 

In [16]:
# TODO(KAT): look up the genres of the top 5 grossing films (see the Findings note below: imdb_title_basics)

***Findings***: Used the [imdb_title_basics] dataset to look up the titles of the top 5 grossing films and noted the genres listed for each film.

## Top 5 grossing films by year

### What are the titles of the films?

In [None]:
#pull year, domestic gross and title for every film in tn_movie_budgets, highest domestic gross first
# NOTE(review): movie.db is not built by any cell above. sqlite3.connect() creates an empty
# database file when it is missing, and the query then fails with "no such table" - confirm
# where the database is created.
# NOTE(review): this ranks all films together; it does not yet select the top 5 within each year.

conn = sqlite3.connect('movie.db')
try:
    # read_sql_query labels the columns from the query itself, so an empty result
    # still produces a correctly-labelled (empty) dataframe
    dft = pd.read_sql_query("""
SELECT
  DISTINCT year, domestic_gross, movie
FROM tn_movie_budgets
ORDER BY domestic_gross DESC
;
""", conn)
finally:
    conn.close() #always release the database handle, even when the query fails

dft.head()

### What was the production cost?

In [None]:
#pull year, production budget and title for every film in tn_movie_budgets, highest domestic gross first
# NOTE(review): movie.db is not built by any cell above. sqlite3.connect() creates an empty
# database file when it is missing, and the query then fails with "no such table" - confirm
# where the database is created.
# NOTE(review): this reuses the name dft from the previous cell, so that result is overwritten.

conn = sqlite3.connect('movie.db')
try:
    # read_sql_query labels the columns from the query itself, so an empty result
    # still produces a correctly-labelled (empty) dataframe
    dft = pd.read_sql_query("""
SELECT
  DISTINCT year, production_budget, movie
FROM tn_movie_budgets
ORDER BY domestic_gross DESC
;
""", conn)
finally:
    conn.close() #always release the database handle, even when the query fails

dft.head()

***Findings***: write summary here

### What was the runtime?

In [None]:
# TODO(KAT): look up the runtime of the top 5 grossing films (see the Findings note below: imdb_title_basics)

***Findings***: (summary to be written) Used the [imdb_title_basics] dataset to look up the titles of the top 5 grossing films and noted the runtime of each film.

## Most recently, what has been the average production cost? 

In [22]:
#rank films from the largest production budget to the smallest

productioncosts = tmdb_mglb_df.sort_values('production_budget', ascending=False)

productioncosts.head()

Unnamed: 0,id,original_title,popularity,release_date_x,vote_average,vote_count,Fantasy,Drama,War,Mystery,...,History,Science Fiction,Horror,original_language,release_date_y,production_budget,domestic_gross,worldwide_gross,domesticgross_v_production,total_net
6,19995,Avatar,26.53,2009-12-18,7.4,18676,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,English,"Dec 18, 2009",425000000.0,760507625.0,2776345279.0,335507625.0,2351345279.0
2481,1865,Pirates of the Caribbean: On Stranger Tides,30.58,2011-05-20,6.4,8571,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,English,"May 20, 2011",410600000.0,241063875.0,1045663875.0,-169536125.0,635063875.0
14211,99861,Avengers: Age of Ultron,44.38,2015-05-01,7.3,13457,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,English,"May 1, 2015",330600000.0,459005868.0,1403013963.0,128405868.0,1072413963.0
14217,206647,Spectre,30.32,2015-11-06,6.4,6719,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,English,"Nov 6, 2015",300000000.0,200074175.0,879620923.0,-99925825.0,579620923.0
17440,206647,Spectre,30.32,2015-11-06,6.4,6719,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,English,"Nov 6, 2015",300000000.0,200074175.0,879620923.0,-99925825.0,579620923.0


In [23]:
#keep only the films that have a value in every column (any row containing a NaN is removed)
# NOTE(review): this filters on ALL columns, not only production_budget - confirm that is intended

productioncosts = productioncosts.dropna()

productioncosts

Unnamed: 0,id,original_title,popularity,release_date_x,vote_average,vote_count,Fantasy,Drama,War,Mystery,...,History,Science Fiction,Horror,original_language,release_date_y,production_budget,domestic_gross,worldwide_gross,domesticgross_v_production,total_net
6,19995,Avatar,26.53,2009-12-18,7.40,18676,1.00,0.00,0.00,0.00,...,0.00,1.00,0.00,English,"Dec 18, 2009",425000000.00,760507625.00,2776345279.00,335507625.00,2351345279.00
2481,1865,Pirates of the Caribbean: On Stranger Tides,30.58,2011-05-20,6.40,8571,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,English,"May 20, 2011",410600000.00,241063875.00,1045663875.00,-169536125.00,635063875.00
14211,99861,Avengers: Age of Ultron,44.38,2015-05-01,7.30,13457,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,English,"May 1, 2015",330600000.00,459005868.00,1403013963.00,128405868.00,1072413963.00
14217,206647,Spectre,30.32,2015-11-06,6.40,6719,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,English,"Nov 6, 2015",300000000.00,200074175.00,879620923.00,-99925825.00,579620923.00
17440,206647,Spectre,30.32,2015-11-06,6.40,6719,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,English,"Nov 6, 2015",300000000.00,200074175.00,879620923.00,-99925825.00,579620923.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12365,279516,Ten,1.57,2014-03-28,5.40,5,0.00,0.00,0.00,1.00,...,0.00,0.00,1.00,English,"Apr 21, 2015",25000.00,0.00,0.00,-25000.00,-25000.00
14730,226458,Exeter,5.93,2015-03-26,4.70,121,0.00,0.00,0.00,0.00,...,0.00,0.00,1.00,English,"Sep 1, 2015",25000.00,0.00,489792.00,-25000.00,464792.00
10500,255266,Dry Spell,0.60,2013-02-14,6.00,1,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,English,"Dec 31, 2014",22000.00,0.00,0.00,-22000.00,-22000.00
8920,86304,All Superheroes Must Die,2.08,2013-01-04,3.90,19,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,English,"Jan 4, 2013",20000.00,0.00,0.00,-20000.00,-20000.00


In [26]:
#median of the production_budget column
# NOTE(review): some films appear more than once (e.g. Spectre, id 206647, is listed twice
# in the sorted output above), so they are counted twice here - consider de-duplicating first

productioncosts['production_budget'].median()

19000000.0

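***Findings***: The median production budget across films with complete data is 19,000,000. Note that this is the median, not the mean (average), and that it covers every release year in the dataset rather than only the most recent ones.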

## Which 5 genres had the highest box office revenue?

In [None]:
# TODO(KAT): find the 5 genres with the highest box office revenue

***Findings***: In the Rotten Tomatoes dataset, is there a common genre associated with the highest box office revenue?

## Data Visualization

In [None]:
# TODO: add the data visualizations for the findings above

***Findings***: write summary here