# Genres that yield highest net profit

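This notebook loads the movie datasets stored in `./zippedData` and explores which film genres yield the highest net profit.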

## import the data

In [1]:
#necessary imports

import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
# Convert the tab-separated Rotten Tomatoes files to gzipped CSV so that they
# match the "*.csv.gz" pattern used when the data files are collected.

tsv_file = './zippedData/rt.movie_info.tsv.gz'
convert = pd.read_table(tsv_file, delimiter='\t')
# index=False: do not write the RangeIndex; otherwise it comes back as a
# spurious "Unnamed: 0" column when the CSV is read again.
convert.to_csv('./zippedData/rt.movie_info.csv.gz', index=False)

tsv_file_r = './zippedData/rt.reviews.tsv.gz'
# the reviews file is decoded with the 'unicode_escape' codec
convert_r = pd.read_table(tsv_file_r, delimiter='\t', encoding = 'unicode_escape')
convert_r.to_csv('./zippedData/rt.reviews.csv.gz', index=False)

In [3]:
# Collect every gzipped CSV in ./zippedData.
# glob() returns matches in arbitrary, platform-dependent order, so the list
# is sorted to keep the load order reproducible across machines and runs.

datafiles = sorted(glob("./zippedData/*.csv.gz"))
datafiles

['./zippedData\\bom.movie_gross.csv.gz',
 './zippedData\\imdb.name.basics.csv.gz',
 './zippedData\\imdb.title.akas.csv.gz',
 './zippedData\\imdb.title.basics.csv.gz',
 './zippedData\\imdb.title.crew.csv.gz',
 './zippedData\\imdb.title.principals.csv.gz',
 './zippedData\\rt.movie_info.csv.gz',
 './zippedData\\rt.reviews.csv.gz',
 './zippedData\\tmdb.movies.csv.gz',
 './zippedData\\tn.movie_budgets.csv.gz']

In [4]:
# Load every data file into a DataFrame, keyed by a cleaned-up name
# (e.g. "bom.movie_gross.csv.gz" -> "bom_movie_gross").
datafiles_dict = {}
for path in datafiles:
    table_key = os.path.basename(path)
    # strip the extensions first, then turn the remaining dots into underscores
    for old, new in ((".csv", ""), (".gz", ""), (".", "_")):
        table_key = table_key.replace(old, new)
    datafiles_dict[table_key] = pd.read_csv(path)

In [5]:
# list the cleaned names under which the DataFrames were stored
print(datafiles_dict.keys())

dict_keys(['bom_movie_gross', 'imdb_name_basics', 'imdb_title_akas', 'imdb_title_basics', 'imdb_title_crew', 'imdb_title_principals', 'rt_movie_info', 'rt_reviews', 'tmdb_movies', 'tn_movie_budgets'])


# Create SQLite database and tables:
# open a connection to the "movies_db_f1.sqlite" database file; the tables
# themselves are written by create_sql_table_from_df.

conn = sqlite3.connect("movies_db_f1.sqlite")
def create_sql_table_from_df(df, name, conn, if_exists="replace"):
    """Write a DataFrame to a table on the given database connection.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to store.
    name : str
        Name of the table to write.
    conn : sqlite3.Connection
        Connection handed to ``DataFrame.to_sql``.
    if_exists : {"replace", "fail", "append"}, default "replace"
        Forwarded to ``DataFrame.to_sql``. The default of "replace" keeps the
        call re-runnable: pandas' own default ("fail") makes every later run
        against the same database file fail with "table already exists",
        leaving stale data in place.

    Returns
    -------
    None
        Success or failure is reported with ``print``; errors are not raised.
    """
    try:
        df.to_sql(name, conn, if_exists=if_exists)
        print(f"Created table {name}")

    except Exception as e:
        # best-effort: report the problem and return, so a single bad table
        # does not abort a loop that is creating several tables
        print(f"could not make table {name}")
        print(e)

# write each loaded DataFrame to its own table, named after its dictionary key
for table_name, frame in datafiles_dict.items():
    create_sql_table_from_df(df=frame, name=table_name, conn=conn)

In [6]:
# load the tmdb_mglb.csv dataset into a DataFrame

tmdb_mglb_df = pd.read_csv('tmdb_mglb.csv')

tmdb_mglb_df.head() # preview the first 5 rows

Unnamed: 0.1,Unnamed: 0,id,original_title,popularity,release_date_x,title,vote_average,vote_count,Fantasy,Drama,...,Science Fiction,Horror,original_language,release_date_y,movie,production_budget,domestic_gross,worldwide_gross,domesticgross_v_production,total_net
0,0,12444,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,...,0.0,0.0,English,,,,,,,
1,1,10191,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,...,0.0,0.0,English,"Mar 26, 2010",How to Train Your Dragon,165000000.0,217581232.0,494870992.0,52581232.0,329870992.0
2,2,10138,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,...,1.0,0.0,English,"May 7, 2010",Iron Man 2,170000000.0,312433331.0,621156389.0,142433331.0,451156389.0
3,3,862,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,0.0,...,0.0,0.0,English,"Nov 22, 1995",Toy Story,30000000.0,191796233.0,364545516.0,161796233.0,334545516.0
4,4,27205,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,...,1.0,0.0,English,"Jul 16, 2010",Inception,160000000.0,292576195.0,835524642.0,132576195.0,675524642.0


In [7]:
type(tmdb_mglb_df) # confirm the loaded object is a pandas DataFrame

pandas.core.frame.DataFrame

In [8]:
tmdb_mglb_df.shape # (number of rows, number of columns)

(26597, 35)

In [9]:
tmdb_mglb_df.info() # per-column non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26597 entries, 0 to 26596
Data columns (total 35 columns):
Unnamed: 0                    26597 non-null int64
id                            26597 non-null int64
original_title                26597 non-null object
popularity                    26597 non-null float64
release_date_x                26597 non-null object
title                         26597 non-null object
vote_average                  26597 non-null float64
vote_count                    26597 non-null int64
Fantasy                       26597 non-null float64
Drama                         26597 non-null float64
War                           26597 non-null float64
Mystery                       26597 non-null float64
Comedy                        26597 non-null float64
Romance                       26597 non-null float64
Family                        26597 non-null float64
Music                         26597 non-null float64
Documentary                   26597 non-null float64

In [10]:
tmdb_mglb_df.isna().sum() # number of missing (NaN) values in each column

Unnamed: 0                        0
id                                0
original_title                    0
popularity                        0
release_date_x                    0
title                             0
vote_average                      0
vote_count                        0
Fantasy                           0
Drama                             0
War                               0
Mystery                           0
Comedy                            0
Romance                           0
Family                            0
Music                             0
Documentary                       0
Animation                         0
Western                           0
Action                            0
Crime                             0
Thriller                          0
TV Movie                          0
Adventure                         0
History                           0
Science Fiction                   0
Horror                            0
original_language           

In [11]:
# Drop the leftover 'Unnamed: 0' column (presumably an index saved by an
# earlier to_csv call -- TODO confirm).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been removed.
tmdb_mglb_df = tmdb_mglb_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [12]:
# preview the first 5 rows again after removing the column
tmdb_mglb_df.head()

Unnamed: 0,id,original_title,popularity,release_date_x,title,vote_average,vote_count,Fantasy,Drama,War,...,Science Fiction,Horror,original_language,release_date_y,movie,production_budget,domestic_gross,worldwide_gross,domesticgross_v_production,total_net
0,12444,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,1.0,0.0,0.0,...,0.0,0.0,English,,,,,,,
1,10191,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,1.0,0.0,0.0,...,0.0,0.0,English,"Mar 26, 2010",How to Train Your Dragon,165000000.0,217581232.0,494870992.0,52581232.0,329870992.0
2,10138,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,0.0,0.0,0.0,...,1.0,0.0,English,"May 7, 2010",Iron Man 2,170000000.0,312433331.0,621156389.0,142433331.0,451156389.0
3,862,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,0.0,0.0,0.0,...,0.0,0.0,English,"Nov 22, 1995",Toy Story,30000000.0,191796233.0,364545516.0,161796233.0,334545516.0
4,27205,Inception,27.92,2010-07-16,Inception,8.3,22186,0.0,0.0,0.0,...,1.0,0.0,English,"Jul 16, 2010",Inception,160000000.0,292576195.0,835524642.0,132576195.0,675524642.0


## Overall, what are the top 5 grossing films?

In [13]:
# order the films from highest to lowest worldwide_gross

sortbytotalgross = tmdb_mglb_df.sort_values('worldwide_gross', ascending=False)

In [14]:
# first 5 rows of the sorted frame = the 5 highest worldwide-grossing films
sortbytotalgross.head()

Unnamed: 0,id,original_title,popularity,release_date_x,title,vote_average,vote_count,Fantasy,Drama,War,...,Science Fiction,Horror,original_language,release_date_y,movie,production_budget,domestic_gross,worldwide_gross,domesticgross_v_production,total_net
6,19995,Avatar,26.526,2009-12-18,Avatar,7.4,18676,1.0,0.0,0.0,...,1.0,0.0,English,"Dec 18, 2009",Avatar,425000000.0,760507625.0,2776345000.0,335507625.0,2351345000.0
23887,299536,Avengers: Infinity War,80.773,2018-04-27,Avengers: Infinity War,8.3,13948,1.0,0.0,0.0,...,0.0,0.0,English,"Apr 27, 2018",Avengers: Infinity War,300000000.0,678815482.0,2048134000.0,378815482.0,1748134000.0
14236,135397,Jurassic World,20.709,2015-06-12,Jurassic World,6.6,14056,0.0,0.0,0.0,...,1.0,0.0,English,"Jun 12, 2015",Jurassic World,215000000.0,652270625.0,1648855000.0,437270625.0,1433855000.0
14239,168259,Furious 7,20.396,2015-04-03,Furious 7,7.3,6538,0.0,0.0,0.0,...,0.0,0.0,English,"Apr 3, 2015",Furious 7,190000000.0,353007020.0,1518723000.0,163007020.0,1328723000.0
5196,24428,The Avengers,50.289,2012-05-04,The Avengers,7.6,19673,0.0,0.0,0.0,...,1.0,0.0,English,"May 4, 2012",The Avengers,225000000.0,623279547.0,1517936000.0,398279547.0,1292936000.0


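The `production_budget`, `domestic_gross`, and `worldwide_gross` columns appear to contain numbers, but their datatype is reported as object. At first glance there do not seem to be any missing values; note, however, that the first row of `tmdb_mglb_df.head()` above has empty budget and gross fields, so this should be verified during data cleaning.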

## If available, what are the genres of those top 5 grossing films?

In [15]:
# TODO (KAT): add the code that looks up the genres of the top 5 grossing films

Findings: Used the `imdb_title_basics` dataset to look up the titles of the top 5 grossing films and noted the genres identified for each film.

## Top 5 grossing films by year

    * Review net loss
    * Review net gain

### Genre performance by runtime

    * Review net loss
    * Review net gain

## Data Cleaning

### Dealing with datatypes

### Checking the value counts 

### Result: By Genre, which films yield the highest net profit?

## Data Visualization