In [1]:
#loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# setting plt to inline
%matplotlib inline

In [2]:
# importing sqlite3
import sqlite3

# Open the SQLite database file and create one cursor on it; the query
# helpers in this cell run their statements through `cur`.
# sqlite3.connect() is the documented way to obtain a Connection.
conn = sqlite3.connect('data/movies.db')
cur = conn.cursor()

# creating query functions
def fetcha(q, params=()):
    """Execute SQL statement `q` on the shared cursor `cur` and return all rows.

    Args:
        q: SQL text to execute.
        params: Optional values bound to the placeholders in `q` (a sequence
            for `?` placeholders, a mapping for named ones). Defaults to no
            parameters, so existing `fetcha(q)` calls behave as before.

    Returns:
        The list of row tuples from `fetchall()`; empty when the statement
        yields no rows.
    """
    # Binding values through `params` avoids building SQL by string formatting.
    return cur.execute(q, params).fetchall()
def fetcho(q, params=()):
    """Execute SQL statement `q` on the shared cursor `cur` and return one row.

    Args:
        q: SQL text to execute.
        params: Optional values bound to the placeholders in `q` (a sequence
            for `?` placeholders, a mapping for named ones). Defaults to no
            parameters, so existing `fetcho(q)` calls behave as before.

    Returns:
        The first row as a tuple, or None when the statement yields no rows.
    """
    # Binding values through `params` avoids building SQL by string formatting.
    return cur.execute(q, params).fetchone()

In [3]:
# List the names of all tables and views recorded in sqlite_master, skipping
# names that match the LIKE pattern 'sqlite_%', ordered by name.
q = """SELECT name FROM sqlite_master 
WHERE type IN ('table','view') 
AND name NOT LIKE 'sqlite_%'
ORDER BY 1"""
table_names = fetcha(q)
table_names

[('bom_movie_gross',),
 ('imdb_name_basic',),
 ('imdb_name_basics',),
 ('imdb_title_akas',),
 ('imdb_title_basics',),
 ('imdb_title_crew',),
 ('imdb_title_principals',),
 ('imdb_title_ratings',),
 ('rotten_tomatoes_critic_reviews',),
 ('rotten_tomatoes_movies',),
 ('tmdb_movies',),
 ('tn_movie_budgets',)]

In [4]:
# Build the working DataFrame by joining bom_movie_gross to tmdb_movies on
# their shared `title` column.
# NOTE(review): title is the only join key, so one gross row can match several
# tmdb_movies rows that share a title (the NaN listings further down show
# 'The Tall Man' and 'Flipped' twice, with different release dates) — consider
# also matching on release year.

q = """SELECT*FROM bom_movie_gross 
       JOIN tmdb_movies
       USING(title)
       """
rows = fetcha(q)
# Column names come from the cursor's description of the query just executed.
# They are passed to the constructor rather than assigned afterwards: with an
# empty result the frame would have zero columns and the later assignment
# would fail with a length-mismatch error.
tmdbom_df = pd.DataFrame(rows, columns=[col[0] for col in cur.description])
tmdbom_df.head(1)

Unnamed: 0,idx,title,studio,domestic_gross,foreign_gross,year,idx.1,genre_ids,id,original_language,original_title,popularity,release_date,vote_average,vote_count
0,0,Toy Story 3,BV,415000000.0,652000000,2010,7,"[16, 10751, 35]",10193,en,Toy Story 3,24.445,2010-06-17,7.7,8340


In [5]:
# Inspect the dtype and non-null count of every column in the joined frame.
tmdbom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2703 entries, 0 to 2702
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   idx                2703 non-null   int64  
 1   title              2703 non-null   object 
 2   studio             2702 non-null   object 
 3   domestic_gross     2682 non-null   float64
 4   foreign_gross      1723 non-null   object 
 5   year               2703 non-null   int64  
 6   idx                2703 non-null   int64  
 7   genre_ids          2703 non-null   object 
 8   id                 2703 non-null   int64  
 9   original_language  2703 non-null   object 
 10  original_title     2703 non-null   object 
 11  popularity         2703 non-null   float64
 12  release_date       2703 non-null   object 
 13  vote_average       2703 non-null   float64
 14  vote_count         2703 non-null   int64  
dtypes: float64(3), int64(5), object(7)
memory usage: 316.9+ KB


In [6]:
# Parse the string-typed foreign_gross column as numbers; with errors='coerce'
# any entry that cannot be parsed becomes NaN instead of raising.
# NOTE(review): the non-null count falls from 1723 to 1717 across this step
# (compare the two info() outputs), so six entries are coerced to NaN — worth
# inspecting before relying on this column.
foreign_gross_numeric = pd.to_numeric(tmdbom_df['foreign_gross'], errors='coerce')
tmdbom_df['foreign_gross'] = foreign_gross_numeric

In [7]:
# Turn the release_date strings into datetime values, then re-check the
# column dtypes.
release_dates = pd.to_datetime(tmdbom_df['release_date'])
tmdbom_df['release_date'] = release_dates
tmdbom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2703 entries, 0 to 2702
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   idx                2703 non-null   int64         
 1   title              2703 non-null   object        
 2   studio             2702 non-null   object        
 3   domestic_gross     2682 non-null   float64       
 4   foreign_gross      1717 non-null   float64       
 5   year               2703 non-null   int64         
 6   idx                2703 non-null   int64         
 7   genre_ids          2703 non-null   object        
 8   id                 2703 non-null   int64         
 9   original_language  2703 non-null   object        
 10  original_title     2703 non-null   object        
 11  popularity         2703 non-null   float64       
 12  release_date       2703 non-null   datetime64[ns]
 13  vote_average       2703 non-null   float64       
 14  vote_cou

In [8]:
# Remove the extraneous columns. The frame carries two columns labelled 'idx'
# (one per joined table); the single 'idx' entry removes both of them.
unused_columns = ['idx', 'id', 'original_language', 'original_title', 'year']
tmdbom_df = tmdbom_df.drop(columns=unused_columns)
tmdbom_df.head(1)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,genre_ids,popularity,release_date,vote_average,vote_count
0,Toy Story 3,BV,415000000.0,652000000.0,"[16, 10751, 35]",24.445,2010-06-17,7.7,8340


In [9]:
# Count the missing (NaN) values in each remaining column.
tmdbom_df.isna().sum()

title               0
studio              1
domestic_gross     21
foreign_gross     986
genre_ids           0
popularity          0
release_date        0
vote_average        0
vote_count          0
dtype: int64

In [10]:
# Show the rows that have no studio value. isna() already returns a boolean
# mask, so it is used directly as the row filter.
tmdbom_df[tmdbom_df['studio'].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,genre_ids,popularity,release_date,vote_average,vote_count
420,Fireflies in the Garden,,70600.0,3300000.0,[18],5.121,2011-10-11,6.3,96


In [11]:
# Show the rows that have no domestic_gross value. isna() already returns a
# boolean mask, so it is used directly as the row filter.
tmdbom_df[tmdbom_df['domestic_gross'].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,genre_ids,popularity,release_date,vote_average,vote_count
181,It's a Wonderful Afterlife,UTV,,1300000.0,"[35, 10749]",1.332,2010-10-08,4.6,12
221,Celine: Through the Eyes of the World,Sony,,119000.0,"[99, 10402]",0.989,2010-02-17,5.9,4
403,Force,FoxS,,4800000.0,"[18, 28, 53, 80, 10749]",1.433,2011-09-30,6.2,17
531,Empire of Silver,NeoC,,19000.0,"[18, 36, 10751, 10749]",2.72,2011-06-03,5.8,5
712,The Tall Man,Imag.,,5200000.0,"[27, 9648, 53]",8.149,2012-08-01,5.9,569
713,The Tall Man,Imag.,,5200000.0,[99],0.6,2011-09-08,5.8,5
773,Dark Tide,WHE,,432000.0,"[53, 12, 18]",7.132,2012-03-30,5.0,105
824,The Green Wave,RF,,70100.0,[],0.936,2012-08-10,8.7,3
979,22 Bullets,Cdgm.,,21300000.0,"[28, 80, 53]",8.697,2010-03-24,6.5,508
1016,Matru Ki Bijlee Ka Mandola,FIP,,6000000.0,"[35, 18]",2.036,2013-01-11,5.7,29


In [12]:
# Preview the first rows that have no foreign_gross value. isna() already
# returns a boolean mask, so it is used directly as the row filter.
tmdbom_df[tmdbom_df['foreign_gross'].isna()].head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,genre_ids,popularity,release_date,vote_average,vote_count
176,Flipped,WB,1800000.0,,"[10749, 18]",9.781,2010-08-06,7.4,859
177,Flipped,WB,1800000.0,,"[18, 53]",0.962,2015-12-15,7.0,2
201,Tiny Furniture,IFC,392000.0,,"[10749, 35, 18]",6.695,2010-11-12,5.9,82
211,Last Train Home,Zeit.,288000.0,,"[99, 18]",2.733,2010-09-03,7.2,26
217,Casino Jack and the United States of Money,Magn.,177000.0,,[99],1.428,2010-05-09,7.2,21
