In [1]:
#loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# setting plt to inline
%matplotlib inline

In [2]:
# importing sqlite3
import sqlite3

# connecting to db and creating cursor
# sqlite3.connect() is the documented entry point for opening a database;
# the single cursor is shared by the query helpers defined in this cell
conn = sqlite3.connect('data/movies.db')
cur = conn.cursor()

# creating query functions
def fetcha(q, params=()):
    """Run a query on the shared cursor ``cur`` and return every row.

    Parameters
    ----------
    q : str
        SQL statement to execute.
    params : sequence or dict, optional
        Values bound to ``?`` / ``:name`` placeholders in ``q``. Defaults to
        no parameters, so existing calls with a finished SQL string still work.

    Returns
    -------
    list of tuple
        All rows produced by the statement (empty list when there are none).
    """
    return cur.execute(q, params).fetchall()


def fetcho(q, params=()):
    """Run a query on the shared cursor ``cur`` and return only the first row.

    Parameters
    ----------
    q : str
        SQL statement to execute.
    params : sequence or dict, optional
        Values bound to ``?`` / ``:name`` placeholders in ``q``.

    Returns
    -------
    tuple or None
        The first row, or ``None`` when the statement produced no rows.
    """
    return cur.execute(q, params).fetchone()

In [3]:
# listing every table and view in the database, leaving out names that match
# the 'sqlite_%' pattern, sorted by name
q = (
    "SELECT name FROM sqlite_master \n"
    "WHERE type IN ('table','view') \n"
    "AND name NOT LIKE 'sqlite_%'\n"
    "ORDER BY 1"
)
fetcha(q)

[('bom_movie_gross',),
 ('imdb_name_basic',),
 ('imdb_name_basics',),
 ('imdb_title_akas',),
 ('imdb_title_basics',),
 ('imdb_title_crew',),
 ('imdb_title_principals',),
 ('imdb_title_ratings',),
 ('rotten_tomatoes_critic_reviews',),
 ('rotten_tomatoes_movies',),
 ('tmdb_movies',),
 ('tn_movie_budgets',)]

In [4]:
# getting column names of imdb_title_basics via the table_info pragma;
# each returned tuple starts with the column position, name and declared type
q = "PRAGMA table_info('imdb_title_basics')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'tconst', 'TEXT', 0, None, 0),
 (2, 'primary_title', 'TEXT', 0, None, 0),
 (3, 'original_title', 'TEXT', 0, None, 0),
 (4, 'start_year', 'INTEGER', 0, None, 0),
 (5, 'runtime_minutes', 'REAL', 0, None, 0),
 (6, 'genres', 'TEXT', 0, None, 0)]

In [5]:
# getting column names of imdb_title_ratings via the table_info pragma;
# each returned tuple starts with the column position, name and declared type
q = "PRAGMA table_info('imdb_title_ratings')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'tconst', 'TEXT', 0, None, 0),
 (2, 'averagerating', 'REAL', 0, None, 0),
 (3, 'numvotes', 'INTEGER', 0, None, 0)]

In [6]:
# getting column names of bom_movie_gross via the table_info pragma;
# each returned tuple starts with the column position, name and declared type
q = "PRAGMA table_info('bom_movie_gross')"
fetcha(q)

[(0, 'idx', 'INTEGER', 0, None, 1),
 (1, 'title', 'TEXT', 0, None, 0),
 (2, 'studio', 'TEXT', 0, None, 0),
 (3, 'domestic_gross', 'REAL', 0, None, 0),
 (4, 'foreign_gross', 'TEXT', 0, None, 0),
 (5, 'year', 'INTEGER', 0, None, 0)]

In [7]:
# creating initial dataframe by joining the three tables and printing first row
# NOTE(review): the first join matches on title text only, so one box-office
# row can pair with several IMDb titles of the same name (e.g. 'Flipped' and
# 'Force' each show up twice in the result) — consider also matching on year.
q = """SELECT*FROM bom_movie_gross AS bm
       JOIN imdb_title_basics AS ib
       ON ib.primary_title = bm.title
       JOIN imdb_title_ratings 
       USING(tconst)"""
rows = fetcha(q)
# cur.description describes the statement fetcha() just ran on the shared
# cursor, so it has to be read after the query and before any other one
column_names = [col[0] for col in cur.description]
# handing the names to the constructor also works for an empty result, where
# assigning df.columns afterwards would fail with a length mismatch
df = pd.DataFrame(rows, columns=column_names)
df.head(1)

Unnamed: 0,idx,title,studio,domestic_gross,foreign_gross,year,idx.1,tconst,primary_title,original_title,start_year,runtime_minutes,genres,idx.2,averagerating,numvotes
0,0,Toy Story 3,BV,415000000.0,652000000,2010,104,tt0435761,Toy Story 3,Toy Story 3,2010,103.0,"Adventure,Animation,Comedy",51135,8.3,682218


In [8]:
# spot check: rows whose box-office title is exactly 'Titanic'
df.loc[df['title'] == 'Titanic']

Unnamed: 0,idx,title,studio,domestic_gross,foreign_gross,year,idx.1,tconst,primary_title,original_title,start_year,runtime_minutes,genres,idx.2,averagerating,numvotes


In [8]:
# getting df info: number of rows, non-null count and dtype for every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3027 entries, 0 to 3026
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   idx              3027 non-null   int64  
 1   title            3027 non-null   object 
 2   studio           3024 non-null   object 
 3   domestic_gross   3005 non-null   float64
 4   foreign_gross    1832 non-null   object 
 5   year             3027 non-null   int64  
 6   idx              3027 non-null   int64  
 7   tconst           3027 non-null   object 
 8   primary_title    3027 non-null   object 
 9   original_title   3027 non-null   object 
 10  start_year       3027 non-null   int64  
 11  runtime_minutes  2980 non-null   float64
 12  genres           3020 non-null   object 
 13  idx              3027 non-null   int64  
 14  averagerating    3027 non-null   float64
 15  numvotes         3027 non-null   int64  
dtypes: float64(3), int64(6), object(7)
memory usage: 378.5+ KB


In [9]:
# checking NaNs: number of missing values in each column
df.isna().sum()

idx                   0
title                 0
studio                3
domestic_gross       22
foreign_gross      1195
year                  0
idx                   0
tconst                0
primary_title         0
original_title        0
start_year            0
runtime_minutes      47
genres                7
idx                   0
averagerating         0
numvotes              0
dtype: int64

In [10]:
# changing numerical strings and NaN to floats and rechecking types
foreign_raw = df['foreign_gross']
foreign_num = pd.to_numeric(foreign_raw, errors='coerce')

# errors='coerce' silently replaces any string it cannot parse with NaN, so
# show exactly which non-missing values were lost by the conversion
unparsed = foreign_raw[foreign_raw.notna() & foreign_num.isna()]
if not unparsed.empty:
    print(f"{len(unparsed)} foreign_gross value(s) could not be parsed and became NaN:")
    print(unparsed.to_string())
    # TODO(review): decide how to recover these values before a missing
    # foreign_gross is interpreted as "no foreign release"

df['foreign_gross'] = foreign_num
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3027 entries, 0 to 3026
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   idx              3027 non-null   int64  
 1   title            3027 non-null   object 
 2   studio           3024 non-null   object 
 3   domestic_gross   3005 non-null   float64
 4   foreign_gross    1828 non-null   float64
 5   year             3027 non-null   int64  
 6   idx              3027 non-null   int64  
 7   tconst           3027 non-null   object 
 8   primary_title    3027 non-null   object 
 9   original_title   3027 non-null   object 
 10  start_year       3027 non-null   int64  
 11  runtime_minutes  2980 non-null   float64
 12  genres           3020 non-null   object 
 13  idx              3027 non-null   int64  
 14  averagerating    3027 non-null   float64
 15  numvotes         3027 non-null   int64  
dtypes: float64(4), int64(6), object(6)
memory usage: 378.5+ KB


In [11]:
# rechecking NaNs: foreign_gross now has four more than before the numeric
# conversion, i.e. four text values could not be parsed and were coerced to NaN
df.isna().sum() 

idx                   0
title                 0
studio                3
domestic_gross       22
foreign_gross      1199
year                  0
idx                   0
tconst                0
primary_title         0
original_title        0
start_year            0
runtime_minutes      47
genres                7
idx                   0
averagerating         0
numvotes              0
dtype: int64

In [12]:
# removing columns that are not needed for the analysis; the label 'idx'
# occurs once per joined table and every occurrence is dropped
unneeded_cols = ['tconst', 'idx', 'primary_title', 'original_title', 'start_year']
df = df.drop(columns=unneeded_cols)
df.head(1)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,runtime_minutes,genres,averagerating,numvotes
0,Toy Story 3,BV,415000000.0,652000000.0,2010,103.0,"Adventure,Animation,Comedy",8.3,682218


In [13]:
# rows where no studio is recorded
df.loc[df['studio'].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,runtime_minutes,genres,averagerating,numvotes
776,Keith Lemon: The Film,,,4000000.0,2012,85.0,Comedy,2.6,3950
1599,Plot for Peace,,7100.0,,2014,84.0,"Documentary,History,Thriller",6.7,128
2499,Secret Superstar,,,122000000.0,2017,150.0,"Drama,Music",8.0,16563


In [14]:
# rows where the domestic gross is missing
df.loc[df['domestic_gross'].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,runtime_minutes,genres,averagerating,numvotes
190,It's a Wonderful Afterlife,UTV,,1300000.0,2010,100.0,"Comedy,Drama,Fantasy",5.4,1361
227,Celine: Through the Eyes of the World,Sony,,119000.0,2010,120.0,"Documentary,Music",7.9,349
230,White Lion,Scre.,,99600.0,2010,88.0,"Drama,Family",6.7,828
440,Force,FoxS,,4800000.0,2011,137.0,"Action,Thriller",6.4,6348
441,Force,FoxS,,4800000.0,2011,135.0,"Action,Drama",6.2,23
769,The Tall Man,Imag.,,5200000.0,2012,106.0,"Crime,Drama,Horror",5.9,36331
770,The Tall Man,Imag.,,5200000.0,2012,79.0,Documentary,7.1,129
776,Keith Lemon: The Film,,,4000000.0,2012,85.0,Comedy,2.6,3950
851,Dark Tide,WHE,,432000.0,2012,94.0,"Action,Adventure,Drama",4.3,7682
907,The Green Wave,RF,,70100.0,2012,80.0,Documentary,7.6,290


In [15]:
# first few rows where the foreign gross is missing
df.loc[df['foreign_gross'].isna()].head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,runtime_minutes,genres,averagerating,numvotes
184,Flipped,WB,1800000.0,,2010,90.0,"Comedy,Drama,Romance",7.7,74937
185,Flipped,WB,1800000.0,,2010,90.0,"Drama,Thriller",6.0,440
211,Tiny Furniture,IFC,392000.0,,2010,98.0,"Comedy,Drama,Romance",6.2,13397
225,Casino Jack and the United States of Money,Magn.,177000.0,,2010,118.0,Documentary,7.1,1522
236,The Taqwacores,Strand,11400.0,,2010,83.0,"Drama,Music",6.1,506


In [16]:
# rows where the runtime is missing
df.loc[df['runtime_minutes'].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,runtime_minutes,genres,averagerating,numvotes
17,Robin Hood,Uni.,105300000.0,216400000.0,2010,,,7.6,5
35,Red,Sum.,90400000.0,108600000.0,2010,,Thriller,5.8,9
51,The Bounty Hunter,Sony,67099999.0,69300000.0,2010,,,6.3,29
69,Burlesque,SGem,39400000.0,50100000.0,2010,,Drama,7.0,45
136,The Last Station,SPC,6600000.0,6900000.0,2010,,Drama,3.8,6
177,Twelve,Hann.,184000.0,2400000.0,2010,,Comedy,8.4,17
181,Housefull,Eros,1200000.0,1200000.0,2010,,Family,3.9,8
197,We Are Family,UTV,638000.0,393000.0,2010,,"Comedy,Drama",2.6,30
200,We Are Family,UTV,638000.0,393000.0,2010,,"Comedy,Drama,Family",4.9,26
273,War Horse,BV,79900000.0,97700000.0,2011,,Drama,8.6,143


In [17]:
# rows where no genre is recorded
df.loc[df['genres'].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,runtime_minutes,genres,averagerating,numvotes
17,Robin Hood,Uni.,105300000.0,216400000.0,2010,,,7.6,5
51,The Bounty Hunter,Sony,67099999.0,69300000.0,2010,,,6.3,29
106,Going the Distance,WB (NL),17800000.0,24200000.0,2010,117.0,,3.9,13
208,I'm Still Here,Magn.,409000.0,160000.0,2010,60.0,,7.1,14
1655,The Intern,WB,75800000.0,118800000.0,2015,65.0,,5.5,22
1685,Joy,Fox,56500000.0,44700000.0,2015,80.0,,5.8,28
2153,When the Bough Breaks,SGem,29700000.0,911000.0,2016,93.0,,6.1,8


In [18]:
# keeping every film without a foreign gross figure; .head() is applied only
# to the preview so the stored frame is not cut down to its first five rows
# NOTE(review): this includes rows whose foreign_gross text failed numeric
# conversion earlier — confirm those really have no foreign release
pure_domestic_df = df[df['foreign_gross'].isna()]
pure_domestic_df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,runtime_minutes,genres,averagerating,numvotes
184,Flipped,WB,1800000.0,,2010,90.0,"Comedy,Drama,Romance",7.7,74937
185,Flipped,WB,1800000.0,,2010,90.0,"Drama,Thriller",6.0,440
211,Tiny Furniture,IFC,392000.0,,2010,98.0,"Comedy,Drama,Romance",6.2,13397
225,Casino Jack and the United States of Money,Magn.,177000.0,,2010,118.0,Documentary,7.1,1522
236,The Taqwacores,Strand,11400.0,,2010,83.0,"Drama,Music",6.1,506
