In [1]:
import pandas as pd
import gzip as gz
import matplotlib as plt
import sqlite3
import os
import zipfile

In [2]:
# Load the raw datasets explored in this notebook.
# The zipped SQLite database is extracted on every run; files already present in
# zippedData/ with the same names are overwritten, so the database on disk always
# comes from the archive.
with zipfile.ZipFile('zippedData/im.db.zip') as my_zip:
    my_zip.extractall(path='zippedData/')

# pandas infers gzip compression from the .gz extension when given a path, so the
# compressed files are read directly instead of being opened through gzip first.
# The two tab-separated rt.* files are decoded as latin1; the CSVs use the default encoding.
rt_reviews = pd.read_csv('zippedData/rt.reviews.tsv.gz', delimiter='\t', encoding='latin1')
rt_movie_info = pd.read_csv('zippedData/rt.movie_info.tsv.gz', delimiter='\t', encoding='latin1')
tmdb_movies = pd.read_csv('zippedData/tmdb.movies.csv.gz')
tn_movie_budgets = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
bom_movie_gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz')

In [3]:
# Open a connection to the extracted SQLite database and read the entire
# movie_basics table into a DataFrame. The connection object stays open in im_db.
im_db = sqlite3.connect('zippedData/im.db')
imdb_df = pd.read_sql(sql="SELECT * FROM movie_basics", con=im_db)

In [4]:
# Summarize the rt.reviews dataset: column names, non-null counts and dtypes
rt_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54432 non-null  int64 
 1   review      48869 non-null  object
 2   rating      40915 non-null  object
 3   fresh       54432 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54432 non-null  int64 
 6   publisher   54123 non-null  object
 7   date        54432 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


In [5]:
# Count how many rows of rt_movie_info each director appears in, most frequent first
rt_movie_info['director'].value_counts()

Steven Spielberg    10
Clint Eastwood       8
Jim Jarmusch         4
Woody Allen          4
William Beaudine     4
                    ..
Mark Rydell          1
Stephen Frears       1
David Gelb           1
Lewis Teague         1
Rose Troche          1
Name: director, Length: 1125, dtype: int64

In [9]:
# Count rows per director once, then keep only the directors who appear on more than
# one movie (i.e. at least 2) -- this is what the `count > 1` filter selects.
# NOTE: the variable name says "morethanthree" but the threshold is > 1; the name is
# kept unchanged because other cells refer to it.
director_counts = rt_movie_info['director'].value_counts()
directors_with_morethanthree_movies = director_counts.reset_index(name="count").query("count > 1")
# Build the name list from the same threshold rather than a hard-coded slice length,
# so the list and the filtered frame cannot drift apart if the data changes.
list_of_directors = list(director_counts[director_counts > 1].index)

In [10]:
# Print the filtered director counts followed by the list of director names, one per line
print(directors_with_morethanthree_movies, list_of_directors, sep="\n")

                index  count
0    Steven Spielberg     10
1      Clint Eastwood      8
2    William Friedkin      4
3         Woody Allen      4
4        Ridley Scott      4
..                ...    ...
171       Lloyd Bacon      2
172      Guy Hamilton      2
173       Lucio Fulci      2
174       John Huston      2
175      Marc Forster      2

[176 rows x 2 columns]
['Steven Spielberg', 'Clint Eastwood', 'William Friedkin', 'Woody Allen', 'Ridley Scott', 'Barry Levinson', 'Alfred Hitchcock', 'Bruce Beresford', 'Curtis Hanson', 'William Beaudine', 'Yimou Zhang', 'Jim Jarmusch', 'Neil Jordan', 'Boris Sagal', 'Mike Figgis', 'John Landis', 'Joseph Ruben', 'David Fincher', 'Gerald Thomas', 'David Swift', 'Phil Alden Robinson', 'Hong Sang-soo', 'Fritz Lang', 'George Cukor', 'Martha Coolidge', 'Sylvester Stallone', 'John Frankenheimer', 'Howard Hawks', 'Ang Lee', 'Charles Walters', 'David Lean', 'Richard Linklater', 'Anatole Litvak', 'George Sidney', 'Hark Tsui', 'Peter Bogdanovich', 'Wern

In [11]:
# Restrict rt_movie_info to the rows whose director is in list_of_directors, then keep
# only the rows where both 'studio' and 'box_office' are present.
df_of_director_movies = rt_movie_info[rt_movie_info['director'].isin(list_of_directors)]
dropped_nan_studios_and_boxoffice = df_of_director_movies.dropna(
    how='any',
    subset=['studio', 'box_office'],
)
# Print the summary of the filtered frame ...
dropped_nan_studios_and_boxoffice.info()
# ... and end the cell with the per-director row counts that remain
dropped_nan_studios_and_boxoffice['director'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73 entries, 15 to 1545
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            73 non-null     int64 
 1   synopsis      73 non-null     object
 2   rating        73 non-null     object
 3   genre         73 non-null     object
 4   director      73 non-null     object
 5   writer        67 non-null     object
 6   theater_date  73 non-null     object
 7   dvd_date      73 non-null     object
 8   currency      73 non-null     object
 9   box_office    73 non-null     object
 10  runtime       73 non-null     object
 11  studio        73 non-null     object
dtypes: int64(1), object(11)
memory usage: 7.4+ KB


Clint Eastwood         3
Donald Petrie          2
Thor Freudenthal       2
Ang Lee                2
Kasi Lemmons           2
Brian Robbins          2
George Hickenlooper    2
Bryan Singer           2
Gore Verbinski         2
Sam Mendes             2
Andrzej Bartkowiak     2
Paul Feig              2
Jay Roach              2
David Fincher          2
Barry Levinson         2
Seth MacFarlane        2
Neil Jordan            2
Yimou Zhang            2
Bruce Beresford        2
Kevin Smith            1
Chris Columbus         1
Jane Campion           1
James Cameron          1
John Singleton         1
Danny Boyle            1
Trish Sie              1
Steven Spielberg       1
Bill Duke              1
Guy Maddin             1
Gary Fleder            1
Craig Brewer           1
Tim Story              1
James Wong             1
Keenen Ivory Wayans    1
Gus Van Sant           1
Maggie Greenwald       1
Julian Jarrold         1
Peter Jackson          1
Steve Carr             1
Werner Herzog          1


In [12]:
# Summarize the movie_basics frame: column names, non-null counts and dtypes.
# This cell only ever rendered the info() summary: intermediate expression results in a
# cell are computed and thrown away. To inspect repeated titles, run
#     imdb_df.value_counts(['primary_title'])
#     imdb_df[imdb_df['primary_title'] == 'Home']
# each as the last line of its own cell.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [13]:
# Summarize the tn.movie_budgets dataset: column names, non-null counts and dtypes
tn_movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [14]:
# Summarize the bom.movie_gross dataset: column names, non-null counts and dtypes
bom_movie_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [15]:
# Preview the first five rows of the movie_basics frame
imdb_df.head()

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [16]:
# Preview the first five rows of the tn.movie_budgets dataset
tn_movie_budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [17]:
# Preview the first five rows of the bom.movie_gross dataset
bom_movie_gross.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [18]:
# Join the movie_basics frame to each revenue dataset by exact title match, giving two
# separate intermediate frames (imdb + bom_movie_gross and imdb + tn_movie_budgets).
# Those two frames are then joined to each other on movie_id to form all_movie_data.
# Every join relies on merge's default join type.
imdb_and_bom_data = pd.merge(
    imdb_df, bom_movie_gross, left_on='primary_title', right_on='title'
)
imdb_and_tn_data = pd.merge(
    imdb_df, tn_movie_budgets, left_on='primary_title', right_on='movie'
)
all_movie_data = pd.merge(
    imdb_and_bom_data, imdb_and_tn_data, left_on='movie_id', right_on='movie_id'
)
all_movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1602 entries, 0 to 1601
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movie_id           1602 non-null   object 
 1   primary_title_x    1602 non-null   object 
 2   original_title_x   1602 non-null   object 
 3   start_year_x       1602 non-null   int64  
 4   runtime_minutes_x  1502 non-null   float64
 5   genres_x           1576 non-null   object 
 6   title              1602 non-null   object 
 7   studio             1602 non-null   object 
 8   domestic_gross_x   1601 non-null   float64
 9   foreign_gross      1372 non-null   object 
 10  year               1602 non-null   int64  
 11  primary_title_y    1602 non-null   object 
 12  original_title_y   1602 non-null   object 
 13  start_year_y       1602 non-null   int64  
 14  runtime_minutes_y  1502 non-null   float64
 15  genres_y           1576 non-null   object 
 16  id                 1602 

In [19]:
# Drop the repeated columns produced by the merges, plus the columns that are not
# relevant to the business question.
# The trimmed frame is rebound to the same name instead of being modified in place, and
# errors="ignore" skips labels that are already gone, so re-running this cell is a no-op
# rather than a KeyError.
columns_to_drop = [
    'original_title_x', 'start_year_x', 'runtime_minutes_x', 'domestic_gross_x',
    'foreign_gross', 'movie', 'original_title_y', 'runtime_minutes_y', 'id',
    'primary_title_x', 'start_year_y', 'primary_title_y', 'genres_y',
]
all_movie_data = all_movie_data.drop(columns=columns_to_drop, errors="ignore")

In [20]:
# List every title that occurs on more than one row of all_movie_data, with its row
# count; such titles are likely repeated entries.
(
    all_movie_data['title']
    .value_counts()
    .reset_index(name="count")
    .loc[lambda counts: counts["count"] > 1]
)


Unnamed: 0,index,count
0,The Journey,11
1,The Gambler,10
2,The Wall,10
3,Robin Hood,10
4,Eden,9
...,...,...
198,Won't Back Down,2
199,The Invitation,2
200,Devil,2
201,The Bounty Hunter,2


In [21]:
# Keep only the first row for each title and discard the later rows that repeat it
# (whole rows are removed, not just the title values).
# The de-duplicated frame is rebound to the same name rather than using inplace=True.
all_movie_data = all_movie_data.drop_duplicates(subset=['title'], keep='first')

In [22]:
# Summarize all_movie_data after the column drop and de-duplication: columns, non-null counts, dtypes
all_movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1158 entries, 0 to 1601
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   movie_id           1158 non-null   object
 1   genres_x           1152 non-null   object
 2   title              1158 non-null   object
 3   studio             1158 non-null   object
 4   year               1158 non-null   int64 
 5   release_date       1158 non-null   object
 6   production_budget  1158 non-null   object
 7   domestic_gross_y   1158 non-null   object
 8   worldwide_gross    1158 non-null   object
dtypes: int64(1), object(8)
memory usage: 90.5+ KB


In [23]:
# Display all_movie_data (the rendered output shows only the first and last rows)
all_movie_data

Unnamed: 0,movie_id,genres_x,title,studio,year,release_date,production_budget,domestic_gross_y,worldwide_gross
0,tt0337692,"Adventure,Drama,Romance",On the Road,IFC,2012,"Mar 22, 2013","$25,000,000","$720,828","$9,313,302"
6,tt0359950,"Adventure,Comedy,Drama",The Secret Life of Walter Mitty,Fox,2013,"Dec 25, 2013","$91,000,000","$58,236,838","$187,861,183"
7,tt0365907,"Action,Crime,Drama",A Walk Among the Tombstones,Uni.,2014,"Sep 19, 2014","$28,000,000","$26,017,685","$62,108,587"
8,tt0369610,"Action,Adventure,Sci-Fi",Jurassic World,Uni.,2015,"Jun 12, 2015","$215,000,000","$652,270,625","$1,648,854,864"
9,tt0376136,"Comedy,Drama",The Rum Diary,FD,2011,"Oct 28, 2011","$45,000,000","$13,109,815","$21,544,732"
...,...,...,...,...,...,...,...,...,...
1597,tt7334528,"Comedy,Sport",Uncle Drew,LG/S,2018,"Jun 29, 2018","$18,000,000","$42,469,946","$46,527,161"
1598,tt7349662,"Biography,Crime,Drama",BlacKkKlansman,Focus,2018,"Aug 10, 2018","$15,000,000","$49,275,340","$93,017,335"
1599,tt7388562,"Adventure,Biography,Drama","Paul, Apostle of Christ",Affirm,2018,"Mar 23, 2018","$5,000,000","$17,547,999","$25,529,498"
1600,tt7401588,"Comedy,Drama",Instant Family,Par.,2018,"Nov 16, 2018","$48,000,000","$67,363,237","$119,736,188"
