In [64]:
#Importing packages
import pandas as pd
import numpy as np
import matplotlib as plt


In [2]:
#loading the data files into pandas dataframe
gross = pd.read_csv('data/bom.movie_gross.csv')
imdb_name = pd.read_csv('data/imdb.name.basics.csv')
imdb_title_akas = pd.read_csv('data/imdb.title.akas.csv')
imdb_title_basics = pd.read_csv('data/imdb.title.basics.csv')
imdb_title_crew = pd.read_csv('data/imdb.title.crew.csv')
imdb_title_principals = pd.read_csv('data/imdb.title.principals.csv')
imdb_title_ratings = pd.read_csv('data/imdb.title.ratings.csv')
rt_movie_info = pd.read_csv('data/rt.movie_info.tsv', delimiter = '\t')
rt_reviews = pd.read_csv('data/rt.reviews.tsv', delimiter = '\t', encoding = 'unicode_escape')
tmdb_movies = pd.read_csv('data/tmdb.movies.csv')
tn_movie_budgets = pd.read_csv('data/tn.movie_budgets.csv') #No Missing values


In [3]:
# setting and renaming index of tmdb_movies dataset
tmdb_movies.set_index('Unnamed: 0',inplace = True)
tmdb_movies.index.rename('index',inplace = True)


In [4]:
#Checking for missing values in data
#gross.isna().sum()


title                0
studio               5
domestic_gross      28
foreign_gross     1350
year                 0
dtype: int64

In [5]:
# Dropping rows with missing studio
#gross.dropna(subset = ['studio'],axis = 0, inplace = True)


In [80]:
gross.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415.0,652.0,2010
1,Alice in Wonderland (2010),BV,334.2,691.3,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296.0,664.3,2010
3,Inception,WB,292.6,535.7,2010
4,Shrek Forever After,P/DW,238.7,513.9,2010


In [6]:
#gross.isna().sum()


title                0
studio               0
domestic_gross      26
foreign_gross     1349
year                 0
dtype: int64

In [7]:
#Filled missing values with of gross foriegn value to 0 to convert the type of the column to float
gross.foreign_gross = gross.foreign_gross.fillna(0)



In [8]:
#This function takes a dataframe and column name as arguments
#It removes '$' and ',' from the column values and convert the column type to float
#This function also divides the number with a million to make the numbers easy to read
def string_to_float(df,column):
    df[column] =  df[column].replace({'\$':''}, regex = True)
    df[column] =  df[column].replace({',':''}, regex = True)
    df[column] =  (df[column].astype(float))/1000000

# Calling the function on all columns from all datasets to get desired result
string_to_float(tn_movie_budgets,'production_budget')
string_to_float(tn_movie_budgets,'domestic_gross')
string_to_float(tn_movie_budgets,'worldwide_gross')
string_to_float(gross,'foreign_gross')


In [9]:
#since domestic_gross was already a float dividing it by a million to have same unit of gross throughout the project
gross.domestic_gross = gross.domestic_gross/1000000


In [10]:
#Creating a new column profit for profit/loss of movies
tn_movie_budgets['profit'] =  tn_movie_budgets['worldwide_gross'] - tn_movie_budgets['production_budget']




In [11]:
# top ten movies according to profit
#top_grossing_movies = list(tn_movie_budgets.sort_values(by = 'profit', ascending = True)['movie'])
# find movies that are profitable and not profitable
#tn_movie_budgets.sort_values(by = 'profit', ascending = False)['profit'].hist()
#sequels how good are sequels
#
tn_movie_budgets.sort_values(by = 'profit', ascending = False).head(10)
profitable_movies = list(tn_movie_budgets.sort_values(by = 'profit', ascending = False)['movie'].head(50))
not_profitable_movies = list(tn_movie_budgets.sort_values(by = 'profit')['movie'].head(50))




In [44]:
# Two datasets imdb_title_basics and tn_movie_budgets are merged to get profit and genres together in one table
merged_set = pd.merge(imdb_title_basics[['tconst', 'start_year', 'runtime_minutes', 'genres','original_title']], tn_movie_budgets, left_on = 'original_title', right_on = 'movie', how='inner')


In [59]:
merged_set  = merged_set.drop_duplicates(subset = 'movie')


In [62]:
merged_set.sort_values(by = 'profit', ascending = False).head(10)

Unnamed: 0,tconst,start_year,runtime_minutes,genres,original_title,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,profit
2516,tt2495766,2012,,Adventure,Titanic,43,"Dec 19, 1997",Titanic,200.0,659.363944,2208.208395,2008.208395
3087,tt4154756,2018,149.0,"Action,Adventure,Sci-Fi",Avengers: Infinity War,7,"Apr 27, 2018",Avengers: Infinity War,300.0,678.815482,2048.1342,1748.1342
12,tt0369610,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World,34,"Jun 12, 2015",Jurassic World,215.0,652.270625,1648.854864,1433.854864
135,tt0848228,2012,143.0,"Action,Adventure,Sci-Fi",The Avengers,27,"May 4, 2012",The Avengers,225.0,623.279547,1517.935897,1292.935897
1672,tt1825683,2018,134.0,"Action,Adventure,Sci-Fi",Black Panther,42,"Feb 16, 2018",Black Panther,200.0,700.059566,1348.258224,1148.258224
3227,tt4881806,2018,128.0,"Action,Adventure,Sci-Fi",Jurassic World: Fallen Kingdom,13,"Jun 22, 2018",Jurassic World: Fallen Kingdom,170.0,417.71976,1305.772799,1135.772799
774,tt1323045,2010,93.0,"Adventure,Drama,Sport",Frozen,56,"Nov 22, 2013",Frozen,150.0,400.738009,1272.46991,1122.46991
2649,tt2771200,2017,129.0,"Family,Fantasy,Musical",Beauty and the Beast,35,"Mar 17, 2017",Beauty and the Beast,160.0,504.014165,1259.199706,1099.199706
2357,tt2293640,2015,91.0,"Adventure,Animation,Comedy",Minions,73,"Jul 10, 2015",Minions,74.0,336.04577,1160.336173,1086.336173
2460,tt2395427,2015,141.0,"Action,Adventure,Sci-Fi",Avengers: Age of Ultron,4,"May 1, 2015",Avengers: Age of Ultron,330.6,459.005868,1403.013963,1072.413963


In [81]:
#Merging gross and tn_movie_budgets to get profit and studios in same table
merged_set2 = pd.merge(gross, tn_movie_budgets, left_on = 'title', right_on = 'movie', how='inner')


In [90]:
merged_set2.sort_values(by = 'profit', ascending = False).head(20)

Unnamed: 0,title,studio,domestic_gross_x,foreign_gross,year,id,release_date,movie,production_budget,domestic_gross_y,worldwide_gross,profit
1153,Avengers: Infinity War,BV,678.8,0.00137,2018,7,"Apr 27, 2018",Avengers: Infinity War,300.0,678.815482,2048.1342,1748.1342
763,Jurassic World,Uni.,652.3,0.001019,2015,34,"Jun 12, 2015",Jurassic World,215.0,652.270625,1648.854864,1433.854864
764,Furious 7,Uni.,353.0,0.001163,2015,67,"Apr 3, 2015",Furious 7,190.0,353.00702,1518.722794,1328.722794
1154,Black Panther,BV,700.1,646.9,2018,42,"Feb 16, 2018",Black Panther,200.0,700.059566,1348.258224,1148.258224
1155,Jurassic World: Fallen Kingdom,Uni.,417.7,891.8,2018,13,"Jun 22, 2018",Jurassic World: Fallen Kingdom,170.0,417.71976,1305.772799,1135.772799
495,Frozen,BV,400.7,875.7,2013,56,"Nov 22, 2013",Frozen,150.0,400.738009,1272.46991,1122.46991
766,Minions,Uni.,336.0,823.4,2015,73,"Jul 10, 2015",Minions,74.0,336.04577,1160.336173,1086.336173
765,Avengers: Age of Ultron,BV,459.0,946.4,2015,4,"May 1, 2015",Avengers: Age of Ultron,330.6,459.005868,1403.013963,1072.413963
1156,Incredibles 2,BV,608.6,634.2,2018,44,"Jun 15, 2018",Incredibles 2,200.0,608.581744,1242.520711,1042.520711
496,Iron Man 3,BV,409.0,805.8,2013,48,"May 3, 2013",Iron Man 3,200.0,408.992272,1215.392272,1015.392272


In [87]:
merged_set2.studio.unique()

array(['BV', 'WB', 'P/DW', 'Sum.', 'Par.', 'Uni.', 'Fox', 'Sony', 'FoxS',
       'SGem', 'WB (NL)', 'LGF', 'MBox', 'W/Dim.', 'Focus', 'MGM',
       'Over.', 'Mira.', 'NM', 'CBS', 'SPC', 'ParV', 'Gold.', 'RAtt.',
       'Magn.', 'IFC', 'Free', '3D', 'Wein.', 'Rela.', 'Anch.', 'App.',
       'Drft.', 'IW', 'Relbig.', 'Viv.', 'Eros', 'Scre.', 'UTV', 'Kino',
       'ATO', 'First', 'GK', 'NFC', 'Strand', 'Mont.', 'IVP', 'FD',
       'TriS', 'ORF', 'Jan.', 'Osci.', 'OMNI/FSR', 'SMod', 'WHE', 'P4',
       'ALP', 'LG/S', 'RTWC', 'MNE', 'LD', 'Yash', 'IM', 'A24', 'PH',
       'EOne', 'ELS', 'CE', 'Saban', 'DR', 'Trib.', 'KE', 'VE', 'EC',
       'BG', 'PFR', 'BST', 'BH Tilt', 'BSC', 'FCW', 'Cohen', 'LGP', 'TFA',
       'Alc', 'STX', 'Orch.', 'PNT', 'CJ', 'Cleopatra', 'BBC', 'GrtIndia',
       'Neon', 'Affirm', 'ENTMP', 'Studio 8', 'Annapurna', 'Global Road',
       'Amazon', 'RLJ'], dtype=object)