# Box Office Dataset Exploration

In [15]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas diagnostics such as
# SettingWithCopyWarning; consider narrowing it.
warnings.filterwarnings(action='ignore')

# None lifts the column cap, so wide frames are displayed without truncation.
pd.options.display.max_columns = None

## Load All Datasets

In [16]:
# Every source is narrowed to 2015 onwards right after it is read.

# Box office grosses: filtered directly on the file's 'Year' column.
box_office_df = pd.read_csv("../data/boxoffice_data_2024.csv").loc[
    lambda frame: frame['Year'] >= 2015
]

# TMDB "all movies" dump: parse release dates (unparseable values become NaT),
# then keep rows that have a date falling in 2015 or later.
tmdb_all_df = pd.read_csv("../data/TMDB_all_movies.csv")
tmdb_all_df['release_date'] = pd.to_datetime(tmdb_all_df['release_date'], errors='coerce')
tmdb_all_df = tmdb_all_df.loc[
    lambda frame: frame['release_date'].notna() & (frame['release_date'].dt.year >= 2015)
]

# TMDB v11 dump: same date parsing and year cut-off as above.
tmdb_930k_df = pd.read_csv("../data/TMDB_movie_dataset_v11.csv")
tmdb_930k_df['release_date'] = pd.to_datetime(tmdb_930k_df['release_date'], errors='coerce')
tmdb_930k_df = tmdb_930k_df.loc[
    lambda frame: frame['release_date'].notna() & (frame['release_date'].dt.year >= 2015)
]

# IMDB title basics: startYear is coerced to numeric (non-numeric entries
# become NaN) before the same 2015+ cut-off is applied.
# NOTE(review): no titleType filter is applied here, so this frame is not
# limited to movies despite its name.
imdb_movies_df = pd.read_csv("../data/title.basics.tsv", sep='\t', low_memory=False)
imdb_movies_df['startYear'] = pd.to_numeric(imdb_movies_df['startYear'], errors='coerce')
imdb_movies_df = imdb_movies_df.loc[
    lambda frame: frame['startYear'].notna() & (frame['startYear'] >= 2015)
]

# IMDB ratings: keep only ratings whose tconst survived the title filter above.
valid_movie_ids = set(imdb_movies_df['tconst'])
imdb_ratings_df = pd.read_csv("../data/title.ratings.tsv", sep='\t', low_memory=False).loc[
    lambda frame: frame['tconst'].isin(valid_movie_ids)
]

# One line per dataset with its (rows, columns) after filtering.
print(
    f"Box Office: {box_office_df.shape}",
    f"TMDB All: {tmdb_all_df.shape}",
    f"TMDB v11: {tmdb_930k_df.shape}",
    f"IMDB Movies: {imdb_movies_df.shape}",
    f"IMDB Ratings: {imdb_ratings_df.shape}",
    sep="\n",
)

Box Office: (2000, 3)
TMDB All: (417542, 28)
TMDB v11: (427318, 24)
IMDB Movies: (4843239, 9)
IMDB Ratings: (660971, 3)


## Box Office Dataset

In [17]:
# Report the filtered box-office frame's dimensions, column names and the
# span of its 'Year' column, then preview the first rows as the cell output.
print(
    "Box Office Dataset Overview:",
    f"Shape: {box_office_df.shape}",
    f"Columns: {list(box_office_df.columns)}",
    f"Year range: {box_office_df['Year'].min()} - {box_office_df['Year'].max()}",
    sep="\n",
)
box_office_df.head()

Box Office Dataset Overview:
Shape: (2000, 3)
Columns: ['Year', 'Title', 'Gross']
Year range: 2015 - 2024


Unnamed: 0,Year,Title,Gross
6145,2015,Star Wars: Episode VII - The Force Awakens,"$2,068,223,624"
6146,2015,Jurassic World,"$1,670,400,637"
6147,2015,Furious 7,"$1,515,047,671"
6148,2015,Avengers: Age of Ultron,"$1,402,805,868"
6149,2015,Minions,"$1,159,398,397"


## TMDB All Movies Dataset

In [18]:
# Report the filtered TMDB "all movies" frame's dimensions, column names and
# the span of release years, then preview the first rows as the cell output.
print(
    "TMDB All Movies Dataset Overview:",
    f"Shape: {tmdb_all_df.shape}",
    f"Columns: {list(tmdb_all_df.columns)}",
    f"Year range: {tmdb_all_df['release_date'].dt.year.min()} - {tmdb_all_df['release_date'].dt.year.max()}",
    sep="\n",
)
tmdb_all_df.head()

TMDB All Movies Dataset Overview:
Shape: (417542, 28)
Columns: ['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'budget', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'cast', 'director', 'director_of_photography', 'writers', 'producers', 'music_composer', 'imdb_rating', 'imdb_votes', 'poster_path']
Year range: 2015 - 2036


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,original_title,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path
1530,2203,Für ein paar Filme mehr...,0.0,0.0,Released,2017-01-01,0.0,30.0,0.0,,de,Für ein paar Filme mehr...,,0.6,,Documentary,,Germany,,,,,,,,,,
3043,5492,Gunner,5.3,190.0,Released,2024-08-16,0.0,90.0,20000000.0,tt12598606,en,Gunner,"While on a camping trip in order to reconnect,...",3.7734,Vengeance has a new name.,"Action, Thriller, Crime","magiCity Studios, 120dB Films, Acme Rocket Fuel",United States of America,English,"Joseph Baena, Will Koberg, Luke Hemsworth, Sea...",Dimitri Logothetis,Gerardo Madrazo,"Dimitri Logothetis, Gary Scott Thompson","Ford Corbett, Wayne Mogel, Joel Shapiro, Natha...",,3.2,3134.0,/cS2TXN1YlrCvkZmMxaevC1ZKtEz.jpg
3631,6636,The Mugger,5.4,5.0,Released,2017-04-01,0.0,67.0,0.0,tt1006823,es,El asaltante,"A man visits a school to enroll his son, but t...",2.012,,Drama,Magma Cine,Argentina,Español,"Guillermo Arengo, Maya Lesca, Arturo Goetz, Bá...",Pablo Fendrik,Cobi Migliora,Pablo Fendrik,Juan Pablo Gugliotta,,6.6,330.0,/nI7fpYioLcumBOIH0PMPDKxeqZc.jpg
5260,10148,Krystal,5.1,61.0,Released,2018-04-13,0.0,90.0,0.0,tt0835802,en,Krystal,A young man living a sheltered life develops a...,0.6873,She turns men into boys and boys into men.,"Drama, Comedy","Dog Pond Productions, Dorian Media, Pantry Fil...",United States of America,English,"William Fichtner, Rosario Dawson, Grant Gustin...",William H. Macy,Adam Silver,Will Aldis,"Jim Reeve, Dan Keston, Rachel Rothman, Robert ...",Dan Romer,5.5,1829.0,/5Do7HKvKPgjiBVJieMAOt8aZXAB.jpg
5415,10317,Our Brand Is Crisis,5.9,505.0,Released,2015-09-11,7002261.0,108.0,28000000.0,tt1018765,en,Our Brand Is Crisis,"Based on the documentary ""Our Brand Is Crisis""...",2.6655,May the best campaign win.,"Comedy, Drama","Participant, Smokehouse Pictures",United States of America,English,"Billy Bob Thornton, Octavio Gómez Berríos, Car...",David Gordon Green,Tim Orr,Peter Straughan,"George Clooney, Sandra Bullock, Stuart M. Bess...",David Wingo,6.1,23868.0,/niDI0NHB11nONifUg8sV6o0mzIk.jpg


## TMDB v11 Dataset (Main Dataset)

In [19]:
# Report the filtered TMDB v11 frame's dimensions, column names and the span
# of release years, then preview the first rows as the cell output.
print(
    "TMDB v11 Dataset Overview:",
    f"Shape: {tmdb_930k_df.shape}",
    f"Columns: {list(tmdb_930k_df.columns)}",
    f"Year range: {tmdb_930k_df['release_date'].dt.year.min()} - {tmdb_930k_df['release_date'].dt.year.max()}",
    sep="\n",
)
tmdb_930k_df.head()

TMDB v11 Dataset Overview:
Shape: (427318, 24)
Columns: ['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords']


Year range: 2015 - 2099


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
5,293660,Deadpool,7.606,28894,Released,2016-02-09,783100000,108,False,/en971MEXui9diirXlogOrPKmsEn.jpg,58000000,https://www.20thcenturystudios.com/movies/dead...,tt1431045,en,Deadpool,The origin story of former Special Forces oper...,72.735,/zq8Cl3PNIDGU3iWNRoc5nEZ6pCe.jpg,Witness the beginning of a happy ending.,"Action, Adventure, Comedy","20th Century Fox, The Donners' Company, Genre ...",United States of America,English,"superhero, anti hero, mercenary, based on comi..."
6,299536,Avengers: Infinity War,8.255,27713,Released,2018-04-25,2052415039,149,False,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,300000000,https://www.marvel.com/movies/avengers-infinit...,tt4154756,en,Avengers: Infinity War,As the Avengers and their allies have continue...,154.34,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,An entire universe. Once and for all.,"Adventure, Action, Science Fiction",Marvel Studios,United States of America,"English, Xhosa","sacrifice, magic, superhero, based on comic, s..."
15,299534,Avengers: Endgame,8.263,23857,Released,2019-04-24,2800000000,181,False,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,356000000,https://www.marvel.com/movies/avengers-endgame,tt4154796,en,Avengers: Endgame,After the devastating events of Avengers: Infi...,91.756,/or06FN3Dka5tukK1e9sl16pB3iy.jpg,Avenge the fallen.,"Adventure, Science Fiction, Action",Marvel Studios,United States of America,"English, Japanese, Xhosa","superhero, time travel, space travel, time mac..."
18,475557,Joker,8.168,23425,Released,2019-10-01,1074458282,122,False,/hO7KbdvGOtDdeg0W4Y5nKEHeDDh.jpg,55000000,http://www.jokermovie.net/,tt7286456,en,Joker,"During the 1980s, a failed stand-up comedian i...",54.522,/udDclJoHjfjb8Ekgsd4FDteOkCU.jpg,Put on a happy face.,"Crime, Thriller, Drama","Warner Bros. Pictures, Joint Effort, Village R...","Canada, United States of America",English,"dream, street gang, society, psychopath, clown..."
23,99861,Avengers: Age of Ultron,7.276,21754,Released,2015-04-22,1405403694,141,False,/6YwkGolwdOMNpbTOmLjoehlVWs5.jpg,365000000,http://marvel.com/movies/movie/193/avengers_ag...,tt2395427,en,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,96.565,/4ssDuvEDkSArWEdyBl2X5EHvYKU.jpg,A new age has come.,"Action, Adventure, Science Fiction",Marvel Studios,United States of America,English,"artificial intelligence (a.i.), superhero, bas..."


In [20]:
# Per-column dtype and non-null count for the 2015+ TMDB v11 frame.
tmdb_930k_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 427318 entries, 5 to 1290600
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   id                    427318 non-null  int64         
 1   title                 427316 non-null  object        
 2   vote_average          427318 non-null  float64       
 3   vote_count            427318 non-null  int64         
 4   status                427318 non-null  object        
 5   release_date          427318 non-null  datetime64[ns]
 6   revenue               427318 non-null  int64         
 7   runtime               427318 non-null  int64         
 8   adult                 427318 non-null  bool          
 9   backdrop_path         160390 non-null  object        
 10  budget                427318 non-null  int64         
 11  homepage              68879 non-null   object        
 12  imdb_id               171406 non-null  object        
 13  ori

In [21]:
# Summary statistics (count, mean, min, quartiles, max, std) for the
# 2015+ TMDB v11 frame.
tmdb_930k_df.describe()

Unnamed: 0,id,vote_average,vote_count,release_date,revenue,runtime,budget,popularity
count,427318.0,427318.0,427318.0,427318,427318.0,427318.0,427318.0,427318.0
mean,945003.8,1.737052,15.429107,2020-07-30 18:48:14.672351488,624564.3,47.729075,236960.0,1.306086
min,2203.0,0.0,0.0,2015-01-01 00:00:00,0.0,0.0,0.0,0.0
25%,663868.5,0.0,0.0,2018-02-10 00:00:00,0.0,2.0,0.0,0.6
50%,944669.5,0.0,0.0,2020-10-14 00:00:00,0.0,18.0,0.0,0.6
75%,1208665.0,2.5,1.0,2023-01-18 00:00:00,0.0,88.0,0.0,0.865
max,1549253.0,10.0,28894.0,2099-12-31 00:00:00,3000000000.0,13319.0,888000000.0,2994.357
std,335061.3,3.094737,286.076339,,20768570.0,62.949199,5857091.0,11.708126


## IMDB Movies Dataset

In [22]:
# Overview of the IMDB title.basics frame (rows with startYear >= 2015).
print("IMDB Movies Dataset Overview:")
print(f"Shape: {imdb_movies_df.shape}")
print(f"Columns: {list(imdb_movies_df.columns)}")
# startYear is float after pd.to_numeric(..., errors='coerce'); ':.0f' prints
# the bounds as "2015" instead of "2015.0" (and "nan" if the frame is empty).
print(f"Year range: {imdb_movies_df['startYear'].min():.0f} - {imdb_movies_df['startYear'].max():.0f}")
# Render the five most frequent title types as a plain dict. Interpolating the
# Series itself into the f-string would spill its index header and the
# "Name: ..., dtype: ..." footer across several lines.
print(f"Title types: {imdb_movies_df['titleType'].value_counts().head().to_dict()}")
imdb_movies_df.head()

IMDB Movies Dataset Overview:
Shape: (4843239, 9)
Columns: ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']
Year range: 2015.0 - 2032.0
Title types: titleType
tvEpisode    3781005
short         508320
movie         208271
tvSeries      117813
video         113243
Name: count, dtype: int64


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11631,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019.0,\N,\N,"Action,Crime"
38011,tt0038698,short,A Little Phantasy on a Nineteenth Century Pain...,A Little Phantasy on a Nineteenth Century Pain...,0,2025.0,\N,4,"Animation,Short"
55742,tt0056840,short,Aufsätze,Aufsätze,0,2021.0,\N,10,Short
56265,tt0057369,short,Number 14: Late Superimpositions,Number 14: Late Superimpositions,0,2023.0,\N,30,Short
59185,tt0060361,short,EMS nr 1,EMS nr 1,0,2016.0,\N,14,Short


## IMDB Ratings Dataset

In [23]:
# Report the filtered IMDB ratings frame's dimensions, column names, mean
# rating (two decimals) and rating bounds, then preview the first rows.
print(
    "IMDB Ratings Dataset Overview:",
    f"Shape: {imdb_ratings_df.shape}",
    f"Columns: {list(imdb_ratings_df.columns)}",
    f"Average rating: {imdb_ratings_df['averageRating'].mean():.2f}",
    f"Rating range: {imdb_ratings_df['averageRating'].min()} - {imdb_ratings_df['averageRating'].max()}",
    sep="\n",
)
imdb_ratings_df.head()

IMDB Ratings Dataset Overview:
Shape: (660971, 3)
Columns: ['tconst', 'averageRating', 'numVotes']
Average rating: 7.12
Rating range: 1.0 - 10.0


Unnamed: 0,tconst,averageRating,numVotes
21885,tt0038698,6.8,154
37461,tt0056840,6.8,97
37947,tt0057369,5.6,75
40598,tt0060361,6.5,6
42289,tt0062336,6.4,249


## Clean Dataset

In [27]:
# Start the working frame from the TMDB v11 data, keeping only the columns
# listed below. The trailing .copy() gives `df` its own data so later edits
# do not touch tmdb_930k_df.
df = tmdb_930k_df.loc[:, [
    'id', 'imdb_id', 'title', 'release_date',
    'runtime', 'budget', 'revenue', 'adult',
    'spoken_languages', 'genres',
    'production_companies', 'production_countries',
    'keywords',
]].copy()

In [28]:
# Row filters applied as a single pipeline. Each step is an independent,
# row-wise condition, so the outcome is the same as reassigning `df` after
# every filter.
df = (
    df
    # Adult (pornographic) films are irrelevant here.
    .loc[lambda rows: rows['adult'] == False]
    # Keep only rows where revenue AND budget are both positive, i.e. drop
    # anything with a zero in either column.
    .loc[lambda rows: (rows['revenue'] > 0) & (rows['budget'] > 0)]
    # Revenue below 1% of budget is taken to indicate a streaming release
    # with only a token theatrical run; drop those.
    .loc[lambda rows: rows['revenue'] >= (rows['budget'] * 0.01)]
    # Keep films that list English among their spoken languages (missing
    # values count as no match) - others are very unlikely to make a dent on
    # the US box office.
    .loc[lambda rows: rows['spoken_languages'].str.contains('English', case=False, na=False)]
    # Drop anything tagged with the 'TV Movie' genre.
    .loc[lambda rows: ~rows['genres'].str.contains('TV Movie', case=False, na=False)]
    # Drop entries whose keywords suggest they are TV shows.
    .loc[lambda rows: ~rows['keywords'].str.contains('tv series|television series|tv show|teen drama', case=False, na=False)]
)

In [29]:
# Export csv for use in feature-engineering.ipynb
# `df` comes out of boolean-mask filtering, so adding a column with
# `df[...] = ...` is the chained-assignment pattern that triggers
# SettingWithCopyWarning (hidden in this notebook by the blanket warnings
# filter). assign() builds a new frame with the extra column instead; the
# column is still appended last, so the exported schema is unchanged.
df = df.assign(release_year=pd.to_datetime(df['release_date']).dt.year)
df.to_csv("../data/dataset.csv", index=False)

# Confirm what was written: final (rows, columns) and the column order.
print(f"Exported dataset: {df.shape}")
print(f"Columns: {list(df.columns)}")

Exported dataset: (2241, 14)
Columns: ['id', 'imdb_id', 'title', 'release_date', 'runtime', 'budget', 'revenue', 'adult', 'spoken_languages', 'genres', 'production_companies', 'production_countries', 'keywords', 'release_year']
