# Box Office Dataset Exploration

In [None]:
import csv
import warnings

import numpy as np
import pandas as pd
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. chained-assignment
# warnings); consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## Load All Datasets

In [None]:
# Box Office Data — read the CSV and keep only rows whose Year is 2015 or later
box_office_df = (
    pd.read_csv("../data/boxoffice_data_2024.csv")
    .loc[lambda frame: frame['Year'] >= 2015]
)

# TMDB All Movies — parse release dates (unparseable values become NaT),
# then keep rows that have a valid date in 2015 or later
tmdb_all_df = (
    pd.read_csv("../data/TMDB_all_movies.csv")
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'))
    .loc[lambda frame: frame['release_date'].notna() & (frame['release_date'].dt.year >= 2015)]
)

# TMDB v11 Dataset — same treatment as the other TMDB file: coerce release dates
# to datetimes (bad values -> NaT) and retain only dated rows from 2015 onward
tmdb_930k_df = (
    pd.read_csv("../data/TMDB_movie_dataset_v11.csv")
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'))
    .loc[lambda frame: frame['release_date'].notna() & (frame['release_date'].dt.year >= 2015)]
)

# IMDB Data
# The titles file is tab-separated without any quoting convention, and titles may
# contain literal double-quote characters. With pandas' default quoting such a
# character opens a "quoted field" that can swallow the following tabs/newlines
# and silently merge rows, so quoting is disabled for this file.
imdb_movies_df = pd.read_csv(
    "../data/title.basics.tsv",
    sep='\t',
    low_memory=False,
    quoting=csv.QUOTE_NONE,
)
# Non-numeric year placeholders are coerced to NaN, which leaves startYear as float.
imdb_movies_df['startYear'] = pd.to_numeric(imdb_movies_df['startYear'], errors='coerce')
# Keep titles with a known start year of 2015 or later.
imdb_movies_df = imdb_movies_df[(imdb_movies_df['startYear'] >= 2015) & (imdb_movies_df['startYear'].notna())]

# Ratings: keep only rows whose title id survived the year filter above.
imdb_ratings_df = pd.read_csv("../data/title.ratings.tsv", sep='\t', low_memory=False)
valid_movie_ids = set(imdb_movies_df['tconst'])
imdb_ratings_df = imdb_ratings_df[imdb_ratings_df['tconst'].isin(valid_movie_ids)]

# Report (rows, columns) of every filtered frame, one per line
print(
    f"Box Office: {box_office_df.shape}",
    f"TMDB All: {tmdb_all_df.shape}",
    f"TMDB v11: {tmdb_930k_df.shape}",
    f"IMDB Movies: {imdb_movies_df.shape}",
    f"IMDB Ratings: {imdb_ratings_df.shape}",
    sep="\n",
)

## Box Office Dataset

In [None]:
# Text summary (dimensions, schema, covered years) followed by a preview of the first rows
print(
    "Box Office Dataset Overview:",
    f"Shape: {box_office_df.shape}",
    f"Columns: {list(box_office_df.columns)}",
    f"Year range: {box_office_df['Year'].min()} - {box_office_df['Year'].max()}",
    sep="\n",
)
box_office_df.head()

## TMDB All Movies Dataset

In [None]:
# Text summary (dimensions, schema, release-year span) followed by a preview of the first rows
print(
    "TMDB All Movies Dataset Overview:",
    f"Shape: {tmdb_all_df.shape}",
    f"Columns: {list(tmdb_all_df.columns)}",
    f"Year range: {tmdb_all_df['release_date'].dt.year.min()} - {tmdb_all_df['release_date'].dt.year.max()}",
    sep="\n",
)
tmdb_all_df.head()

## TMDB v11 Dataset (Main Dataset)

In [None]:
# Text summary (dimensions, schema, release-year span) followed by a preview of the first rows
print(
    "TMDB v11 Dataset Overview:",
    f"Shape: {tmdb_930k_df.shape}",
    f"Columns: {list(tmdb_930k_df.columns)}",
    f"Year range: {tmdb_930k_df['release_date'].dt.year.min()} - {tmdb_930k_df['release_date'].dt.year.max()}",
    sep="\n",
)
tmdb_930k_df.head()

In [None]:
# Print the per-column dtype / non-null summary of the filtered TMDB v11 frame
tmdb_930k_df.info()

In [None]:
# Show descriptive statistics for the filtered TMDB v11 frame
tmdb_930k_df.describe()

## IMDB Movies Dataset

In [None]:
# Text summary of the IMDB titles frame followed by a preview of the first rows
print("IMDB Movies Dataset Overview:")
print(f"Shape: {imdb_movies_df.shape}")
print(f"Columns: {list(imdb_movies_df.columns)}")
# startYear is produced by pd.to_numeric(..., errors='coerce') and is therefore a
# float column; format without decimals so the range reads "2015 - 2031" rather
# than "2015.0 - 2031.0".
print(f"Year range: {imdb_movies_df['startYear'].min():.0f} - {imdb_movies_df['startYear'].max():.0f}")
# Five most frequent title types (value_counts sorts by frequency, descending)
print(f"Title types: {imdb_movies_df['titleType'].value_counts().head()}")
imdb_movies_df.head()

## IMDB Ratings Dataset

In [None]:
# Text summary (dimensions, schema, mean and span of averageRating) followed by a preview
print(
    "IMDB Ratings Dataset Overview:",
    f"Shape: {imdb_ratings_df.shape}",
    f"Columns: {list(imdb_ratings_df.columns)}",
    f"Average rating: {imdb_ratings_df['averageRating'].mean():.2f}",
    f"Rating range: {imdb_ratings_df['averageRating'].min()} - {imdb_ratings_df['averageRating'].max()}",
    sep="\n",
)
imdb_ratings_df.head()

## Revenue and Budget Analysis

In [None]:
# Keep only titles that report both a positive revenue and a positive budget.
# The explicit .copy() makes the subset an independent frame, so adding the 'roi'
# column below is a plain assignment rather than a write onto a filtered slice of
# tmdb_930k_df (which triggers pandas' chained-assignment warning).
movies_with_revenue = tmdb_930k_df[(tmdb_930k_df['revenue'] > 0) & (tmdb_930k_df['budget'] > 0)].copy()
print(f"Movies with revenue and budget data: {len(movies_with_revenue)}")

# 'roi' is the gross multiple revenue / budget (1.0 means revenue equals budget).
# NOTE(review): the mean is sensitive to rows with very small budgets — the median
# is likely the more representative figure; confirm against the data.
movies_with_revenue['roi'] = movies_with_revenue['revenue'] / movies_with_revenue['budget']
print(f"Average ROI: {movies_with_revenue['roi'].mean():.2f}")
print(f"Median ROI: {movies_with_revenue['roi'].median():.2f}")

In [None]:
# Ten rows with the largest revenue, reduced to the columns of interest
print("Top 10 highest grossing movies:")
top_movies = (
    movies_with_revenue
    .nlargest(10, 'revenue')
    .loc[:, ['title', 'revenue', 'budget', 'roi']]
)
display(top_movies)

## Missing Data Analysis

In [None]:
# Per-column null counts and their share of all rows (in percent)
missing_data = tmdb_930k_df.isna().sum()
missing_pct = (missing_data / len(tmdb_930k_df)) * 100

# Tabulate only the columns that actually have gaps, worst first
missing_df = (
    pd.DataFrame({'Missing': missing_data, 'Percentage': missing_pct})
    .loc[lambda table: table['Missing'] > 0]
    .sort_values('Missing', ascending=False)
)
display(missing_df)

## Export Clean Dataset

In [None]:
# Project the TMDB v11 frame onto the export columns and derive release_year
# from release_date, then write the result to CSV without the index.
df = (
    tmdb_930k_df
    .loc[:, [
        'id', 'imdb_id', 'title', 'release_date', 'runtime', 'budget', 'revenue',
        'adult', 'spoken_languages', 'genres', 'production_companies',
        'production_countries', 'keywords',
    ]]
    .assign(release_year=lambda frame: pd.to_datetime(frame['release_date']).dt.year)
)
df.to_csv("../data/dataset.csv", index=False)

print(
    f"Exported dataset: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)