In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [14]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

In [28]:
path = kagglehub.dataset_download("jainaru/marvel-movies-box-office-data")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\injik\.cache\kagglehub\datasets\jainaru\marvel-movies-box-office-data\versions\1


In [32]:
for file in os.listdir(path):
    if file.endswith(".csv"):
        csv_path = os.path.join(path, file)
        break

In [33]:
df = pd.read_csv(csv_path)

In [36]:
df.head(10)

Unnamed: 0,movie,category,year,worldwide gross ($m),% budget recovered,critics % score,audience % score,audience vs critics % deviance,budget,domestic gross ($m),international gross ($m),opening weekend ($m),second weekend ($m),1st vs 2nd weekend drop off,% gross from opening weekend,% gross from domestic,% gross from international,% budget opening weekend
0,Ant-Man,Ant-Man,2015,518,398%,83%,85%,-2%,130.0,180,338,57.0,24.0,-58%,31.8,34.70%,65.30%,43.80%
1,Ant-Man & The Wasp,Ant-Man,2018,623,479%,87%,80%,7%,130.0,216,406,75.8,29.0,-62%,35.0,34.70%,65.20%,58.30%
2,Avengers: Age of Ultron,Avengers,2015,1395,382%,76%,82%,-6%,365.0,459,936,191.0,77.0,-60%,41.7,32.90%,67.10%,52.30%
3,Avengers: End Game,Avengers,2019,2797,699%,94%,90%,4%,400.0,858,1939,357.0,147.0,-59%,41.6,30.70%,69.30%,89.30%
4,Avengers: Infinity War,Avengers,2018,2048,683%,85%,91%,-6%,300.0,678,1369,257.0,114.0,-56%,38.0,33.10%,66.80%,85.70%
5,Black Panther,Black Panther,2018,1336,668%,96%,79%,17%,200.0,700,636,202.0,111.0,-45%,28.9,52.40%,47.60%,101.00%
6,Black Panther 2,Black Panther,2022,855,342%,84%,94%,-10%,250.0,453,401,181.0,66.0,-64%,48.6,53.00%,46.90%,72.40%
7,Black Widow,Unique,2021,379,190%,79%,80%,-1%,200.0,183,196,80.3,25.8,-68%,43.8,48.30%,51.70%,40.20%
8,Captain America,Captain America,2011,370,264%,79%,75%,4%,140.0,176,193,65.0,25.0,-62%,36.8,47.60%,52.20%,46.40%
9,Captain America: Civil War,Captain America,2016,1151,460%,90%,89%,1%,250.0,408,743,179.0,72.6,-59%,43.9,35.40%,64.60%,71.60%


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   movie                           36 non-null     object 
 1   category                        36 non-null     object 
 2   year                            36 non-null     int64  
 3   worldwide gross ($m)            36 non-null     int64  
 4   % budget recovered              36 non-null     object 
 5   critics % score                 36 non-null     object 
 6   audience % score                36 non-null     object 
 7   audience vs critics % deviance  36 non-null     object 
 8   budget                          36 non-null     float64
 9   domestic gross ($m)             36 non-null     int64  
 10  international gross ($m)        36 non-null     int64  
 11  opening weekend ($m)            36 non-null     float64
 12  second weekend ($m)             36 non

In [38]:
df.isnull().sum()

movie                             0
category                          0
year                              0
worldwide gross ($m)              0
% budget recovered                0
critics % score                   0
audience % score                  0
audience vs critics % deviance    0
budget                            0
domestic gross ($m)               0
international gross ($m)          0
opening weekend ($m)              0
second weekend ($m)               0
1st vs 2nd weekend drop off       0
% gross from opening weekend      0
% gross from domestic             0
% gross from international        0
% budget opening weekend          0
dtype: int64

In [39]:
df.describe()

Unnamed: 0,year,worldwide gross ($m),budget,domestic gross ($m),international gross ($m),opening weekend ($m),second weekend ($m),% gross from opening weekend
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,2017.055556,897.194444,194.347222,360.555556,536.194444,132.888889,55.047222,33.664689
std,4.368357,525.392469,68.063187,179.884001,362.294105,66.307788,28.553343,11.68497
min,2008.0,265.0,58.0,134.0,130.0,55.0,22.1,0.1597
25%,2014.0,568.25,150.0,215.75,297.0,84.0,34.525,31.95
50%,2017.5,776.5,183.0,333.5,440.5,119.0,47.8,36.6
75%,2021.0,1129.75,231.25,408.75,652.5,175.25,66.475,41.625
max,2023.0,2797.0,400.0,858.0,1939.0,357.0,147.0,48.6


This is the link of the Kaggle page I got this dataset from:
https://www.kaggle.com/datasets/jainaru/marvel-movies-box-office-data

As you can see, I chose a dataset containing all the Marvel movies along with their box office records. Why did I? Well, mainly because I have been a huge fan of these movies since my earliest childhood, and it's the first thing that came to my mind for this homework. However, I think there are some interesting calculations that can be done with this particular dataset here, such as trend analysis, revenue breakdown, distribution insights, etc. But despite all of this, this dataset does have its problems, one of them being the mixed and incosistent data types. For example, critics and audience score in this dataset includes a percentage sign (%), which means that they are not stored as numeric values, but as strings instead, which may cause a lot of difficulties, as models can't process non-numeric strigns. Fortunately, there is a solution of this, as we can strip the %, divide appropriately and ensure consistency. So let's do it now...

In [42]:
#the columns with the % that we need to clean
percentage_columns = [
    '% budget recovered', 
    'critics % score', 
    'audience % score', 
    'audience vs critics % deviance', 
    '% gross from opening weekend',
    '% gross from domestic',
    '% gross from international',
    '% budget opening weekend'
]

#checking if the column has string values, and removing the %
for c in percentage_columns:
    if df[c].dtype == 'object':
        df[c] = df[c].str.replace('%', '', regex=False)

#converting to float
df[c] = pd.to_numeric(df[c], errors='coerce') / 100

df.head()

Unnamed: 0,movie,category,year,worldwide gross ($m),% budget recovered,critics % score,audience % score,audience vs critics % deviance,budget,domestic gross ($m),international gross ($m),opening weekend ($m),second weekend ($m),1st vs 2nd weekend drop off,% gross from opening weekend,% gross from domestic,% gross from international,% budget opening weekend
0,Ant-Man,Ant-Man,2015,518,3.98,0.83,0.85,-0.02,130.0,180,338,57.0,24.0,-58%,31.8,34.7,65.3,0.00438
1,Ant-Man & The Wasp,Ant-Man,2018,623,4.79,0.87,0.8,0.07,130.0,216,406,75.8,29.0,-62%,35.0,34.7,65.2,0.00583
2,Avengers: Age of Ultron,Avengers,2015,1395,3.82,0.76,0.82,-0.06,365.0,459,936,191.0,77.0,-60%,41.7,32.9,67.1,0.00523
3,Avengers: End Game,Avengers,2019,2797,6.99,0.94,0.9,0.04,400.0,858,1939,357.0,147.0,-59%,41.6,30.7,69.3,0.00893
4,Avengers: Infinity War,Avengers,2018,2048,6.83,0.85,0.91,-0.06,300.0,678,1369,257.0,114.0,-56%,38.0,33.1,66.8,0.00857


As we can see, the problem is fixed, as now the columns don't have the percentage sign anymore, and the values are numeric.