In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [43]:
# Load each CSV into its own DataFrame. Paths are relative, so the files are
# expected to sit in the notebook's working directory.
annual_ticket_sales = pd.read_csv("AnnualTicketSales.csv")
highest_grossers = pd.read_csv("HighestGrossers.csv")
popular_creative_types = pd.read_csv("PopularCreativeTypes.csv")
top_distributors = pd.read_csv("TopDistributors.csv")
top_genres = pd.read_csv("TopGenres.csv")
top_grossing_ratings = pd.read_csv("TopGrossingRatings.csv")
top_grossing_sources = pd.read_csv("TopGrossingSources.csv")
top_production_methods = pd.read_csv("TopProductionMethods.csv")
wide_release_count = pd.read_csv("WideReleasesCount.csv")

### AnnualTicketSales.csv

In [87]:
# Preview the first rows to see the column names and the raw value formatting.
annual_ticket_sales.head()

Unnamed: 0,YEAR,TICKETS SOLD,TOTAL BOX OFFICE,TOTAL INFLATION ADJUSTED BOX OFFICE,AVERAGE TICKET PRICE,Unnamed: 5
0,2021,423774881,"$3,881,777,912","$3,881,777,912",$9.16,
1,2020,223638958,"$2,048,534,616","$2,048,534,616",$9.16,
2,2019,1228541629,"$11,253,443,955","$11,253,444,050",$9.16,
3,2018,1311536128,"$11,948,096,650","$12,013,670,952",$9.11,
4,2017,1225639761,"$10,993,991,460","$11,226,860,216",$8.97,


In [88]:
# Remove the blank trailing column that the CSV produced ("Unnamed: 5").
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second run is a no-op rather than a KeyError.
annual_ticket_sales = annual_ticket_sales.drop(columns=["Unnamed: 5"], errors="ignore")

In [89]:
# Summary statistics for every column; include="all" also covers the
# non-numeric (object) columns, which report count/unique/top/freq.
annual_ticket_sales.describe(include="all")

Unnamed: 0,YEAR,TICKETS SOLD,TOTAL BOX OFFICE,TOTAL INFLATION ADJUSTED BOX OFFICE,AVERAGE TICKET PRICE
count,27.0,27.0,27,27,27
unique,,27.0,27,27,25
top,,423774881.0,"$3,881,777,912","$3,881,777,912",$9.16
freq,,1.0,1,1,3
mean,2008.0,,,,
std,7.937254,,,,
min,1995.0,,,,
25%,2001.5,,,,
50%,2008.0,,,,
75%,2014.5,,,,


In [90]:
# Number of missing values in each column.
annual_ticket_sales.isna().sum()

YEAR                                   0
TICKETS SOLD                           0
TOTAL BOX OFFICE                       0
TOTAL INFLATION ADJUSTED BOX OFFICE    0
AVERAGE TICKET PRICE                   0
dtype: int64

In [91]:
# Column dtypes and non-null counts; shows which columns are still stored as text.
annual_ticket_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 5 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   YEAR                                 27 non-null     int64 
 1   TICKETS SOLD                         27 non-null     object
 2   TOTAL BOX OFFICE                     27 non-null     object
 3   TOTAL INFLATION ADJUSTED BOX OFFICE  27 non-null     object
 4   AVERAGE TICKET PRICE                 27 non-null     object
dtypes: int64(1), object(4)
memory usage: 1.2+ KB


In [92]:
# Strip thousands separators and currency symbols so the text columns can be
# parsed as numbers afterwards.
# regex=False is passed explicitly: "$" is a regex anchor, so relying on the
# default regex setting is version-dependent and raises a FutureWarning.
annual_ticket_sales["TICKETS SOLD"] = annual_ticket_sales["TICKETS SOLD"].str.replace(",", "", regex=False)

# All dollar-denominated columns get the same treatment; removing "," from the
# ticket price is harmless and keeps the handling uniform.
for money_col in ["TOTAL BOX OFFICE", "TOTAL INFLATION ADJUSTED BOX OFFICE", "AVERAGE TICKET PRICE"]:
    annual_ticket_sales[money_col] = (
        annual_ticket_sales[money_col]
        .str.replace(",", "", regex=False)
        .str.replace("$", "", regex=False)
    )


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



In [93]:
# Check that the separators and currency symbols were removed.
annual_ticket_sales.head()

Unnamed: 0,YEAR,TICKETS SOLD,TOTAL BOX OFFICE,TOTAL INFLATION ADJUSTED BOX OFFICE,AVERAGE TICKET PRICE
0,2021,423774881,3881777912,3881777912,9.16
1,2020,223638958,2048534616,2048534616,9.16
2,2019,1228541629,11253443955,11253444050,9.16
3,2018,1311536128,11948096650,12013670952,9.11
4,2017,1225639761,10993991460,11226860216,8.97


In [94]:
# Parse the cleaned text columns into numeric dtypes. Each entry maps a column
# to the extra keyword arguments passed to pd.to_numeric; only the ticket price
# is downcast to the smallest float type.
numeric_parsers = {
    "TICKETS SOLD": {},
    "TOTAL BOX OFFICE": {},
    "TOTAL INFLATION ADJUSTED BOX OFFICE": {},
    "AVERAGE TICKET PRICE": {"downcast": "float"},
}
for column_name, to_numeric_kwargs in numeric_parsers.items():
    annual_ticket_sales[column_name] = pd.to_numeric(
        annual_ticket_sales[column_name], **to_numeric_kwargs
    )

In [95]:
# Re-inspect the first rows after the numeric conversion.
annual_ticket_sales.head()

Unnamed: 0,YEAR,TICKETS SOLD,TOTAL BOX OFFICE,TOTAL INFLATION ADJUSTED BOX OFFICE,AVERAGE TICKET PRICE
0,2021,423774881,3881777912,3881777912,9.16
1,2020,223638958,2048534616,2048534616,9.16
2,2019,1228541629,11253443955,11253444050,9.16
3,2018,1311536128,11948096650,12013670952,9.11
4,2017,1225639761,10993991460,11226860216,8.97


In [96]:
# Rescaled helper columns for plotting: ticket price multiplied by 150,000,000
# and box office divided by 5.
# NOTE(review): the factors look hand-picked so that both lines land in the same
# numeric range as TICKETS SOLD on a shared y axis - confirm.
annual_ticket_sales["avg"] = annual_ticket_sales["AVERAGE TICKET PRICE"].mul(150_000_000)
annual_ticket_sales["box"] = annual_ticket_sales["TOTAL BOX OFFICE"].div(5)

In [98]:
# Overlay three yearly series in one chart: tickets sold (bars), the rescaled
# average ticket price (black line) and the rescaled total box office (green line).
#
# Only the traces (.data) of the three Plotly Express figures are reused below;
# their layouts are discarded. Titles and axis formatting are therefore set once
# on the combined figure rather than on the intermediate figures.

# template is kept here because Plotly Express takes the bar colour from it.
fig1 = px.bar(annual_ticket_sales,
            x="YEAR",
            y="TICKETS SOLD",
            template="simple_white")
# Plotly Express leaves single-series traces unnamed and out of the legend;
# name them so the legend configured below identifies each series.
fig1.update_traces(name="Tickets sold", showlegend=True)

# "avg" is the rescaled ticket price; hover_data adds the real price to the hover box.
fig2 = px.line(annual_ticket_sales,
            x="YEAR",
            y="avg",
            color_discrete_sequence=["black"],
            hover_data=["AVERAGE TICKET PRICE"])
fig2.update_traces(name="Average ticket price", showlegend=True)

# "box" is the rescaled box office; hover_data adds the real total to the hover box.
fig3 = px.line(annual_ticket_sales,
            x="YEAR",
            y="box",
            color_discrete_sequence=["ForestGreen"],
            hover_data=["TOTAL BOX OFFICE"])
fig3.update_traces(name="Total box office", showlegend=True)

fig = go.Figure(data=fig1.data + fig2.data + fig3.data)

# The three series share one y axis but are in different (rescaled) units,
# so the axis itself is hidden.
fig.update_yaxes(visible=False)

fig.update_layout(
    hovermode="x unified",
    template="simple_white",
    title="Ticket Sales and Total Box Office per Year",
    title_x=0.5,
    font_family="Rockwell",
    legend=dict(
        title=None, orientation="h", y=1, yanchor="bottom", x=0.5, xanchor="center"
    ),
)

# YEAR is an integer column, so the annotation position is given as a number.
fig.add_annotation(text="COVID-19", x=2020, arrowcolor="black", font_size=15, bgcolor="white")

fig.show()

### HighestGrossers.csv

In [3]:
# Preview the first rows to see the column names and the raw value formatting.
highest_grossers.head()

Unnamed: 0,YEAR,MOVIE,GENRE,MPAA RATING,DISTRIBUTOR,TOTAL FOR YEAR,TOTAL IN 2019 DOLLARS,TICKETS SOLD
0,1995,Batman Forever,Drama,PG-13,Warner Bros.,"$184,031,112","$387,522,978",42306002
1,1996,Independence Day,Adventure,PG-13,20th Century Fox,"$306,169,255","$634,504,608",69269062
2,1997,Men in Black,Adventure,PG-13,Sony Pictures,"$250,650,052","$500,207,943",54607854
3,1998,Titanic,Adventure,PG-13,Paramount Pictures,"$443,319,081","$865,842,808",94524324
4,1999,Star Wars Ep. I: The Phantom Menace,Adventure,PG,20th Century Fox,"$430,443,350","$776,153,749",84732942


In [6]:
# Number of missing values in each column.
highest_grossers.isna().sum()

YEAR                     0
MOVIE                    0
GENRE                    3
MPAA RATING              0
DISTRIBUTOR              0
TOTAL FOR YEAR           0
TOTAL IN 2019 DOLLARS    0
TICKETS SOLD             0
dtype: int64

In [27]:
# Column dtypes and non-null counts; shows which columns are still stored as text.
highest_grossers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   YEAR                   27 non-null     int64 
 1   MOVIE                  27 non-null     object
 2   GENRE                  27 non-null     object
 3   MPAA RATING            27 non-null     object
 4   DISTRIBUTOR            27 non-null     object
 5   TOTAL FOR YEAR         27 non-null     object
 6   TOTAL IN 2019 DOLLARS  27 non-null     object
 7   TICKETS SOLD           27 non-null     object
dtypes: int64(1), object(7)
memory usage: 1.8+ KB


In [47]:
# Show the rows that have no genre recorded.
highest_grossers.loc[highest_grossers["GENRE"].isna()]

Unnamed: 0,YEAR,MOVIE,GENRE,MPAA RATING,DISTRIBUTOR,TOTAL FOR YEAR,TOTAL IN 2019 DOLLARS,TICKETS SOLD,tickets_sold
24,2019,Avengers: Endgame,,PG-13,Walt Disney,"$858,373,000","$858,373,002",93708843,937088439370884393708843
25,2020,Bad Boys For Life,,R,Sony Pictures,"$204,417,855","$204,417,848",22316359,223163592231635922316359
26,2021,Shang-Chi and the Legend of the Ten Rings,,PG-13,Walt Disney,"$224,226,704","$224,226,704",24478897,244788972447889724478897


In [48]:
# Count how often each genre occurs, returned as a DataFrame.
# NOTE(review): presumably used to choose the fill value for the missing genres - confirm.
highest_grossers["GENRE"].value_counts().reset_index()

Unnamed: 0,index,GENRE
0,Adventure,14
1,Action,9
2,Drama,1


In [49]:
# Fill missing genres with "Adventure".
# The fill is restricted to the GENRE column: a frame-wide fillna would also
# write the genre name into any other column that ever contains a gap.
highest_grossers["GENRE"] = highest_grossers["GENRE"].fillna("Adventure")

In [50]:
# Set the genre of "Bad Boys For Life" to "Action".
# The row is selected by title instead of by the hard-coded row label 25, so the
# override still hits the right film if the rows are reordered or the CSV grows.
is_bad_boys = highest_grossers["MOVIE"] == "Bad Boys For Life"
assert is_bad_boys.any(), "expected a 'Bad Boys For Life' row in highest_grossers"
highest_grossers.loc[is_bad_boys, "GENRE"] = "Action"

In [51]:
# Inspect the last rows to check the genre values after the fill and override.
highest_grossers.tail()

Unnamed: 0,YEAR,MOVIE,GENRE,MPAA RATING,DISTRIBUTOR,TOTAL FOR YEAR,TOTAL IN 2019 DOLLARS,TICKETS SOLD,tickets_sold
22,2017,Star Wars Ep. VIII: The Last Jedi,Action,PG-13,Walt Disney,"$517,218,368","$528,173,936",57660910,576609105766091057660910
23,2018,Black Panther,Action,PG-13,Walt Disney,"$700,059,566","$703,901,821",76845177,768451777684517776845177
24,2019,Avengers: Endgame,Adventure,PG-13,Walt Disney,"$858,373,000","$858,373,002",93708843,937088439370884393708843
25,2020,Bad Boys For Life,Action,R,Sony Pictures,"$204,417,855","$204,417,848",22316359,223163592231635922316359
26,2021,Shang-Chi and the Legend of the Ten Rings,Adventure,PG-13,Walt Disney,"$224,226,704","$224,226,704",24478897,244788972447889724478897


In [52]:
# Strip thousands separators and currency symbols so the text columns can be
# parsed as numbers afterwards.
# regex=False is passed explicitly: "$" is a regex anchor, so relying on the
# default regex setting is version-dependent and raises a FutureWarning.
highest_grossers["TICKETS SOLD"] = highest_grossers["TICKETS SOLD"].str.replace(",", "", regex=False)

for money_col in ["TOTAL FOR YEAR", "TOTAL IN 2019 DOLLARS"]:
    highest_grossers[money_col] = (
        highest_grossers[money_col]
        .str.replace(",", "", regex=False)
        .str.replace("$", "", regex=False)
    )


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



In [53]:
# Parse the cleaned text columns into numeric dtypes, one column at a time.
for amount_col in ("TICKETS SOLD", "TOTAL FOR YEAR", "TOTAL IN 2019 DOLLARS"):
    highest_grossers[amount_col] = highest_grossers[amount_col].pipe(pd.to_numeric)

In [54]:
# Check the column dtypes after the numeric conversion.
highest_grossers.dtypes

YEAR                      int64
MOVIE                    object
GENRE                    object
MPAA RATING              object
DISTRIBUTOR              object
TOTAL FOR YEAR            int64
TOTAL IN 2019 DOLLARS     int64
TICKETS SOLD              int64
tickets_sold             object
dtype: object

In [124]:
# Rescaled helper columns for plotting: ticket count multiplied by 3 and the
# yearly total divided by 10.
# NOTE(review): the factors look hand-picked for the combined chart - confirm.
highest_grossers["tickets_sold"] = highest_grossers["TICKETS SOLD"].mul(3)
highest_grossers["total"] = highest_grossers["TOTAL FOR YEAR"].div(10)

In [128]:
# Build three Plotly Express layers over the same YEAR axis and stack their
# traces in a single figure:
#   fig1 - black line of the yearly total,
#   fig2 - markers at the same points, coloured by distributor and sized by "total",
#   fig3 - grey bars of the rescaled ticket count.
year_axis = dict(x="YEAR")

fig1 = px.line(
    highest_grossers,
    y="TOTAL FOR YEAR",
    color_discrete_sequence=["black"],
    **year_axis,
)

fig2 = px.scatter(
    highest_grossers,
    y="TOTAL FOR YEAR",
    color="DISTRIBUTOR",
    size="total",
    **year_axis,
)

fig3 = px.bar(
    highest_grossers,
    y="tickets_sold",
    color_discrete_sequence=["grey"],
    **year_axis,
)

# Only the traces are reused; the layouts of the intermediate figures are dropped.
fig = go.Figure(data=(*fig1.data, *fig2.data, *fig3.data))

# The layers are in different units, so the shared y axis is hidden.
fig.update_yaxes(visible=False, showticklabels=False)

fig.update_layout(template="simple_white")

fig.show()