# Set up

In [20]:
import pandas as pd
from IPython.display import display
import numpy as np
import warnings
import plotly.express as px
from eda_support_functions import *
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Suppress all warnings
warnings.filterwarnings("ignore")

In [21]:
movie_input = pd.read_csv(
    "/Users/iliasx/Documents/GitHub/box-office-prediction/data/processed_data/movie.csv"
)
genres = pd.read_csv(
    "/Users/iliasx/Documents/GitHub/box-office-prediction/data/processed_data/genre.csv"
)
keywords = pd.read_csv(
    "/Users/iliasx/Documents/GitHub/box-office-prediction/data/processed_data/keyword.csv"
)
production_companies = pd.read_csv(
    "/Users/iliasx/Documents/GitHub/box-office-prediction/data/processed_data/production.csv"
)

In [22]:
movie = movie_input[
    [
        "movie_id",
        "imdb_id",
        "year",
        "month",
        "release_date",
        "quarter",
        "original_title",
        "is_released__scope",
        "is_released__US",
        "days_from_us_release",
        "runtime",
        "revenue_usd_adj",
        "budget_usd_adj",
        "surplus",
        "ratio_adj",
        "roi",
        "is_first_released_in_cinemas",
        "is_first_released_in_cinemas_safe",
        "release_category",
        "ageCert",
    ]
]

In [23]:
movie.groupby("release_category").size()

release_category
Close streaming release           255
Far streaming release            8055
Not released in major markets    1772
Streaming release                2045
dtype: int64

# Basic Understanding of the Data

In [24]:
movie.head(10)

Unnamed: 0,movie_id,imdb_id,year,month,release_date,quarter,original_title,is_released__scope,is_released__US,days_from_us_release,runtime,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi,is_first_released_in_cinemas,is_first_released_in_cinemas_safe,release_category,ageCert
0,340666,tt4550098,2016,11,2016-11-04,4,Nocturnal Animals,True,True,74.0,117,38482560.0,28564980.0,-9323700.0,1.347194,-0.326403,True,True,Far streaming release,R
1,242911,tt0366180,2006,12,2006-12-14,4,స్టాలిన్,False,False,-1.0,168,9593046.0,5481741.0,-685217.6,1.75,-0.125,False,False,Not released in major markets,U
2,179826,tt1767354,2013,1,2013-01-29,1,Odd Thomas,True,True,-21.0,100,1503213.0,35315330.0,-34563730.0,0.042565,-0.978717,False,False,Streaming release,PG13
3,340676,tt4714782,2016,12,2016-12-14,4,Personal Shopper,True,True,63.0,106,3606053.0,7617328.0,-5814301.0,0.473401,-0.763299,True,True,Far streaming release,R
4,505058,tt4761916,2018,7,2018-07-19,3,Unfriended: Dark Web,True,True,78.0,93,19414960.0,1213435.0,8494044.0,16.0,7.0,True,True,Far streaming release,R
5,14351,tt0473024,2006,9,2006-09-01,3,Crossover,True,True,15828.0,95,10594540.0,8766228.0,-3468957.0,1.208563,-0.395718,True,True,Far streaming release,PG13
6,14347,tt0086973,1984,2,1984-02-17,1,Blame It on Rio,True,True,24060.0,100,54677940.0,26393820.0,945148.1,2.071619,0.035809,True,True,Far streaming release,R
7,579245,tt8655470,2019,10,2019-10-23,4,Hors Normes,True,False,-1.0,114,23106960.0,15255540.0,-3702062.0,1.51466,-0.24267,False,False,Streaming release,U
8,442000,tt6167894,2017,3,2017-03-03,1,അങ്കമാലി ഡയറീസ്,False,False,-1.0,132,3853526.0,758274.4,1168488.0,5.081967,1.540984,False,False,Not released in major markets,U
9,276496,tt2192016,2014,7,2014-07-17,3,The Dark Horse,True,True,12670.0,124,2396720.0,2702902.0,-1504542.0,0.886721,-0.55664,True,True,Far streaming release,U


In [25]:
null_percentage_all = movie.isnull().mean() * 100

zero_percentage_all = (movie == 0).mean() * 100

null_zero_percentage_all = pd.DataFrame(
    {"Null Percentage": null_percentage_all, "Zero Percentage": zero_percentage_all}
)
print(null_zero_percentage_all)

                                   Null Percentage  Zero Percentage
movie_id                                  0.000000         0.000000
imdb_id                                   0.024738         0.000000
year                                      0.000000         0.000000
month                                     0.000000         0.000000
release_date                              0.000000         0.000000
quarter                                   0.000000         0.000000
original_title                            0.000000         0.000000
is_released__scope                        0.000000        14.612023
is_released__US                           0.000000        29.504412
days_from_us_release                      0.000000         0.313350
runtime                                   0.000000         0.000000
revenue_usd_adj                           0.000000         0.000000
budget_usd_adj                            0.000000         0.000000
surplus                                   0.0000

In [26]:
movie[movie.budget_usd_adj != 0][
    ["revenue_usd_adj", "budget_usd_adj", "surplus", "ratio_adj", "roi"]
].describe(
    percentiles=[0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 0.99, 0.999, 0.9999]
).drop("count").style.format(
    {
        "revenue_usd_adj": "${:,.0f}",
        "budget_usd_adj": "${:,.0f}",
        "surplus": "${:,.0f}",
        "ratio_adj": "{:.2f}",
        "roi": "{:.2f}",
    }
)

Unnamed: 0,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
mean,"$75,476,820","$29,921,482","$7,816,928",605.24,301.62
std,"$192,073,833","$43,785,666","$76,017,060",42530.53,21265.26
min,$0,$0,"$-631,614,969",0.0,-1.0
1%,$381,"$22,764","$-89,509,623",0.0,-1.0
5%,"$52,671","$687,223","$-44,264,279",0.01,-0.99
10%,"$266,722","$1,630,876","$-29,319,073",0.05,-0.97
25%,"$2,372,800","$5,078,219","$-12,336,844",0.31,-0.85
50%,"$14,209,684","$14,166,196","$-2,628,055",1.14,-0.43
75%,"$62,380,577","$36,739,059","$4,638,672",2.96,0.48
90%,"$195,231,890","$75,393,187","$41,351,457",6.66,2.33


In [27]:
print("Smallest ratio movies")
display(
    movie.sort_values(by="ratio_adj")
    .head(10)[
        [
            "movie_id",
            "original_title",
            "revenue_usd_adj",
            "budget_usd_adj",
            "surplus",
            "ratio_adj",
            "roi",
        ]
    ]
    .style.format(
        {
            "revenue_usd_adj": "${:,.0f}",
            "budget_usd_adj": "${:,.0f}",
            "surplus": "${:,.0f}",
            "ratio_adj": "{:.2f}",
            "roi": "{:.2f}",
        }
    )
)

print("Biggest ratio movies")
display(
    movie[movie.budget_usd_adj != 0]
    .sort_values(by="ratio_adj", ascending=False)
    .head(10)[
        [
            "movie_id",
            "original_title",
            "revenue_usd_adj",
            "budget_usd_adj",
            "surplus",
            "ratio_adj",
            "roi",
        ]
    ]
    .style.format(
        {
            "revenue_usd_adj": "${:,.0f}",
            "budget_usd_adj": "${:,.0f}",
            "surplus": "${:,.0f}",
            "ratio_adj": "{:.2f}",
            "roi": "{:.2f}",
        }
    )
)

Smallest ratio movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
4167,198701,Elephant Tales,$2,"$11,335,640","$-11,335,639",0.0,-1.0
7628,46943,The Point Men,$2,"$10,495,100","$-10,495,099",0.0,-1.0
5735,92493,Edwin Boyd: Citizen Gangster,$1,"$6,772,992","$-6,772,991",0.0,-1.0
9811,228331,A Perfect Man,$1,"$6,539,876","$-6,539,876",0.0,-1.0
9073,334532,100 Streets,$1,"$4,763,807","$-4,763,807",0.0,-1.0
6941,214251,Història de la meva mort,$1,"$2,310,282","$-2,310,282",0.0,-1.0
6440,95023,The Frog Prince,$3,"$3,613,279","$-3,613,277",0.0,-1.0
4645,4593,Le Charme discret de la bourgeoisie,$7,"$5,831,617","$-5,831,614",0.0,-1.0
3013,280422,Все и сразу,$1,"$965,322","$-965,321",0.0,-1.0
7786,98544,Obsession,$28,"$8,732,892","$-8,732,878",0.0,-1.0


Biggest ratio movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
456,34707,Darling,"$1,082,161",$0,"$541,080",4493069.2,2246533.6
906,80539,పంజా,"$7,665,368",$8,"$3,832,676",943129.3,471563.65
9711,168490,Daraar,"$5,992,656",$8,"$2,996,320",771448.66,385723.33
7396,46660,I Love You Too,"$2,934,449",$8,"$1,467,216",350000.0,174999.0
4156,20034,Hush,"$488,565",$1,"$244,281",345221.27,172609.64
2681,119193,Ce que le jour doit à la nuit,"$2,654,268",$23,"$1,327,111",117647.06,58822.53
11591,13701,Immortal Beloved,"$20,354,587",$247,"$10,177,047",82500.0,41249.0
9895,11537,蛇形刁手,"$14,020,031",$430,"$7,009,585",32608.7,16303.35
11960,33261,The FJ Holden,"$3,569,941",$133,"$1,784,837",26792.45,13395.23
1084,249677,Timepass,"$6,776,352",$257,"$3,387,919",26324.19,13161.09


In [28]:
print("Movies with ratio_adj > more than 10 and less than 0.1")
display(
    movie[(movie.ratio_adj > 10) | (movie.ratio_adj < 0.1)][
        ["movie_id", "original_title", "revenue_usd_adj", "budget_usd_adj", "ratio_adj"]
    ]
)

Movies with ratio_adj > more than 10 and less than 0.1


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,ratio_adj
2,179826,Odd Thomas,1.503213e+06,3.531533e+07,0.042565
4,505058,Unfriended: Dark Web,1.941496e+07,1.213435e+06,16.000000
12,14353,Repo! The Genetic Opera,2.662405e+05,1.202941e+07,0.022132
13,340674,Dark Crimes,2.693487e+04,5.688104e+06,0.004735
24,33613,Luftslottet som sprängdes,6.288467e+07,5.681109e+06,11.069084
...,...,...,...,...,...
12112,14330,Trojan War,5.866225e+02,2.847682e+07,0.000021
12113,14337,Primer,8.798065e+05,1.129123e+04,77.919429
12115,416234,Winchester,5.341490e+07,4.247022e+06,12.577025
12122,276478,The Kitchen Toto,2.622214e+05,8.567825e+06,0.030605


Example of outlier, indeed a wrong imdb data entry

https://www.imdb.com/title/tt1334328/


movie_id	original_title	revenue_usd_adj	budget_usd_adj	ratio_adj

53128	Ψυχή Βαθιά	2.903065e+06	4.092371	7.093847e+0

In [29]:
print("Smallest ROI movies")
display(
    movie.sort_values(by="roi")
    .head(15)[
        [
            "movie_id",
            "original_title",
            "revenue_usd_adj",
            "budget_usd_adj",
            "surplus",
            "ratio_adj",
            "roi",
        ]
    ]
    .style.format(
        {
            "revenue_usd_adj": "${:,.0f}",
            "budget_usd_adj": "${:,.0f}",
            "surplus": "${:,.0f}",
            "ratio_adj": "{:.2f}",
            "roi": "{:.2f}",
        }
    )
)

print("Biggest ROI movies")
display(
    movie[movie.budget_usd_adj != 0]
    .sort_values(by="roi", ascending=False)
    .head(15)[
        [
            "movie_id",
            "original_title",
            "revenue_usd_adj",
            "budget_usd_adj",
            "surplus",
            "ratio_adj",
            "roi",
        ]
    ]
    .style.format(
        {
            "revenue_usd_adj": "${:,.0f}",
            "budget_usd_adj": "${:,.0f}",
            "surplus": "${:,.0f}",
            "ratio_adj": "{:.2f}",
            "roi": "{:.2f}",
        }
    )
)

Smallest ROI movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
4167,198701,Elephant Tales,$2,"$11,335,640","$-11,335,639",0.0,-1.0
7628,46943,The Point Men,$2,"$10,495,100","$-10,495,099",0.0,-1.0
9811,228331,A Perfect Man,$1,"$6,539,876","$-6,539,876",0.0,-1.0
5735,92493,Edwin Boyd: Citizen Gangster,$1,"$6,772,992","$-6,772,991",0.0,-1.0
9073,334532,100 Streets,$1,"$4,763,807","$-4,763,807",0.0,-1.0
6941,214251,Història de la meva mort,$1,"$2,310,282","$-2,310,282",0.0,-1.0
6440,95023,The Frog Prince,$3,"$3,613,279","$-3,613,277",0.0,-1.0
4645,4593,Le Charme discret de la bourgeoisie,$7,"$5,831,617","$-5,831,614",0.0,-1.0
3013,280422,Все и сразу,$1,"$965,322","$-965,321",0.0,-1.0
7786,98544,Obsession,$28,"$8,732,892","$-8,732,878",0.0,-1.0


Biggest ROI movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
456,34707,Darling,"$1,082,161",$0,"$541,080",4493069.2,2246533.6
906,80539,పంజా,"$7,665,368",$8,"$3,832,676",943129.3,471563.65
9711,168490,Daraar,"$5,992,656",$8,"$2,996,320",771448.66,385723.33
7396,46660,I Love You Too,"$2,934,449",$8,"$1,467,216",350000.0,174999.0
4156,20034,Hush,"$488,565",$1,"$244,281",345221.27,172609.64
2681,119193,Ce que le jour doit à la nuit,"$2,654,268",$23,"$1,327,111",117647.06,58822.53
11591,13701,Immortal Beloved,"$20,354,587",$247,"$10,177,047",82500.0,41249.0
9895,11537,蛇形刁手,"$14,020,031",$430,"$7,009,585",32608.7,16303.35
11960,33261,The FJ Holden,"$3,569,941",$133,"$1,784,837",26792.45,13395.23
1084,249677,Timepass,"$6,776,352",$257,"$3,387,919",26324.19,13161.09


In [30]:
print("Smallest Surplus movies")
display(
    movie.sort_values(by="surplus")
    .head(10)[
        [
            "movie_id",
            "original_title",
            "revenue_usd_adj",
            "budget_usd_adj",
            "surplus",
            "ratio_adj",
            "roi",
        ]
    ]
    .style.format(
        {
            "revenue_usd_adj": "${:,.0f}",
            "budget_usd_adj": "${:,.0f}",
            "surplus": "${:,.0f}",
            "ratio_adj": "{:.2f}",
            "roi": "{:.2f}",
        }
    )
)

print("Biggest Surplus movies")
display(
    movie[movie.budget_usd_adj != 0]
    .sort_values(by="surplus", ascending=False)
    .head(10)[
        [
            "movie_id",
            "original_title",
            "revenue_usd_adj",
            "budget_usd_adj",
            "surplus",
            "ratio_adj",
            "roi",
        ]
    ]
    .style.format(
        {
            "revenue_usd_adj": "${:,.0f}",
            "budget_usd_adj": "${:,.0f}",
            "surplus": "${:,.0f}",
            "ratio_adj": "{:.2f}",
            "roi": "{:.2f}",
        }
    )
)

Smallest Surplus movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
4974,267268,Más negro que la noche,"$7,851,287","$635,540,613","$-631,614,969",0.01,-0.99
2295,37848,刺陵,"$515,560,607","$710,138,577","$-452,358,274",0.73,-0.64
9703,312408,一步之遥,"$104,753,513","$385,221,057","$-332,844,300",0.27,-0.86
1492,449927,追龍,"$109,348,068","$248,614,556","$-193,940,522",0.44,-0.78
7770,398978,The Irishman,"$1,154,717","$189,502,411","$-188,925,053",0.01,-1.0
2409,1408,Cutthroat Island,"$31,989,711","$195,936,982","$-179,942,126",0.16,-0.92
10056,11692,The Adventures of Pluto Nash,"$12,032,211","$169,372,985","$-163,356,879",0.07,-0.96
2998,1911,The 13th Warrior,"$112,843,805","$219,473,229","$-163,051,327",0.51,-0.74
5275,466980,大护法,"$16,167,694","$168,700,927","$-160,617,080",0.1,-0.95
9189,10935,Heaven's Gate,"$12,885,208","$162,704,951","$-156,262,348",0.08,-0.96


Biggest Surplus movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
4849,11,Star Wars,"$3,898,767,715","$55,308,944","$1,894,074,914",70.49,34.25
1632,597,Titanic,"$4,298,409,952","$379,690,966","$1,769,514,010",11.32,4.66
4130,19995,Avatar,"$4,152,472,877","$336,605,686","$1,739,630,753",12.34,5.17
7332,9552,The Exorcist,"$3,028,532,995","$82,351,892","$1,431,914,606",36.78,17.39
7205,9461,Enter the Dragon,"$2,745,063,063","$5,833,259","$1,366,698,273",470.59,234.29
1597,578,Jaws,"$2,665,593,130","$39,645,242","$1,293,151,323",67.24,32.62
1948,299534,Avengers: Endgame,"$3,336,480,881","$424,294,707","$1,243,945,733",7.86,2.93
1649,601,E.T. the Extra-Terrestrial,"$2,503,815,272","$33,154,104","$1,218,753,533",75.52,36.76
1309,329,Jurassic Park,"$2,326,039,361","$132,845,855","$1,030,173,826",17.51,7.75
2975,1891,The Empire Strikes Back,"$1,990,916,951","$66,561,117","$928,897,359",29.91,13.96


In [31]:
print("Basic statistics per is_released__US flag for the ratio and the ROI: ")
display(
    movie.groupby("is_released__US")[["ratio_adj", "roi"]]
    .describe()
    .style.format("{:,.2f}")
)

Basic statistics per is_released__US flag for the ratio and the ROI: 


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
is_released__US,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
False,3578.0,377.55,14313.59,0.0,0.08,0.58,2.1,771448.66,3578.0,187.78,7156.8,-1.0,-0.96,-0.71,0.05,385723.33
True,8549.0,700.54,49801.84,0.0,0.47,1.39,3.33,4493069.2,8549.0,349.27,24900.92,-1.0,-0.77,-0.31,0.67,2246533.6


In [32]:
print("Basic statistics per release_category for the ratio:")
display(
    movie.groupby("release_category")[["ratio_adj"]].describe().style.format("{:,.2f}")
)

Basic statistics per release_category for the ratio:


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
release_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Close streaming release,255.0,2.87,3.21,0.0,0.82,2.07,3.53,22.67
Far streaming release,8055.0,743.34,51306.13,0.0,0.47,1.38,3.34,4493069.2
Not released in major markets,1772.0,688.43,20143.91,0.0,0.08,0.81,2.5,771448.66
Streaming release,2045.0,64.33,2606.13,0.0,0.11,0.52,1.72,117647.06


In [33]:
print("Basic statistics per release_category for the ROI:")
display(movie.groupby("release_category")[["roi"]].describe().style.format("{:,.2f}"))

Basic statistics per release_category for the ROI:


Unnamed: 0_level_0,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
release_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Close streaming release,255.0,0.43,1.61,-1.0,-0.59,0.04,0.76,10.33
Far streaming release,8055.0,370.67,25653.06,-1.0,-0.77,-0.31,0.67,2246533.6
Not released in major markets,1772.0,343.22,10071.95,-1.0,-0.96,-0.59,0.25,385723.33
Streaming release,2045.0,31.16,1303.07,-1.0,-0.95,-0.74,-0.14,58822.53


In [34]:
print("Basic statistics per release_category for the ratio and the ROI: ")
display(
    movie.groupby(["release_category"])[["ratio_adj", "roi"]]
    .describe()
    .style.format("{:,.2f}")
)

Basic statistics per release_category for the ratio and the ROI: 


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
release_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Close streaming release,255.0,2.87,3.21,0.0,0.82,2.07,3.53,22.67,255.0,0.43,1.61,-1.0,-0.59,0.04,0.76,10.33
Far streaming release,8055.0,743.34,51306.13,0.0,0.47,1.38,3.34,4493069.2,8055.0,370.67,25653.06,-1.0,-0.77,-0.31,0.67,2246533.6
Not released in major markets,1772.0,688.43,20143.91,0.0,0.08,0.81,2.5,771448.66,1772.0,343.22,10071.95,-1.0,-0.96,-0.59,0.25,385723.33
Streaming release,2045.0,64.33,2606.13,0.0,0.11,0.52,1.72,117647.06,2045.0,31.16,1303.07,-1.0,-0.95,-0.74,-0.14,58822.53


In [35]:
movie[movie.original_title == "The Irishman"]

Unnamed: 0,movie_id,imdb_id,year,month,release_date,quarter,original_title,is_released__scope,is_released__US,days_from_us_release,runtime,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi,is_first_released_in_cinemas,is_first_released_in_cinemas_safe,release_category,ageCert
2757,255839,tt0077749,1978,2,1978-02-28,1,The Irishman,True,True,26240.0,108,3303580.0,4461426.0,-2809636.0,0.740476,-0.629762,True,True,Far streaming release,U
7770,398978,tt1302006,2019,11,2019-11-01,4,The Irishman,False,False,-1.0,209,1154717.0,189502400.0,-188925100.0,0.006093,-0.996953,False,False,Not released in major markets,U


In [36]:
filtered_data = movie

In [37]:
fig = px.scatter(
    filtered_data,
    x="budget_usd_adj",
    y="revenue_usd_adj",
    color="release_category",
    labels={
        "budget_usd_adj": "Adjusted Budget (USD)",
        "revenue_usd_adj": "Adjusted Revenue (USD)",
    },
    title="Scatter Plot of Revenue vs Budget by Release Category",
    log_x=True,
    log_y=True,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.update_layout(
    xaxis_title="Adjusted Budget (USD)",
    yaxis_title="Adjusted Revenue (USD)",
    legend_title="Release Category",
    xaxis_tickangle=-45,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12,
)

fig.show()

In [38]:
fig = px.box(
    filtered_data,
    x="release_category",
    y="roi",
    title="Boxplot of ROI by Release Category (Log Scale)",
    labels={"roi": "ROI", "release_category": "Release Category"},
    log_y=True,
    color="release_category",
    color_discrete_map={
        "Streaming release": "rgba(204, 80, 62, 1)",
        "Not released in major markets": "rgba(237, 172, 8, 1)",
        "Far streaming release": "rgba(115, 175, 72, 1)",
        "Close streaming release": "rgba(95, 70, 144, 1)",
    },
)

fig.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    yaxis=dict(gridcolor="lightgray", zerolinecolor="lightgray"),
    xaxis=dict(gridcolor="lightgray", zerolinecolor="lightgray"),
    font=dict(size=12),
    legend_title_text="Release Category",
)

fig.show()

# Outliers

In [39]:
# movie = movie[movie['release_category'] != 'Not released in major markets']

In [40]:
# Determining outliers for specific columns and appending the information to the dataset

# Selecting the specific columns for outlier analysis
specific_columns = ["revenue_usd_adj", "budget_usd_adj", "surplus", "ratio_adj", "roi"]

# Calculating IQR for the specific columns
Q1_specific = movie[specific_columns].quantile(0.25)
Q3_specific = movie[specific_columns].quantile(0.75)

Q1_specific_log = np.log(movie[specific_columns]).quantile(0.25)
Q3_specific_log = np.log(movie[specific_columns]).quantile(0.75)

IQR_specific = Q3_specific - Q1_specific
IQR_specific_log = Q3_specific_log - Q1_specific_log

# Creating outlier flags for each specific column
for column in specific_columns:
    lower_bound = Q1_specific[column] - 1.5 * IQR_specific[column]
    upper_bound = Q3_specific[column] + 1.5 * IQR_specific[column]
    lower_bound_log = Q1_specific_log[column] - 0.75 * IQR_specific_log[column]
    upper_bound_log = Q3_specific_log[column] + 0.75 * IQR_specific_log[column]
    movie[f"{column}_outlier"] = (movie[column] < lower_bound) | (
        movie[column] > upper_bound
    )
    movie[f"{column}_outlier_log"] = (np.log(movie[column]) < lower_bound_log) | (
        np.log(movie[column]) > upper_bound_log
    )

In [41]:
# showing the outliers
outliers = movie[movie["roi_outlier"]]

In [42]:
movie

Unnamed: 0,movie_id,imdb_id,year,month,release_date,quarter,original_title,is_released__scope,is_released__US,days_from_us_release,...,revenue_usd_adj_outlier,revenue_usd_adj_outlier_log,budget_usd_adj_outlier,budget_usd_adj_outlier_log,surplus_outlier,surplus_outlier_log,ratio_adj_outlier,ratio_adj_outlier_log,roi_outlier,roi_outlier_log
0,340666,tt4550098,2016,11,2016-11-04,4,Nocturnal Animals,True,True,74.0,...,False,False,False,False,False,False,False,False,False,False
1,242911,tt0366180,2006,12,2006-12-14,4,స్టాలిన్,False,False,-1.0,...,False,False,False,False,False,False,False,False,False,False
2,179826,tt1767354,2013,1,2013-01-29,1,Odd Thomas,True,True,-21.0,...,False,False,False,False,False,False,False,True,False,False
3,340676,tt4714782,2016,12,2016-12-14,4,Personal Shopper,True,True,63.0,...,False,False,False,False,False,False,False,False,False,False
4,505058,tt4761916,2018,7,2018-07-19,3,Unfriended: Dark Web,True,True,78.0,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12122,276478,tt0093354,1988,4,1988-04-29,2,The Kitchen Toto,True,True,22457.0,...,False,False,False,False,False,False,False,True,False,False
12123,14342,tt0119053,1998,5,1998-05-29,2,Almost Heroes,True,True,18845.0,...,False,False,False,False,True,False,False,False,False,False
12124,441894,tt4906960,2017,9,2017-09-01,3,बादशाहो,False,False,-1.0,...,False,False,False,False,False,False,False,False,False,False
12125,54948,tt0327206,2001,5,2001-05-26,2,Uma Vida em Segredo,False,False,-1.0,...,False,True,False,True,False,False,False,False,False,False


In [43]:
outlier_features = [
    "revenue_usd_adj_outlier",
    "revenue_usd_adj_outlier_log",
    "budget_usd_adj_outlier",
    "budget_usd_adj_outlier_log",
    "surplus_outlier",
    "surplus_outlier_log",
    "ratio_adj_outlier",
    "ratio_adj_outlier_log",
    "roi_outlier",
    "roi_outlier_log",
]
# Create subplots
fig = make_subplots(rows=5, cols=2, subplot_titles=outlier_features)

# Adding scatter plots to the respective subplot
for i, feature in enumerate(outlier_features):
    row = i // 2 + 1
    col = i % 2 + 1
    filtered_data = movie[movie[feature]]  # Filter data for outliers
    fig.add_trace(
        go.Scattergl(
            x=filtered_data["budget_usd_adj"],
            y=filtered_data["revenue_usd_adj"],
            mode="markers",
            name=feature,
            marker=dict(size=7, opacity=0.7),
        ),
        row=row,
        col=col,
    )

# Update axes and layout
fig.update_xaxes(type="log", title_text="Budget (USD)")
fig.update_yaxes(type="log", title_text="Revenue (USD)")
fig.update_layout(
    height=1500,
    width=1000,
    title_text="Scatter plots for Outliers by Category",
    showlegend=False,
)

# Show plot
fig.show()

In [44]:
outlier_features = [
    "revenue_usd_adj_outlier",
    "revenue_usd_adj_outlier_log",
    "budget_usd_adj_outlier",
    "budget_usd_adj_outlier_log",
    "surplus_outlier",
    "surplus_outlier_log",
    "ratio_adj_outlier",
    "ratio_adj_outlier_log",
    "roi_outlier",
    "roi_outlier_log",
]

# Create a 5x2 subplot grid
fig = make_subplots(rows=5, cols=2, subplot_titles=outlier_features)

# Populate the subplots with scatter plots
for index, feature in enumerate(outlier_features):
    row = (index // 2) + 1
    col = (index % 2) + 1
    # Filter data for the current feature
    current_data = movie[movie[feature]]

    # Add a scatter plot to the current subplot
    fig.add_trace(
        go.Scattergl(
            x=current_data["budget_usd_adj"],
            y=current_data["revenue_usd_adj"],
            mode="markers",
            marker=dict(size=7, opacity=0.7),
            name=f"Outliers in {feature}",  # Legend name
        ),
        row=row,
        col=col,
    )

    # Print number of outliers in the console/log
    print(f"Number of outliers in {feature}: ", len(current_data))

# Update layout for better appearance and readability
fig.update_layout(
    height=1500,
    width=1200,
    title_text="Scatter plots of Various Outlier Features",
    showlegend=True,
)
fig.update_xaxes(title_text="Budget (USD, log scale)", type="log")
fig.update_yaxes(title_text="Revenue (USD, log scale)", type="log")

# Show the figure
fig.show()

Number of outliers in revenue_usd_adj_outlier:  1508
Number of outliers in revenue_usd_adj_outlier_log:  1281
Number of outliers in budget_usd_adj_outlier:  1027
Number of outliers in budget_usd_adj_outlier_log:  1180
Number of outliers in surplus_outlier:  2305
Number of outliers in surplus_outlier_log:  365
Number of outliers in ratio_adj_outlier:  1142
Number of outliers in ratio_adj_outlier_log:  1693
Number of outliers in roi_outlier:  1142
Number of outliers in roi_outlier_log:  576


In [45]:
movie[(movie["release_category"] == "Far streaming release")][
    ["revenue_usd_adj", "budget_usd_adj", "surplus", "ratio_adj", "roi"]
].describe().drop("count").style.format(
    {
        "revenue_usd_adj": "${:,.0f}",
        "budget_usd_adj": "${:,.0f}",
        "surplus": "${:,.0f}",
        "ratio_adj": "{:.2f}",
        "roi": "{:.2f}",
    }
)

Unnamed: 0,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
mean,"$99,852,497","$37,557,523","$12,368,725",743.34,370.67
std,"$218,809,162","$46,380,121","$88,578,308",51306.13,25653.06
min,$1,$0,"$-179,942,126",0.0,-1.0
25%,"$6,266,344","$8,384,140","$-16,043,644",0.47,-0.77
50%,"$28,818,890","$22,671,280","$-2,610,467",1.38,-0.31
75%,"$97,866,674","$47,461,371","$11,347,356",3.34,0.67
max,"$4,298,409,952","$513,392,778","$1,894,074,914",4493069.2,2246533.6


In [46]:
movie[(movie["release_category"] != "Far streaming release")][
    ["revenue_usd_adj", "budget_usd_adj", "surplus", "ratio_adj", "roi"]
].describe().drop("count").style.format(
    {
        "revenue_usd_adj": "${:,.0f}",
        "budget_usd_adj": "${:,.0f}",
        "surplus": "${:,.0f}",
        "ratio_adj": "{:.2f}",
        "roi": "{:.2f}",
    }
)

Unnamed: 0,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
mean,"$27,258,235","$14,816,296","$-1,187,179",332.07,165.03
std,"$108,015,927","$33,335,495","$39,592,548",13417.63,6708.81
min,$0,$0,"$-631,614,969",0.0,-1.0
25%,"$504,793","$2,684,839","$-8,105,282",0.1,-0.95
50%,"$3,386,933","$6,613,705","$-2,652,403",0.67,-0.66
75%,"$13,445,334","$13,973,566","$230,136",2.25,0.12
max,"$1,817,208,894","$710,138,577","$739,279,652",771448.66,385723.33


In [47]:
movie_no_out = movie[(movie["release_category"] == "Far streaming release")]

fig = px.scatter(
    movie_no_out,
    x="budget_usd_adj",
    y="revenue_usd_adj",
    labels={
        "budget_usd_adj": "Adjusted Budget (USD)",
        "revenue_usd_adj": "Adjusted Revenue (USD)",
    },
    title="Scatter Plot of Revenue vs Budget for Movies within Scope with Far Streaming Release",
    hover_name="original_title",
    log_x=True,
    log_y=True,
)

fig.update_layout(
    xaxis_title="Adjusted Budget (USD)",
    yaxis_title="Adjusted Revenue (USD)",
    xaxis_tickangle=-45,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12,
    height=800,
    width=800,
)

fig.show()

In [48]:
print("Smallest ratio movies")
display(
    movie_no_out.sort_values(by="ratio_adj")
    .head(10)[
        [
            "movie_id",
            "original_title",
            "revenue_usd_adj",
            "budget_usd_adj",
            "surplus",
            "ratio_adj",
            "roi",
        ]
    ]
    .style.format(
        {
            "revenue_usd_adj": "${:,.0f}",
            "budget_usd_adj": "${:,.0f}",
            "surplus": "${:,.0f}",
            "ratio_adj": "{:.2f}",
            "roi": "{:.5f}",
        }
    )
)

print("Biggest ratio movies")
display(
    movie_no_out.sort_values(by="ratio_adj", ascending=False)
    .head(10)[
        [
            "movie_id",
            "original_title",
            "revenue_usd_adj",
            "budget_usd_adj",
            "surplus",
            "ratio_adj",
            "roi",
        ]
    ]
    .style.format(
        {
            "revenue_usd_adj": "${:,.0f}",
            "budget_usd_adj": "${:,.0f}",
            "surplus": "${:,.0f}",
            "ratio_adj": "{:.2f}",
            "roi": "{:.2f}",
        }
    )
)

Smallest ratio movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
7628,46943,The Point Men,$2,"$10,495,100","$-10,495,099",0.0,-1.0
5735,92493,Edwin Boyd: Citizen Gangster,$1,"$6,772,992","$-6,772,991",0.0,-1.0
9811,228331,A Perfect Man,$1,"$6,539,876","$-6,539,876",0.0,-1.0
6440,95023,The Frog Prince,$3,"$3,613,279","$-3,613,277",0.0,-1.0
2177,82887,Air Collision,$5,"$597,210","$-597,208",0.0,-1.0
4506,41393,Zyzzyx Road,$45,"$3,022,837","$-3,022,815",0.0,-0.99999
2726,59902,Bobbie Jo and the Outlaw,$27,"$1,606,513","$-1,606,500",0.0,-0.99999
2742,322075,Ёлки лохматые,$47,"$2,574,192","$-2,574,169",0.0,-0.99999
12112,14330,Trojan War,$587,"$28,476,822","$-28,476,529",0.0,-0.99999
8023,47501,Six Ways to Sunday,$104,"$4,746,137","$-4,746,085",0.0,-0.99999


Biggest ratio movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
456,34707,Darling,"$1,082,161",$0,"$541,080",4493069.2,2246533.6
906,80539,పంజా,"$7,665,368",$8,"$3,832,676",943129.3,471563.65
4156,20034,Hush,"$488,565",$1,"$244,281",345221.27,172609.64
11591,13701,Immortal Beloved,"$20,354,587",$247,"$10,177,047",82500.0,41249.0
9895,11537,蛇形刁手,"$14,020,031",$430,"$7,009,585",32608.7,16303.35
11960,33261,The FJ Holden,"$3,569,941",$133,"$1,784,837",26792.45,13395.23
5464,91551,Death of a Superhero,"$45,880",$4,"$22,936",11290.0,5644.0
4196,124157,도둑들,"$115,633,065","$16,556","$57,799,976",6984.42,3491.21
2453,1435,Tarnation,"$1,987,187",$361,"$993,232",5504.59,2751.29
2861,256075,The New Adventures of Pinocchio,"$27,607,044","$18,289","$13,785,232",1509.45,753.73


# Genres

In [49]:
# Join the movie and the genre table
movie_genre = movie.merge(genres, how="left", on="movie_id")

In [50]:
# Lets do some EDA on the genres
# How many genres are there?
print("Number of genres: ", len(movie_genre["name"].unique()))
# How many movies are there in each genre?
print("Number of movies in each genre: ")
print(movie_genre["name"].value_counts())
# How many movies charecterized by more than one genre?

Number of genres:  20
Number of movies in each genre: 
name
drama              6445
comedy             4480
thriller           2837
action             2613
romance            2468
crime              1877
adventure          1380
horror             1286
science_fiction     960
mystery             881
fantasy             820
family              704
history             604
war                 435
music               376
documentary         173
western             145
animation           124
tv_movie              7
Name: count, dtype: int64


In [51]:
# Number genres per movie
print(
    "Number of genres per movie: {0:.2f}".format(
        movie_genre.groupby("movie_id")["name"].count().mean()
    )
)

Number of genres per movie: 2.36


In [52]:
# Can you give some basic statistics per genre for the ratio and the ROI?
print("Basic statistics per genre for the ratio and the ROI: ")
display(
    movie_genre.groupby("name")[["ratio_adj", "roi"]].describe().style.format("{:,.2f}")
)

Basic statistics per genre for the ratio and the ROI: 


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
action,2613.0,521.92,19657.8,0.0,0.5,1.5,3.21,943129.3,2613.0,259.96,9828.9,-1.0,-0.75,-0.25,0.61,471563.65
adventure,1380.0,4.67,37.63,0.0,0.5,1.55,3.33,1204.69,1380.0,1.34,18.81,-1.0,-0.75,-0.22,0.67,601.35
animation,124.0,2.2,2.97,0.0,0.2,1.15,3.05,15.67,124.0,0.1,1.49,-1.0,-0.9,-0.43,0.52,6.83
comedy,4480.0,101.96,5275.74,0.0,0.41,1.31,3.15,350000.0,4480.0,49.98,2637.87,-1.0,-0.79,-0.35,0.57,174999.0
crime,1877.0,11.55,212.17,0.0,0.39,1.23,2.93,6984.42,1877.0,4.77,106.09,-1.0,-0.81,-0.38,0.46,3491.21
documentary,173.0,85.03,675.35,0.0,0.24,1.24,6.44,7000.0,173.0,41.51,337.68,-1.0,-0.88,-0.38,2.22,3499.0
drama,6445.0,917.88,56980.3,0.0,0.24,0.94,2.58,4493069.2,6445.0,457.94,28490.15,-1.0,-0.88,-0.53,0.29,2246533.6
family,704.0,5.09,57.49,0.0,0.51,1.46,2.83,1509.45,704.0,1.54,28.74,-1.0,-0.74,-0.27,0.42,753.73
fantasy,820.0,2.88,9.78,0.0,0.47,1.37,2.86,225.0,820.0,0.44,4.89,-1.0,-0.77,-0.32,0.43,111.5
history,604.0,45.12,1038.77,0.0,0.22,0.85,2.04,25530.35,604.0,21.56,519.39,-1.0,-0.89,-0.58,0.02,12764.17


In [53]:
# Can you give some basic statistics per genre for the ratio and the ROI?
print("Basic statistics per US theatrical release for the ratio and the ROI: ")
display(
    movie_genre.groupby("name")[["ratio_adj", "roi"]].describe().style.format("{:,.2f}")
)

Basic statistics per US theatrical release for the ratio and the ROI: 


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
action,2613.0,521.92,19657.8,0.0,0.5,1.5,3.21,943129.3,2613.0,259.96,9828.9,-1.0,-0.75,-0.25,0.61,471563.65
adventure,1380.0,4.67,37.63,0.0,0.5,1.55,3.33,1204.69,1380.0,1.34,18.81,-1.0,-0.75,-0.22,0.67,601.35
animation,124.0,2.2,2.97,0.0,0.2,1.15,3.05,15.67,124.0,0.1,1.49,-1.0,-0.9,-0.43,0.52,6.83
comedy,4480.0,101.96,5275.74,0.0,0.41,1.31,3.15,350000.0,4480.0,49.98,2637.87,-1.0,-0.79,-0.35,0.57,174999.0
crime,1877.0,11.55,212.17,0.0,0.39,1.23,2.93,6984.42,1877.0,4.77,106.09,-1.0,-0.81,-0.38,0.46,3491.21
documentary,173.0,85.03,675.35,0.0,0.24,1.24,6.44,7000.0,173.0,41.51,337.68,-1.0,-0.88,-0.38,2.22,3499.0
drama,6445.0,917.88,56980.3,0.0,0.24,0.94,2.58,4493069.2,6445.0,457.94,28490.15,-1.0,-0.88,-0.53,0.29,2246533.6
family,704.0,5.09,57.49,0.0,0.51,1.46,2.83,1509.45,704.0,1.54,28.74,-1.0,-0.74,-0.27,0.42,753.73
fantasy,820.0,2.88,9.78,0.0,0.47,1.37,2.86,225.0,820.0,0.44,4.89,-1.0,-0.77,-0.32,0.43,111.5
history,604.0,45.12,1038.77,0.0,0.22,0.85,2.04,25530.35,604.0,21.56,519.39,-1.0,-0.89,-0.58,0.02,12764.17


# Production Companies

In [54]:
# Join the movie and the production companies table
movie_production_companies = movie.merge(
    production_companies, how="left", on="movie_id"
)

In [55]:
# Number of production companies
print(
    "Number of production companies: ",
    len(movie_production_companies["company_name"].unique()),
)
# # Number of parent companies (companies that own other companies)
# print('Number of parent companies: ', len(movie_production_companies['parent_name'].unique()))

Number of production companies:  11735


In [56]:
# Number of production companies per movie
print(
    "Number of production companies per movie: {0:.2f}".format(
        movie_production_companies.groupby("movie_id")["company_name"].count().mean()
    )
)

Number of production companies per movie: 3.09


In [57]:
# Biggest production companies (by number of movies and by revenue)
biggest_companies = movie_production_companies.groupby("company_name").agg(
    {"movie_id": "count", "revenue_usd_adj": "sum"}
)
biggest_companies.columns = ["Number of Movies", "Total Revenue"]
biggest_companies = biggest_companies.sort_values(
    by="Number of Movies", ascending=False
).head(10)
display(biggest_companies)

Unnamed: 0_level_0,Number of Movies,Total Revenue
company_name,Unnamed: 1_level_1,Unnamed: 2_level_1
warner_bros_pictures,642,118130000000.0
universal_pictures,621,109205800000.0
columbia_pictures,495,81080210000.0
paramount_pictures,474,93601940000.0
20th_century_fox,462,91542590000.0
canal_,341,10668040000.0
new_line_cinema,254,39385170000.0
metro_goldwyn_mayer,239,26070720000.0
france_2_cinema,222,3403914000.0
touchstone_pictures,195,26255410000.0


In [58]:
# # Biggest parent production companies (by number of movies and by revenue)
# biggest_parent_companies = movie_production_companies.groupby('parent_name').agg({'movie_id': 'count', 'revenue_usd_adj': 'sum'})
# biggest_parent_companies.columns = ['Number of Movies', 'Total Revenue']
# biggest_parent_companies = biggest_parent_companies.sort_values(by='Number of Movies', ascending=False).head(10).format({
#     'Total Revenue': "${:,.0f}"
# })
# display(biggest_parent_companies)

In [59]:
# # Companies with the most subsidiaries
# subsidiaries = movie_production_companies.groupby('parent_name').agg({'company_name': 'nunique'})
# subsidiaries.columns = ['Number of Subsidiaries']
# subsidiaries = subsidiaries.sort_values(by='Number of Subsidiaries', ascending=False).head(10)
# display(subsidiaries)

# Keywords

In [60]:
movie_keywords = movie.merge(keywords, how="left", on="movie_id")

In [61]:
print("Number of keywords: ", len(movie_keywords["keyword_name"].unique()))

Number of keywords:  15664


In [62]:
# On average, how many keywords are there per movie?
print(
    "Average number of keywords per movie: ",
    movie_keywords.groupby("movie_id")["keyword_name"].count().mean(),
)

Average number of keywords per movie:  7.521810835326132


In [63]:
# The most popular keywords (by number of movies and by revenue)
popular_keywords = movie_keywords.groupby("keyword_name").agg(
    {"movie_id": "count", "revenue_usd_adj": "sum"}
)
popular_keywords.columns = ["Number of Movies", "Total Revenue"]
popular_keywords = (
    popular_keywords.sort_values(by="Number of Movies", ascending=False)
    .head(10)
    .style.format({"Total Revenue": "${:,.0f}"})
)
display(popular_keywords)

Unnamed: 0_level_0,Number of Movies,Total Revenue
keyword_name,Unnamed: 1_level_1,Unnamed: 2_level_1
based_on_novel_or_book,939,"$123,583,740,770"
woman_director,854,"$29,621,662,441"
murder,623,"$42,413,535,661"
new_york_city,432,"$56,057,918,539"
based_on_true_story,415,"$39,593,951,693"
revenge,402,"$41,339,198,737"
biography,346,"$22,430,898,390"
duringcreditsstinger,340,"$82,913,539,923"
sequel,314,"$86,143,499,437"
love,287,"$21,720,906,973"


# Time analysis

In [64]:
plot_and_export_categorical_distribution(
    movie, "runtime", 0, False, True, "charts", True
)

In [65]:
plot_and_export_categorical_distribution(movie, "year", 0, False, True, "charts", True)

In [66]:
plot_and_export_categorical_distribution(
    movie, "quarter", 0, False, True, "charts", True
)

In [67]:
plot_and_export_categorical_distribution(movie, "month", 0, False, True, "charts", True)

In [68]:
plot_and_export_categorical_distribution(
    movie, "ageCert", 0, False, True, "charts", True
)