# Set up

In [44]:
import duckdb
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
import plotly.express as px
from eda_support_functions import *
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Suppress all warnings
warnings.filterwarnings('ignore')

In [45]:
movie_input = pd.read_csv('/Users/iliasx/Documents/GitHub/box-office-prediction/data/processed_data/movie.csv')
genres = pd.read_csv('/Users/iliasx/Documents/GitHub/box-office-prediction/data/processed_data/genre.csv')
keywords = pd.read_csv('/Users/iliasx/Documents/GitHub/box-office-prediction/data/processed_data/keyword.csv')
production_companies = pd.read_csv('/Users/iliasx/Documents/GitHub/box-office-prediction/data/processed_data/production.csv')

In [46]:
movie = movie_input[['movie_id','imdb_id','year','month','release_date','quarter','original_title',
                     'is_released', 'is_released_US','days_from_us_release',
                     'runtime','revenue_usd_adj','budget_usd_adj','surplus','ratio_adj','roi',
                     'is_first_released_in_cinemas', 'is_first_released_in_cinemas_safe',
                     'release_category'
                     ]]

In [47]:
movie.groupby('release_category').size()

release_category
Close streaming release           226
Far streaming release            5831
Not released in major markets     743
Streaming release                 843
dtype: int64

# Basic Understanding of the Data

In [48]:
movie.head(10)

Unnamed: 0,movie_id,imdb_id,year,month,release_date,quarter,original_title,is_released,is_released_US,days_from_us_release,runtime,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi,is_first_released_in_cinemas,is_first_released_in_cinemas_safe,release_category
0,418437,tt3462710,2017,3,2017-03-06,1,Unforgettable,True,True,117.0,100,22086930.0,14916870.0,-3873407.0,1.480668,-0.259666,True,True,Far streaming release
1,34746,tt0109578,1994,1,1994-01-14,1,Death Wish V: The Face of Death,True,True,2778.0,95,3500154.0,10280090.0,-8530017.0,0.340479,-0.829761,True,True,Far streaming release
2,112936,tt1728986,2012,7,2012-07-06,3,Bol Bachchan,True,True,13693.0,155,38188630.0,8360944.0,10733370.0,4.567502,1.283751,True,True,Far streaming release
3,211672,tt2293640,2015,6,2015-06-17,2,Minions,True,True,21.0,91,1490564000.0,95132200.0,650149800.0,15.668345,6.834172,True,False,Close streaming release
4,318226,tt3850544,2016,9,2016-09-16,3,Hillsong: Let Hope Rise,True,True,12160.0,103,3364016.0,12695550.0,-11013540.0,0.264976,-0.867512,True,True,Far streaming release
5,79464,tt1839596,2011,11,2011-11-11,4,Rockstar,False,False,-1.0,159,26631400.0,14792210.0,-1476512.0,1.800366,-0.099817,False,False,Not released in major markets
6,15005,tt0430431,2006,10,2006-10-10,4,One Night with the King,True,True,15786.0,123,20749440.0,30228370.0,-19853660.0,0.686423,-0.656789,True,True,Far streaming release
7,112949,tt1702439,2013,2,2013-02-07,1,Safe Haven,True,True,114.0,115,127650700.0,36623310.0,27202050.0,3.485505,0.742753,True,True,Far streaming release
8,375108,tt4471388,2016,1,2016-01-07,1,Мафия: Игра на выживание,False,False,-1.0,91,5112236.0,3262755.0,-706637.3,1.566846,-0.216577,False,False,Not released in major markets
9,34764,tt1373156,2010,2,2010-02-26,1,Karthik Calling Karthik,True,True,14554.0,135,5068453.0,3912599.0,-1378372.0,1.295419,-0.352291,True,True,Far streaming release


In [49]:
null_percentage_all = movie.isnull().mean() * 100

zero_percentage_all = (movie == 0).mean() * 100

null_zero_percentage_all = pd.DataFrame({'Null Percentage': null_percentage_all, 'Zero Percentage': zero_percentage_all})
print(null_zero_percentage_all)

                                   Null Percentage  Zero Percentage
movie_id                                  0.000000         0.000000
imdb_id                                   0.039252         0.000000
year                                      0.000000         0.000000
month                                     0.000000         0.000000
release_date                              0.000000         0.000000
quarter                                   0.000000         0.000000
original_title                            0.000000         0.000000
is_released                               0.000000         9.721314
is_released_US                            0.000000        18.801518
days_from_us_release                      0.000000         0.366348
runtime                                   0.000000         0.209342
revenue_usd_adj                           0.000000         0.000000
budget_usd_adj                            0.000000         0.000000
surplus                                   0.0000

In [50]:
movie[movie.budget_usd_adj != 0 ][['revenue_usd_adj','budget_usd_adj','surplus','ratio_adj','roi']].describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 0.99, 0.999, 0.9999]).drop('count').style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
})

Unnamed: 0,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
mean,"$110,706,833","$39,553,945","$15,799,472",5.66,1.83
std,"$233,325,540","$50,060,794","$93,239,159",69.55,34.77
min,$1,$2,"$-193,946,737",0.0,-1.0
1%,"$1,604","$35,024","$-101,348,316",0.0,-1.0
5%,"$167,839","$823,888","$-51,728,183",0.03,-0.98
10%,"$746,978","$2,146,375","$-34,597,427",0.12,-0.94
25%,"$6,098,158","$7,359,817","$-14,158,401",0.53,-0.74
50%,"$32,221,393","$22,671,280","$-1,274,311",1.6,-0.2
75%,"$111,113,485","$50,334,403","$15,021,640",3.57,0.79
90%,"$289,704,137","$99,359,348","$70,114,238",7.58,2.79


In [51]:
print('Smallest ratio movies')
display(movie.sort_values(by='ratio_adj').head(10)[['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'surplus', 'ratio_adj','roi']].style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
}))

print('Biggest ratio movies')
display(movie[movie.budget_usd_adj != 0 ].sort_values(by='ratio_adj', ascending=False).head(10)[['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'surplus', 'ratio_adj','roi']].style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
}))


Smallest ratio movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
6311,198701,Elephant Tales,$2,"$11,335,640","$-11,335,639",0.0,-1.0
4667,46943,The Point Men,$2,"$10,495,100","$-10,495,099",0.0,-1.0
3345,92493,Edwin Boyd: Citizen Gangster,$1,"$6,772,992","$-6,772,991",0.0,-1.0
2192,228331,A Perfect Man,$1,"$6,539,876","$-6,539,876",0.0,-1.0
2636,4593,Le Charme discret de la bourgeoisie,$7,"$5,831,617","$-5,831,614",0.0,-1.0
1604,280422,Все и сразу,$1,"$965,322","$-965,321",0.0,-1.0
2544,41393,Zyzzyx Road,$45,"$3,022,837","$-3,022,815",0.0,-1.0
7391,14330,Trojan War,$587,"$28,476,822","$-28,476,529",0.0,-1.0
2007,85495,Портрет в сумерках,$1,"$27,092","$-27,091",0.0,-1.0
6884,863188,Malvinas: Historias de traiciones,$3,"$58,653","$-58,651",0.0,-1.0


Biggest ratio movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
1253,1435,Tarnation,"$1,987,187",$361,"$993,232",5504.59,2751.29
7572,34288,The Last Broadcast,"$2,243,205","$1,682","$1,119,920",1333.33,665.67
6287,11713,精武門,"$728,952,153","$728,952","$363,747,124",1000.0,499.0
3492,23827,Paranormal Activity,"$284,148,407","$315,956","$141,758,248",899.33,448.66
4794,47178,"Aloha, Bobby and Rose","$198,226,208","$339,816","$98,773,288",583.33,290.67
3986,357940,老炮儿,"$178,940,250","$321,393","$89,148,732",556.77,277.38
4619,513434,カメラを止めるな！,"$34,296,602","$65,144","$17,083,156",526.47,262.23
2078,2667,The Blair Witch Project,"$454,746,883","$868,748","$226,504,693",523.45,260.73
810,692,Pink Flamingos,"$43,737,129","$87,474","$21,781,090",500.0,249.0
4370,9461,Enter the Dragon,"$2,745,063,063","$5,833,259","$1,366,698,273",470.59,234.29


In [52]:
print('Movies with ratio_adj > more than 10 and less than 0.1')
display(movie[(movie.ratio_adj > 10) | (movie.ratio_adj < 0.1)][['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'ratio_adj']])


Movies with ratio_adj > more than 10 and less than 0.1


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,ratio_adj
3,211672,Minions,1.490564e+09,9.513220e+07,15.668345
11,15029,The Whoopee Boys,1.236451e+06,2.224102e+07,0.055593
12,34769,Defendor,6.314836e+04,4.970970e+06,0.012703
20,211954,No se aceptan devoluciones,1.314550e+08,7.193864e+06,18.273207
21,296945,Bañeros 4: Los rompeolas,6.662611e+06,2.574192e+05,25.882335
...,...,...,...,...,...
7610,211067,The Sacrament,1.206084e+04,5.231901e+06,0.002305
7622,55890,Bereavement,6.108125e+04,2.794713e+06,0.021856
7634,14905,Welcome to Sarajevo,6.346895e+05,1.708609e+07,0.037147
7636,34647,Enter the Void,1.145478e+06,1.846360e+07,0.062040


Example of outlier, indeed a wrong imdb data entry

https://www.imdb.com/title/tt1334328/


movie_id	original_title	revenue_usd_adj	budget_usd_adj	ratio_adj

53128	Ψυχή Βαθιά	2.903065e+06	4.092371	7.093847e+0

In [53]:
print('Smallest ROI movies')
display(movie.sort_values(by='roi').head(15)[['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'surplus', 'ratio_adj','roi']].style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
}))

print('Biggest ROI movies')
display(movie[movie.budget_usd_adj != 0 ].sort_values(by='roi', ascending=False).head(15)[['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'surplus', 'ratio_adj','roi']].style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
}))


Smallest ROI movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
6311,198701,Elephant Tales,$2,"$11,335,640","$-11,335,639",0.0,-1.0
4667,46943,The Point Men,$2,"$10,495,100","$-10,495,099",0.0,-1.0
2192,228331,A Perfect Man,$1,"$6,539,876","$-6,539,876",0.0,-1.0
3345,92493,Edwin Boyd: Citizen Gangster,$1,"$6,772,992","$-6,772,991",0.0,-1.0
2636,4593,Le Charme discret de la bourgeoisie,$7,"$5,831,617","$-5,831,614",0.0,-1.0
1604,280422,Все и сразу,$1,"$965,322","$-965,321",0.0,-1.0
2544,41393,Zyzzyx Road,$45,"$3,022,837","$-3,022,815",0.0,-1.0
7391,14330,Trojan War,$587,"$28,476,822","$-28,476,529",0.0,-1.0
2007,85495,Портрет в сумерках,$1,"$27,092","$-27,091",0.0,-1.0
6884,863188,Malvinas: Historias de traiciones,$3,"$58,653","$-58,651",0.0,-1.0


Biggest ROI movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
1253,1435,Tarnation,"$1,987,187",$361,"$993,232",5504.59,2751.29
7572,34288,The Last Broadcast,"$2,243,205","$1,682","$1,119,920",1333.33,665.67
6287,11713,精武門,"$728,952,153","$728,952","$363,747,124",1000.0,499.0
3492,23827,Paranormal Activity,"$284,148,407","$315,956","$141,758,248",899.33,448.66
4794,47178,"Aloha, Bobby and Rose","$198,226,208","$339,816","$98,773,288",583.33,290.67
3986,357940,老炮儿,"$178,940,250","$321,393","$89,148,732",556.77,277.38
4619,513434,カメラを止めるな！,"$34,296,602","$65,144","$17,083,156",526.47,262.23
2078,2667,The Blair Witch Project,"$454,746,883","$868,748","$226,504,693",523.45,260.73
810,692,Pink Flamingos,"$43,737,129","$87,474","$21,781,090",500.0,249.0
4370,9461,Enter the Dragon,"$2,745,063,063","$5,833,259","$1,366,698,273",470.59,234.29


In [54]:
print('Smallest Surplus movies')
display(movie.sort_values(by='surplus').head(10)[['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'surplus', 'ratio_adj','roi']].style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
}))

print('Biggest Surplus movies')
display(movie[movie.budget_usd_adj != 0 ].sort_values(by='surplus', ascending=False).head(10)[['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'surplus', 'ratio_adj','roi']].style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
}))


Smallest Surplus movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
4610,449927,追龍,"$109,348,068","$248,620,771","$-193,946,737",0.44,-0.78
4770,398978,The Irishman,"$1,154,717","$189,502,411","$-188,925,053",0.01,-1.0
1223,1408,Cutthroat Island,"$31,989,711","$195,936,982","$-179,942,126",0.16,-0.92
6268,11692,The Adventures of Pluto Nash,"$12,032,211","$169,372,985","$-163,356,879",0.07,-0.96
1611,1911,The 13th Warrior,"$112,843,805","$219,473,229","$-163,051,327",0.51,-0.74
5718,10935,Heaven's Gate,"$12,885,208","$162,704,951","$-156,262,348",0.08,-0.96
5535,10733,The Alamo,"$41,648,458","$172,594,569","$-151,770,339",0.24,-0.88
4903,10048,Stealth,"$120,028,784","$210,623,502","$-150,609,110",0.57,-0.72
1387,1639,Speed 2: Cruise Control,"$312,311,132","$303,752,773","$-147,597,206",1.03,-0.49
5210,10384,Supernova,"$26,237,781","$159,251,916","$-146,133,026",0.16,-0.92


Biggest Surplus movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
2754,11,Star Wars,"$3,898,767,715","$55,308,944","$1,894,074,914",70.49,34.25
717,597,Titanic,"$4,298,409,952","$379,690,966","$1,769,514,010",11.32,4.66
2326,19995,Avatar,"$4,152,472,877","$336,605,686","$1,739,630,753",12.34,5.17
4463,9552,The Exorcist,"$3,028,532,995","$82,351,892","$1,431,914,606",36.78,17.39
4370,9461,Enter the Dragon,"$2,745,063,063","$5,833,259","$1,366,698,273",470.59,234.29
680,578,Jaws,"$2,665,593,130","$39,645,242","$1,293,151,323",67.24,32.62
947,299534,Avengers: Endgame,"$3,336,480,881","$424,294,707","$1,243,945,733",7.86,2.93
710,601,E.T. the Extra-Terrestrial,"$2,503,815,272","$33,154,104","$1,218,753,533",75.52,36.76
473,329,Jurassic Park,"$2,326,039,361","$132,845,855","$1,030,173,826",17.51,7.75
1587,1891,The Empire Strikes Back,"$1,990,916,951","$66,561,117","$928,897,359",29.91,13.96


In [55]:
print('Basic statistics per is_released_US flag for the ratio and the ROI: ')
display(movie.groupby('is_released_US')[['ratio_adj', 'roi']].describe().style.format("{:,.2f}"))

Basic statistics per is_released_US flag for the ratio and the ROI: 


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
is_released_US,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
False,1437.0,4.86,40.85,0.0,0.16,0.85,2.55,1333.33,1437.0,1.43,20.43,-1.0,-0.92,-0.57,0.28,665.67
True,6206.0,5.84,74.64,0.0,0.66,1.79,3.76,5504.59,6206.0,1.92,37.32,-1.0,-0.67,-0.11,0.88,2751.29


In [56]:
print('Basic statistics per release_category for the ratio:')
display(movie.groupby('release_category')[['ratio_adj']].describe().style.format("{:,.2f}"))

Basic statistics per release_category for the ratio:


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
release_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Close streaming release,226.0,3.1,3.31,0.0,0.98,2.26,3.95,22.67
Far streaming release,5831.0,6.03,76.99,0.0,0.66,1.78,3.73,5504.59
Not released in major markets,743.0,4.87,50.23,0.0,0.12,0.84,2.44,1333.33
Streaming release,843.0,4.52,25.03,0.0,0.21,0.98,2.88,526.47


In [57]:
print('Basic statistics per release_category for the ROI:')
display(movie.groupby('release_category')[['roi']].describe().style.format("{:,.2f}"))

Basic statistics per release_category for the ROI:


Unnamed: 0_level_0,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
release_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Close streaming release,226.0,0.55,1.65,-1.0,-0.51,0.13,0.97,10.33
Far streaming release,5831.0,2.01,38.5,-1.0,-0.67,-0.11,0.87,2751.29
Not released in major markets,743.0,1.44,25.11,-1.0,-0.94,-0.58,0.22,665.67
Streaming release,843.0,1.26,12.51,-1.0,-0.9,-0.51,0.44,262.23


In [58]:
print('Basic statistics per release_category for the ratio and the ROI: ')
display(movie.groupby(['release_category'])[['ratio_adj', 'roi']].describe().style.format("{:,.2f}"))

Basic statistics per release_category for the ratio and the ROI: 


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
release_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Close streaming release,226.0,3.1,3.31,0.0,0.98,2.26,3.95,22.67,226.0,0.55,1.65,-1.0,-0.51,0.13,0.97,10.33
Far streaming release,5831.0,6.03,76.99,0.0,0.66,1.78,3.73,5504.59,5831.0,2.01,38.5,-1.0,-0.67,-0.11,0.87,2751.29
Not released in major markets,743.0,4.87,50.23,0.0,0.12,0.84,2.44,1333.33,743.0,1.44,25.11,-1.0,-0.94,-0.58,0.22,665.67
Streaming release,843.0,4.52,25.03,0.0,0.21,0.98,2.88,526.47,843.0,1.26,12.51,-1.0,-0.9,-0.51,0.44,262.23


In [59]:
movie[movie.original_title == 'The Irishman']

Unnamed: 0,movie_id,imdb_id,year,month,release_date,quarter,original_title,is_released,is_released_US,days_from_us_release,runtime,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi,is_first_released_in_cinemas,is_first_released_in_cinemas_safe,release_category
4770,398978,tt1302006,2019,11,2019-11-01,4,The Irishman,False,False,-1.0,209,1154717.0,189502400.0,-188925100.0,0.006093,-0.996953,False,False,Not released in major markets


In [60]:
filtered_data = movie

In [61]:

fig = px.scatter(filtered_data, x='budget_usd_adj', y='revenue_usd_adj',
                 color='release_category', 
                 labels={'budget_usd_adj': 'Adjusted Budget (USD)', 'revenue_usd_adj': 'Adjusted Revenue (USD)'},
                 title='Scatter Plot of Revenue vs Budget by Release Category',
                 log_x=True, log_y=True, 
                 color_discrete_sequence=px.colors.qualitative.Pastel)

fig.update_layout(xaxis_title='Adjusted Budget (USD)',
                  yaxis_title='Adjusted Revenue (USD)',
                  legend_title='Release Category',
                  xaxis_tickangle=-45,
                  xaxis_tickfont_size=12,
                  yaxis_tickfont_size=12)

fig.show()

In [62]:
fig = px.box(filtered_data, x='release_category', y='roi', 
             title='Boxplot of ROI by Release Category (Log Scale)',
             labels={'roi': 'ROI', 'release_category': 'Release Category'},
             log_y=True,
             color='release_category',
             color_discrete_map= {
                'Streaming release': 'rgba(204, 80, 62, 1)',
                'Not released in major markets': 'rgba(237, 172, 8, 1)',
                'Far streaming release': 'rgba(115, 175, 72, 1)',
                'Close streaming release': 'rgba(95, 70, 144, 1)'
             }
             )

fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    yaxis=dict(gridcolor='lightgray', zerolinecolor='lightgray'),
    xaxis=dict(gridcolor='lightgray', zerolinecolor='lightgray'),
    font=dict(size=12),
    legend_title_text='Release Category',
    
)

fig.show()

# Outliers

In [63]:
#movie = movie[movie['release_category'] != 'Not released in major markets']

In [64]:
# Determining outliers for specific columns and appending the information to the dataset

# Selecting the specific columns for outlier analysis
specific_columns = ['revenue_usd_adj', 'budget_usd_adj', 'surplus', 'ratio_adj','roi']

# Calculating IQR for the specific columns
Q1_specific = movie[specific_columns].quantile(0.25)
Q3_specific = movie[specific_columns].quantile(0.75)

Q1_specific_log = np.log(movie[specific_columns]).quantile(0.25)
Q3_specific_log = np.log(movie[specific_columns]).quantile(0.75)

IQR_specific = Q3_specific - Q1_specific
IQR_specific_log = Q3_specific_log - Q1_specific_log

# Creating outlier flags for each specific column
for column in specific_columns:
    lower_bound = Q1_specific[column] - 1.5 * IQR_specific[column]
    upper_bound = Q3_specific[column] + 1.5 * IQR_specific[column]
    lower_bound_log = Q1_specific_log[column] - 0.75 * IQR_specific_log[column]
    upper_bound_log = Q3_specific_log[column] + 0.75 * IQR_specific_log[column]
    movie[f'{column}_outlier'] = ((movie[column] < lower_bound) | (movie[column] > upper_bound))
    movie[f'{column}_outlier_log'] = ((np.log(movie[column]) < lower_bound_log) | (np.log(movie[column]) > upper_bound_log))

In [65]:
# showing the outliers
outliers = movie[movie['roi_outlier']]

In [66]:
movie

Unnamed: 0,movie_id,imdb_id,year,month,release_date,quarter,original_title,is_released,is_released_US,days_from_us_release,...,revenue_usd_adj_outlier,revenue_usd_adj_outlier_log,budget_usd_adj_outlier,budget_usd_adj_outlier_log,surplus_outlier,surplus_outlier_log,ratio_adj_outlier,ratio_adj_outlier_log,roi_outlier,roi_outlier_log
0,418437,tt3462710,2017,3,2017-03-06,1,Unforgettable,True,True,117.0,...,False,False,False,False,False,False,False,False,False,False
1,34746,tt0109578,1994,1,1994-01-14,1,Death Wish V: The Face of Death,True,True,2778.0,...,False,False,False,False,False,False,False,False,False,False
2,112936,tt1728986,2012,7,2012-07-06,3,Bol Bachchan,True,True,13693.0,...,False,False,False,False,False,False,False,False,False,False
3,211672,tt2293640,2015,6,2015-06-17,2,Minions,True,True,21.0,...,True,True,False,False,True,True,True,True,True,False
4,318226,tt3850544,2016,9,2016-09-16,3,Hillsong: Let Hope Rise,True,True,12160.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7638,14924,tt0091954,1986,9,1986-09-12,3,Sid and Nancy,True,True,23066.0,...,False,False,False,False,False,False,False,False,False,False
7639,34653,tt1315981,2009,12,2009-12-11,4,A Single Man,True,True,14575.0,...,False,False,False,False,False,False,False,False,False,False
7640,539228,tt8792898,2018,10,2018-10-10,4,L'Amour flou,True,False,-1.0,...,False,False,False,True,False,True,False,False,False,False
7641,418378,tt6186430,2016,12,2016-12-25,4,Die Beautiful,False,False,-1.0,...,False,False,False,True,False,False,True,False,True,False


In [67]:
outlier_features = ['revenue_usd_adj_outlier', 'revenue_usd_adj_outlier_log',
                    'budget_usd_adj_outlier', 'budget_usd_adj_outlier_log',
                    'surplus_outlier', 'surplus_outlier_log',
                    'ratio_adj_outlier', 'ratio_adj_outlier_log',
                    'roi_outlier', 'roi_outlier_log']
# Create subplots
fig = make_subplots(rows=5, cols=2, subplot_titles=outlier_features)

# Adding scatter plots to the respective subplot
for i, feature in enumerate(outlier_features):
    row = i // 2 + 1
    col = i % 2 + 1
    filtered_data = movie[movie[feature]]  # Filter data for outliers
    fig.add_trace(
        go.Scattergl(x=filtered_data['budget_usd_adj'], y=filtered_data['revenue_usd_adj'],
                     mode='markers', name=feature,
                     marker=dict(size=7, opacity=0.7)),
        row=row, col=col
    )

# Update axes and layout
fig.update_xaxes(type='log', title_text='Budget (USD)')
fig.update_yaxes(type='log', title_text='Revenue (USD)')
fig.update_layout(height=1500, width=1000, title_text="Scatter plots for Outliers by Category", showlegend=False)

# Show plot
fig.show()

In [68]:
outlier_features = ['revenue_usd_adj_outlier', 'revenue_usd_adj_outlier_log',
                    'budget_usd_adj_outlier', 'budget_usd_adj_outlier_log',
                    'surplus_outlier', 'surplus_outlier_log',
                    'ratio_adj_outlier', 'ratio_adj_outlier_log',
                    'roi_outlier', 'roi_outlier_log']

# Create a 5x2 subplot grid
fig = make_subplots(rows=5, cols=2, subplot_titles=outlier_features)

# Populate the subplots with scatter plots
for index, feature in enumerate(outlier_features):
    row = (index // 2) + 1
    col = (index % 2) + 1
    # Filter data for the current feature
    current_data = movie[movie[feature]]

    # Add a scatter plot to the current subplot
    fig.add_trace(
        go.Scattergl(
            x=current_data['budget_usd_adj'],
            y=current_data['revenue_usd_adj'],
            mode='markers',
            marker=dict(size=7, opacity=0.7),
            name=f'Outliers in {feature}'  # Legend name
        ),
        row=row,
        col=col
    )

    # Print number of outliers in the console/log
    print(f'Number of outliers in {feature}: ', len(current_data))

# Update layout for better appearance and readability
fig.update_layout(height=1500, width=1200, title_text="Scatter plots of Various Outlier Features", showlegend=True)
fig.update_xaxes(title_text='Budget (USD, log scale)', type='log')
fig.update_yaxes(title_text='Revenue (USD, log scale)', type='log')

# Show the figure
fig.show()


Number of outliers in revenue_usd_adj_outlier:  838
Number of outliers in revenue_usd_adj_outlier_log:  837
Number of outliers in budget_usd_adj_outlier:  586
Number of outliers in budget_usd_adj_outlier_log:  797
Number of outliers in surplus_outlier:  1174
Number of outliers in surplus_outlier_log:  327
Number of outliers in ratio_adj_outlier:  698
Number of outliers in ratio_adj_outlier_log:  1106
Number of outliers in roi_outlier:  698
Number of outliers in roi_outlier_log:  408


In [69]:
movie[(movie['release_category'] == 'Far streaming release')][[
    'revenue_usd_adj', 'budget_usd_adj', 'surplus', 'ratio_adj', 'roi'
]].describe().drop('count').style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
})

Unnamed: 0,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
mean,"$130,023,690","$45,740,122","$19,271,723",6.03,2.01
std,"$249,653,854","$51,147,789","$102,547,865",76.99,38.5
min,$1,$2,"$-179,942,126",0.0,-1.0
25%,"$13,117,085","$12,445,330","$-17,628,267",0.66,-0.67
50%,"$47,613,217","$29,487,290","$-1,172,599",1.78,-0.11
75%,"$136,447,816","$58,448,696","$21,804,883",3.73,0.87
max,"$4,298,409,952","$513,392,778","$1,894,074,914",5504.59,2751.29


In [70]:
movie[(movie['release_category'] != 'Far streaming release')][[
    'revenue_usd_adj', 'budget_usd_adj', 'surplus', 'ratio_adj', 'roi'
]].describe().drop('count').style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
})

Unnamed: 0,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
mean,"$48,545,356","$19,646,883","$4,625,796",4.48,1.24
std,"$154,970,166","$40,417,622","$51,647,889",36.42,18.21
min,$1,$2,"$-193,946,737",0.0,-1.0
25%,"$825,678","$2,416,150","$-7,066,720",0.2,-0.9
50%,"$5,441,317","$7,047,017","$-1,363,233",1.05,-0.47
75%,"$24,197,465","$17,182,686","$1,569,739",2.86,0.43
max,"$1,817,208,894","$469,233,135","$739,279,652",1333.33,665.67


In [71]:
movie_no_out = movie[(movie['release_category'] == 'Far streaming release')]

fig = px.scatter(movie_no_out, x='budget_usd_adj', y='revenue_usd_adj',
                    labels={'budget_usd_adj': 'Adjusted Budget (USD)', 'revenue_usd_adj': 'Adjusted Revenue (USD)'},
                    title='Scatter Plot of Revenue vs Budget for Movies within Scope with Far Streaming Release',
                    hover_name='original_title',
                    log_x=True, log_y=True)

fig.update_layout(xaxis_title='Adjusted Budget (USD)',
                    yaxis_title='Adjusted Revenue (USD)',
                    xaxis_tickangle=-45,
                    xaxis_tickfont_size=12,
                    yaxis_tickfont_size=12,
                    height=800,
                    width=800)

fig.show()


In [72]:
print('Smallest ratio movies')
display(movie_no_out.sort_values(by='ratio_adj').head(10)[['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'surplus', 'ratio_adj','roi']].style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.5f}"
}))

print('Biggest ratio movies')
display(movie_no_out.sort_values(by='ratio_adj', ascending=False).head(10)[['movie_id', 'original_title', 'revenue_usd_adj','budget_usd_adj', 'surplus', 'ratio_adj','roi']].style.format({
    'revenue_usd_adj': "${:,.0f}",
    'budget_usd_adj': "${:,.0f}",
    'surplus': "${:,.0f}",
    'ratio_adj': "{:.2f}",
    'roi': "{:.2f}"
}))


Smallest ratio movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
4667,46943,The Point Men,$2,"$10,495,100","$-10,495,099",0.0,-1.0
3345,92493,Edwin Boyd: Citizen Gangster,$1,"$6,772,992","$-6,772,991",0.0,-1.0
2192,228331,A Perfect Man,$1,"$6,539,876","$-6,539,876",0.0,-1.0
2544,41393,Zyzzyx Road,$45,"$3,022,837","$-3,022,815",0.0,-0.99999
7391,14330,Trojan War,$587,"$28,476,822","$-28,476,529",0.0,-0.99999
1049,58699,Bellflower,$1,"$23,028","$-23,027",0.0,-0.99997
3522,23947,Nasty Old People,$1,"$21,304","$-21,303",0.0,-0.99997
1249,425704,The Unknowns,$1,"$19,043","$-19,043",0.0,-0.99997
1816,121173,Bwakaw,$1,"$14,835","$-14,834",0.0,-0.99996
3323,241603,Miss Mary,$3,"$28,063","$-28,061",0.0,-0.99995


Biggest ratio movies


Unnamed: 0,movie_id,original_title,revenue_usd_adj,budget_usd_adj,surplus,ratio_adj,roi
1253,1435,Tarnation,"$1,987,187",$361,"$993,232",5504.59,2751.29
6287,11713,精武門,"$728,952,153","$728,952","$363,747,124",1000.0,499.0
3492,23827,Paranormal Activity,"$284,148,407","$315,956","$141,758,248",899.33,448.66
4794,47178,"Aloha, Bobby and Rose","$198,226,208","$339,816","$98,773,288",583.33,290.67
3986,357940,老炮儿,"$178,940,250","$321,393","$89,148,732",556.77,277.38
2078,2667,The Blair Witch Project,"$454,746,883","$868,748","$226,504,693",523.45,260.73
810,692,Pink Flamingos,"$43,737,129","$87,474","$21,781,090",500.0,249.0
4370,9461,Enter the Dragon,"$2,745,063,063","$5,833,259","$1,366,698,273",470.59,234.29
860,299245,The Gallows,"$55,233,767","$128,557","$27,488,326",429.64,213.82
262,83,Open Water,"$90,555,260","$215,279","$45,062,351",420.64,209.32


# Genres

In [73]:
# Join the movie and the genre table
movie_genre = movie.merge(genres, how='left', on='movie_id')

In [74]:
# Lets do some EDA on the genres
# How many genres are there?
print('Number of genres: ', len(movie_genre['name'].unique()))
# How many movies are there in each genre?
print('Number of movies in each genre: ')
print(movie_genre['name'].value_counts())
# How many movies charecterized by more than one genre?



Number of genres:  20
Number of movies in each genre: 
name
drama              3839
comedy             2770
thriller           2010
action             1862
romance            1475
crime              1285
adventure          1079
horror              930
science_fiction     759
mystery             661
fantasy             618
family              458
history             391
war                 274
music               238
western              97
documentary          88
animation            79
tv_movie              5
Name: count, dtype: int64


In [75]:
# Number genres per movie
print('Number of genres per movie: {0:.2f}'.format(movie_genre.groupby('movie_id')['name'].count().mean()))

Number of genres per movie: 2.48


In [76]:
# Can you give some basic statistics per genre for the ratio and the ROI?
print('Basic statistics per genre for the ratio and the ROI: ')
display(movie_genre.groupby('name')[['ratio_adj', 'roi']].describe().style.format("{:,.2f}"))


Basic statistics per genre for the ratio and the ROI: 


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
action,1862.0,4.85,33.29,0.0,0.67,1.76,3.49,1000.0,1862.0,1.42,16.65,-1.0,-0.66,-0.12,0.74,499.0
adventure,1079.0,3.7,11.48,0.0,0.7,1.86,3.75,285.71,1079.0,0.85,5.74,-1.0,-0.65,-0.07,0.87,141.86
animation,79.0,2.46,2.87,0.0,0.47,1.57,3.45,15.67,79.0,0.23,1.43,-1.0,-0.77,-0.22,0.73,6.83
comedy,2770.0,4.2,16.98,0.0,0.66,1.76,3.81,526.47,2770.0,1.1,8.49,-1.0,-0.67,-0.12,0.91,262.23
crime,1285.0,4.37,23.56,0.0,0.57,1.52,3.34,556.77,1285.0,1.19,11.78,-1.0,-0.71,-0.24,0.67,277.38
documentary,88.0,74.32,587.51,0.0,0.31,1.81,6.86,5504.59,88.0,36.16,293.75,-1.0,-0.84,-0.1,2.43,2751.29
drama,3839.0,5.54,91.89,0.0,0.42,1.35,3.27,5504.59,3839.0,1.77,45.94,-1.0,-0.79,-0.33,0.63,2751.29
family,458.0,3.03,5.07,0.0,0.79,1.77,3.26,75.52,458.0,0.51,2.54,-1.0,-0.61,-0.12,0.63,36.76
fantasy,618.0,3.22,10.88,0.0,0.67,1.76,3.17,225.0,618.0,0.61,5.44,-1.0,-0.66,-0.12,0.59,111.5
history,391.0,2.52,5.71,0.0,0.44,1.16,2.56,84.33,391.0,0.26,2.85,-1.0,-0.78,-0.42,0.28,41.16


In [77]:
# Can you give some basic statistics per genre for the ratio and the ROI?
print('Basic statistics per US theatrical release for the ratio and the ROI: ')
display(movie_genre.groupby('name')[['ratio_adj', 'roi']].describe().style.format("{:,.2f}"))

Basic statistics per US theatrical release for the ratio and the ROI: 


Unnamed: 0_level_0,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,ratio_adj,roi,roi,roi,roi,roi,roi,roi,roi
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
action,1862.0,4.85,33.29,0.0,0.67,1.76,3.49,1000.0,1862.0,1.42,16.65,-1.0,-0.66,-0.12,0.74,499.0
adventure,1079.0,3.7,11.48,0.0,0.7,1.86,3.75,285.71,1079.0,0.85,5.74,-1.0,-0.65,-0.07,0.87,141.86
animation,79.0,2.46,2.87,0.0,0.47,1.57,3.45,15.67,79.0,0.23,1.43,-1.0,-0.77,-0.22,0.73,6.83
comedy,2770.0,4.2,16.98,0.0,0.66,1.76,3.81,526.47,2770.0,1.1,8.49,-1.0,-0.67,-0.12,0.91,262.23
crime,1285.0,4.37,23.56,0.0,0.57,1.52,3.34,556.77,1285.0,1.19,11.78,-1.0,-0.71,-0.24,0.67,277.38
documentary,88.0,74.32,587.51,0.0,0.31,1.81,6.86,5504.59,88.0,36.16,293.75,-1.0,-0.84,-0.1,2.43,2751.29
drama,3839.0,5.54,91.89,0.0,0.42,1.35,3.27,5504.59,3839.0,1.77,45.94,-1.0,-0.79,-0.33,0.63,2751.29
family,458.0,3.03,5.07,0.0,0.79,1.77,3.26,75.52,458.0,0.51,2.54,-1.0,-0.61,-0.12,0.63,36.76
fantasy,618.0,3.22,10.88,0.0,0.67,1.76,3.17,225.0,618.0,0.61,5.44,-1.0,-0.66,-0.12,0.59,111.5
history,391.0,2.52,5.71,0.0,0.44,1.16,2.56,84.33,391.0,0.26,2.85,-1.0,-0.78,-0.42,0.28,41.16


# Production Companies

In [78]:
# Join the movie and the production companies table
movie_production_companies = movie.merge(production_companies, how='left', on='movie_id')

In [79]:
# Number of production companies
print('Number of production companies: ', len(movie_production_companies['company_name'].unique()))
# Number of parent companies (companies that own other companies)
print('Number of parent companies: ', len(movie_production_companies['parent_name'].unique()))

Number of production companies:  8413


KeyError: 'parent_name'

In [None]:
# Number of production companies per movie
print('Number of production companies per movie: {0:.2f}'.format(movie_production_companies.groupby('movie_id')['company_name'].count().mean()))

Number of production companies per movie: 2.59


In [None]:
# Biggest production companies (by number of movies and by revenue)
biggest_companies = movie_production_companies.groupby('company_name').agg({'movie_id': 'count', 'revenue_usd_adj': 'sum'})
biggest_companies.columns = ['Number of Movies', 'Total Revenue']
biggest_companies = biggest_companies.sort_values(by='Number of Movies', ascending=False).head(10)
display(biggest_companies)

Unnamed: 0_level_0,Number of Movies,Total Revenue
company_name,Unnamed: 1_level_1,Unnamed: 2_level_1
warner_bros_pictures,558,128408600000.0
universal_pictures,548,114918000000.0
columbia_pictures,426,83510760000.0
paramount,414,97081180000.0
20th_century_fox,360,101741500000.0
canal_,246,12231470000.0
new_line_cinema,245,41426870000.0
metro_goldwyn_mayer,206,25239690000.0
touchstone_pictures,176,27621890000.0
walt_disney_pictures,173,74377660000.0


In [None]:
# Biggest parent production companies (by number of movies and by revenue)
biggest_parent_companies = movie_production_companies.groupby('parent_name').agg({'movie_id': 'count', 'revenue_usd_adj': 'sum'})
biggest_parent_companies.columns = ['Number of Movies', 'Total Revenue']
biggest_parent_companies = biggest_parent_companies.sort_values(by='Number of Movies', ascending=False).head(10).format({
    'Total Revenue': "${:,.0f}"
})
display(biggest_parent_companies)

Unnamed: 0_level_0,Number of Movies,Total Revenue
parent_name,Unnamed: 1_level_1,Unnamed: 2_level_1
warner_bros_entertainment,619,134577800000.0
universal_pictures,550,114933200000.0
columbia_pictures,426,83510760000.0
sony_pictures,375,71494880000.0
20th_century_fox,362,101788000000.0
new_line_cinema,277,41866320000.0
canal_,246,12231470000.0
walt_disney_pictures,194,94786740000.0
touchstone_pictures,176,27621890000.0
miramax,156,13131100000.0


In [None]:
# Companies with the most subsidiaries
subsidiaries = movie_production_companies.groupby('parent_name').agg({'company_name': 'nunique'})
subsidiaries.columns = ['Number of Subsidiaries']
subsidiaries = subsidiaries.sort_values(by='Number of Subsidiaries', ascending=False).head(10)
display(subsidiaries)

Unnamed: 0_level_0,Number of Subsidiaries
parent_name,Unnamed: 1_level_1
sony_pictures,4
universal_pictures,3
walt_disney_pictures,2
europacorp,2
new_line_cinema,2
warner_bros_entertainment,2
lotte_entertainment,2
ard,2
regency_enterprises,2
x_filme_creative_pool,2


# Keywords

In [None]:
movie_keywords = movie.merge(keywords, how='left', on='movie_id')

In [None]:
print('Number of keywords: ', len(movie_keywords['keyword_name'].unique()))

Number of keywords:  13264


In [None]:
# On average, how many keywords are there per movie?
print('Average number of keywords per movie: ', movie_keywords.groupby('movie_id')['keyword_name'].count().mean())

Average number of keywords per movie:  5.773925663146013


In [None]:
# The most popular keywords (by number of movies and by revenue)
popular_keywords = movie_keywords.groupby('keyword_name').agg({'movie_id': 'count', 'revenue_usd_adj': 'sum'})
popular_keywords.columns = ['Number of Movies', 'Total Revenue']
popular_keywords = popular_keywords.sort_values(by='Number of Movies', ascending=False).head(10).format({
    'Total Revenue': "${:,.0f}"
})
display(popular_keywords)

NameError: name 'movie_keywords' is not defined

# Time analysis

In [None]:
plot_and_export_categorical_distribution(movie, 'year', 0, "linear", False)

Unnamed: 0,grouped,total
0,1970,9
1,1971,11
2,1972,11
3,1973,10
4,1974,6
5,1975,12
6,1976,10
7,1977,9
8,1978,20
9,1979,24


In [None]:
plot_and_export_categorical_distribution(movie, 'quarter', 0, "linear", False)

Unnamed: 0,grouped,total
0,1,2917
1,2,2763
2,3,3340
3,4,3383


In [None]:
plot_and_export_categorical_distribution(movie, 'month', 0, "linear", False)

Unnamed: 0,grouped,total
0,1,983
1,2,905
2,3,1029
3,4,951
4,5,939
5,6,873
6,7,795
7,8,1048
8,9,1497
9,10,1309
