# Set up

In [163]:
import duckdb
import pandas as pd
from IPython.display import display
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda_support_functions import *

# Suppress all warnings for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so any deprecation or data-quality
# warning raised through the `warnings` module by later cells is hidden too.
import warnings

warnings.filterwarnings('ignore')

In [164]:
# Load the "none" variant of the ML-ready regression data (outliers included, per the file name).
# The path is relative to the notebook's working directory.
full__regression__none = pd.read_csv('../data/ml_ready_data/full__regression__with_outliers__none.csv')

In [165]:
# Load the "complex" variant of the ML-ready regression data (outliers included, per the file name).
# The path is relative to the notebook's working directory.
full__regression__complex = pd.read_csv('../data/ml_ready_data/full__regression__with_outliers__complex.csv')

In [166]:
# Sanity check: (number of rows, number of columns) of the "complex" frame.
full__regression__complex.shape

(12127, 387)

In [167]:
# Convert columns to pandas' nullable extension dtypes. The boolean-column
# detection further down compares each column's dtype against 'boolean',
# which is the dtype convert_dtypes assigns to True/False columns.
full__regression__complex = full__regression__complex.convert_dtypes(infer_objects=True)

In [168]:
# Catalogue every column of full__regression__complex together with the pieces
# of its name obtained by splitting on the "__" separator:
#   level_1 -> the text before the first "__" (the whole name if there is none)
#   level_2 -> the text between the first and second "__", or None when the
#              name contains no "__"
def _name_part(column_name, position):
    """Return the `position`-th "__"-separated piece of `column_name`, or None if absent."""
    pieces = column_name.split('__')
    return pieces[position] if len(pieces) > position else None


full__regression__complex_columns = (
    pd.DataFrame({'column_name': list(full__regression__complex.columns)})
    .assign(
        level_1=lambda cols: cols['column_name'].map(lambda name: _name_part(name, 0)),
        level_2=lambda cols: cols['column_name'].map(lambda name: _name_part(name, 1)),
    )
)

In [169]:
# For every column whose dtype is 'boolean', count how many rows are True
# (the sum of a boolean column); columns of any other dtype get a missing value.
full__regression__complex_columns['boolean_true'] = full__regression__complex_columns['column_name'].map(
    {
        column: full__regression__complex[column].sum()
        for column, dtype in full__regression__complex.dtypes.items()
        if dtype == 'boolean'
    }
)

In [170]:
# Keep only the rows that describe boolean columns (those with a True-count),
# order them from most to least frequently True, and add the share of rows
# that are True (`normalized` = True-count / number of rows in the data).
a = (
    full__regression__complex_columns
    .loc[lambda cols: cols['boolean_true'].notna()]
    .sort_values('boolean_true', ascending=False)
    .assign(normalized=lambda cols: cols['boolean_true'] / len(full__regression__complex))
)

In [171]:
# Show the per-boolean-column summary (True-count and share of rows that are True).
a

Unnamed: 0,column_name,level_1,level_2,boolean_true,normalized
310,is_spoken_language__en,is_spoken_language,en,9078.0,0.748578
7,is_released__US,is_released,US,8549.0,0.704956
373,is_prod_country__US,is_prod_country,US,7384.0,0.608889
58,is_genre__Drama,is_genre,Drama,6444.0,0.531376
9,is_released__FR,is_released,FR,6367.0,0.525027
...,...,...,...,...,...
260,is_collection__los_superagentes,is_collection,los_superagentes,0.0,0.000000
259,is_collection__lone_wolf_and_cub_collection,is_collection,lone_wolf_and_cub_collection,0.0,0.000000
258,is_collection__les_charlots_saga,is_collection,les_charlots_saga,0.0,0.000000
256,is_collection__lemon_popsicle,is_collection,lemon_popsicle,0.0,0.000000


In [172]:
# Mean / min / max share of True values per column family (level_1).
# The top header level ('normalized') is blanked so only the statistic names show.
(
    a.groupby('level_1')[['normalized']]
    .agg(['mean', 'min', 'max'])
    .rename(columns={'normalized': ''}, level=0)
)

Unnamed: 0_level_0,mean,min,max
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
is_collection,0.000191,0.0,0.001484
is_genre,0.124186,0.000577,0.531376
is_keyword,0.019244,0.001567,0.077431
is_on_holiday_window,0.403232,0.403232,0.403232
is_outlier,0.45073,0.45073,0.45073
is_prod_company,0.007709,0.0,0.05294
is_prod_country,0.032741,0.000495,0.608889
is_released,0.392447,0.038179,0.704956
is_spoken_language,0.047082,0.001897,0.748578


In [173]:
# Per column family (level_1): how many columns carry a level_2 suffix, and the
# total number of True values across the family's boolean columns.
# Families are listed from most to fewest columns.
(
    full__regression__complex_columns
    .groupby('level_1')
    .agg(
        level_2=('level_2', 'count'),
        boolean_true=('boolean_true', 'sum'),
    )
    .sort_values('level_2', ascending=False)
)

Unnamed: 0_level_0,level_2,boolean_true
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1
is_collection,91,211.0
is_prod_company,89,8320.0
is_keyword,53,12369.0
is_prod_country,42,16676.0
is_spoken_language,29,16558.0
is_genre,19,28614.0
is_released,5,23796.0
actor_kpis,4,0.0
producer_kpis,4,0.0
genre_kpis,4,0.0


In [174]:
# Preview the "none" variant of the data (pandas truncates the rendered rows).
full__regression__none

Unnamed: 0,movie_id,original_language,runtime,ageCert,quarter,month,year,is_released__US,is_released__CN,is_released__FR,is_released__GB,is_released__JP,budget_usd_adj,revenue_usd_adj,is_outlier,production_size
0,340666,en,117,R,4,11,2016,True,False,True,True,False,2.856498e+07,3.848256e+07,False,large_productions
1,242911,te,168,U,4,12,2006,False,False,False,False,False,5.481741e+06,9.593046e+06,True,small_productions
2,179826,en,100,PG13,1,1,2013,True,False,False,False,True,3.531533e+07,1.503213e+06,True,large_productions
3,340676,fr,106,R,4,12,2016,True,False,True,True,False,7.617328e+06,3.606053e+06,False,small_productions
4,505058,en,93,R,3,7,2018,True,False,True,True,False,1.213435e+06,1.941496e+07,True,small_productions
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12122,276478,en,95,PG13,2,4,1988,True,False,False,False,True,8.567825e+06,2.622214e+05,True,small_productions
12123,14342,en,90,PG13,2,5,1998,True,False,False,False,False,5.608012e+07,1.147185e+07,False,large_productions
12124,441894,hi,136,U,3,9,2017,False,False,False,False,False,1.528980e+07,3.356297e+07,True,medium_productions
12125,54948,pt,102,U,2,5,2001,False,False,False,False,False,1.032305e+06,6.225913e+04,True,small_productions


In [175]:
# Sanity check: (number of rows, number of columns) of the "none" frame.
full__regression__none.shape

(12127, 16)

In [176]:
# Contingency table of outlier status (rows) against production size (columns),
# with a 'Total' row and a 'Total' column holding the marginal counts.
# pd.crosstab computes the counts and both margins in one call, so no manual
# unstack / concat / fillna is needed.
(pd.crosstab(
     full__regression__none['is_outlier'],
     full__regression__none['production_size'],
     margins=True,
     margins_name='Total',
 )
 .rename(index={False: 'Not Outlier', True: 'Outlier'})
 .rename_axis(index=None)  # drop the 'is_outlier' index label from the rendered table
 .astype(int)              # the "{:,d}" format below requires integer cells
 .style
 .format("{:,d}")
)

production_size,large_productions,medium_productions,small_productions,Total
Not Outlier,3480,1384,1797,6661
Outlier,788,1017,3661,5466
Total,4268,2401,5458,12127


In [177]:
# Budget-to-revenue ratio per movie (both inflation-adjusted USD columns).
# For positive revenue, a ratio above 1 means the budget exceeded the revenue.
full__regression__none['ratio'] = full__regression__none['budget_usd_adj'].div(
    full__regression__none['revenue_usd_adj']
)

In [182]:
# Ten non-outlier movies with the HIGHEST budget-to-revenue ratio,
# with the two money columns rendered as whole dollars.
display(
    full__regression__none
    .loc[~full__regression__none['is_outlier']]
    .sort_values(by='ratio', ascending=False)
    .loc[:, ['movie_id', 'original_language', 'year', 'ratio', 'budget_usd_adj', 'revenue_usd_adj']]
    .head(10)
    .style
    .format({
        'budget_usd_adj': "${:,.0f}",
        'revenue_usd_adj': "${:,.0f}",
    })
)

Unnamed: 0,movie_id,original_language,year,ratio,budget_usd_adj,revenue_usd_adj
1356,57709,fr,2004,12.299485,"$40,815,621","$3,318,482"
7402,46686,en,1999,12.279883,"$914,472","$74,469"
9401,11110,he,2006,12.26422,"$2,267,128","$184,857"
7632,26291,en,1991,12.163019,"$3,132,032","$257,505"
5545,468592,pt,2019,12.12818,"$797,661","$65,769"
1056,36141,en,1995,12.126345,"$12,995,820","$1,071,701"
7289,159037,ar,2013,12.073018,"$1,961,963","$162,508"
4330,325173,en,2015,12.049209,"$3,856,711","$320,080"
5994,24016,en,1990,12.047705,"$58,282,708","$4,837,661"
11710,32686,en,1996,12.031626,"$679,705","$56,493"


In [180]:
# Ten non-outlier movies with the LOWEST budget-to-revenue ratio,
# with the two money columns rendered as whole dollars.
display(
    full__regression__none
    .loc[~full__regression__none['is_outlier']]
    .sort_values(by='ratio', ascending=True)
    .loc[:, ['movie_id', 'ratio', 'budget_usd_adj', 'revenue_usd_adj']]
    .head(10)
    .style
    .format({
        'budget_usd_adj': "${:,.0f}",
        'revenue_usd_adj': "${:,.0f}",
    })
)

Unnamed: 0,movie_id,ratio,budget_usd_adj,revenue_usd_adj
806,297802,0.138885,"$194,149,586","$1,397,911,470"
11574,109513,0.138889,"$2,654,268","$19,110,729"
9356,11077,0.138889,"$23,313,083","$167,854,200"
5364,91186,0.138952,"$527,492","$3,796,206"
9925,145135,0.139031,"$4,577,914","$32,927,383"
4408,4233,0.139241,"$45,562,916","$327,223,941"
1193,512196,0.139319,"$10,726,552","$76,992,804"
4415,325348,0.13953,"$2,571,140","$18,427,094"
3967,3049,0.139903,"$30,840,283","$220,440,992"
3404,383498,0.139968,"$133,477,840","$953,634,409"
