# Set up

In [1]:
import duckdb
import pandas as pd
from IPython.display import display
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda_support_functions import *

# Suppress all warnings for the rest of the session: the 'ignore' action with
# no category/module filter matches every warning raised after this point.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above — consider narrowing the filter; confirm intent.
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load the "no outliers" regression dataset from the ml_ready_data folder.
full__regression__no_outliers__none = pd.read_csv(
    filepath_or_buffer='../data/ml_ready_data/full__regression__no_outliers__none.csv',
)

In [3]:
# Load the "with outliers" regression dataset from the ml_ready_data folder.
full__regression__with_outliers__none = pd.read_csv(
    filepath_or_buffer='../data/ml_ready_data/full__regression__with_outliers__none.csv',
)

In [4]:
# Give both frames the short column names ('budget', 'revenue') that the rest
# of the notebook refers to. The rename is applied in place on each frame.
full__regression__no_outliers__none.rename(
    mapper={'budget_usd_adj': 'budget', "revenue_usd_adj": "revenue"},
    axis='columns',
    inplace=True,
)
full__regression__with_outliers__none.rename(
    mapper={'budget_usd_adj': 'budget', "revenue_usd_adj": "revenue"},
    axis='columns',
    inplace=True,
)

In [5]:
# Summary statistics for budget and revenue (no-outliers frame), rendered with
# both columns formatted as whole-dollar amounts.
display(
    full__regression__no_outliers__none
    .loc[:, ['budget', 'revenue']]
    .describe()
    .style
    .format(dict.fromkeys(['budget', 'revenue'], "${:,.0f}"))
)

Unnamed: 0,budget,revenue
count,"$6,661","$6,661"
mean,"$41,608,183","$85,552,314"
std,"$48,008,905","$163,480,900"
min,"$36,579","$28,233"
25%,"$10,858,945","$8,229,440"
50%,"$26,452,123","$29,733,018"
75%,"$52,319,012","$89,716,894"
max,"$513,392,778","$2,490,472,059"


In [6]:
import pandas as pd
import numpy as np

def _print_group_statistics(df, n_groups):
    """Print and display budget/revenue summary statistics per group label.

    Reports labels ``0 .. n_groups - 1`` of ``df['group']``. A label that has
    no rows is still reported; its statistics then come out as NaN.
    """
    money_format = {'budget': "${:,.0f}", 'revenue': "${:,.0f}"}
    for i in range(n_groups):
        group_data = df[df['group'] == i]
        budget = group_data['budget']
        revenue = group_data['revenue']
        budget_std = budget.std()

        print(f"Group {i}:")
        print(f"  Budget range: {budget.min():,.0f} - {budget.max():,.0f}")
        print(f"  Budget std: {budget_std:,.0f}")
        # Std divided by the mean, so spreads are comparable across groups
        # whose budgets differ by orders of magnitude.
        print(f"  Normalized budget std: {budget_std / budget.mean():.2f}")
        print(f"  Revenue range: {revenue.min():,.0f} - {revenue.max():,.0f}")
        print(f"  Number of movies: {len(group_data)}")
        display(group_data[['budget', 'revenue']].describe().style.format(money_format))


def flexible_budget_grouping(df, n_groups, flexibility=0.1):
    """Label each row of ``df`` with a budget-ordered group and report on the groups.

    Rows are ranked by ascending ``'budget'``. Every group except the last is
    closed as soon as it holds ``ideal_size * (1 - flexibility)`` rows (rounded
    up, at least one row), where ``ideal_size = len(df) / n_groups``; the last
    group takes all remaining rows. Because leading groups always close at the
    lower bound, the last group is larger than the others whenever
    ``flexibility > 0`` (e.g. three groups with ``flexibility=0.25`` split the
    rows roughly 25% / 25% / 50%).
    NOTE(review): if evenly sized groups are the goal, pass ``flexibility=0``
    or revisit the closing rule — confirm the intended behaviour.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'budget'`` and ``'revenue'`` columns. It is modified in
        place: an integer ``'group'`` column is added (or overwritten).
    n_groups : int
        Number of groups to create; must be at least 1.
    flexibility : float, default 0.1
        Fraction by which the leading groups are allowed to fall short of the
        ideal size.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``'group'`` column.

    Raises
    ------
    ValueError
        If ``n_groups`` is less than 1.
    """
    if n_groups < 1:
        raise ValueError(f"n_groups must be at least 1, got {n_groups}")

    n_rows = len(df)
    ideal_size = n_rows / n_groups
    # Smallest whole row count that satisfies `count >= ideal_size * (1 - flexibility)`;
    # a group always receives at least one row before it can be closed.
    rows_per_leading_group = max(1, int(np.ceil(ideal_size * (1 - flexibility))))

    # Row positions in ascending-budget order. Working with positions rather
    # than index labels keeps the assignment correct for duplicate indexes.
    budget_order = df['budget'].reset_index(drop=True).sort_values().index.to_numpy()

    # Group of the k-th cheapest row; everything past the last cut is folded
    # into the final group.
    group_by_rank = np.minimum(np.arange(n_rows) // rows_per_leading_group, n_groups - 1)

    labels = np.empty(n_rows, dtype=np.int64)
    labels[budget_order] = group_by_rank
    df['group'] = labels

    _print_group_statistics(df, n_groups)
    return df

In [8]:
# Usage
n_groups = 3  # You can change this to 2 if you prefer
# The function writes the 'group' column onto the frame it receives, so bind
# `df` to that frame directly instead of relying on the call's return value.
flexible_budget_grouping(full__regression__no_outliers__none, n_groups, flexibility=0.25)
df = full__regression__no_outliers__none

Group 0:
  Budget range: 36,579 - 10,858,945
  Budget std: 3,107,032
  Normalized budget std: 0.60
  Revenue range: 28,233 - 76,992,804
  Number of movies: 1666


Unnamed: 0,budget,revenue
count,"$1,666","$1,666"
mean,"$5,202,735","$9,353,674"
std,"$3,107,032","$11,702,093"
min,"$36,579","$28,233"
25%,"$2,523,859","$1,583,376"
50%,"$5,133,894","$4,761,449"
75%,"$7,727,016","$12,567,588"
max,"$10,858,945","$76,992,804"


Group 1:
  Budget range: 10,858,945 - 26,452,123
  Budget std: 4,567,613
  Normalized budget std: 0.25
  Revenue range: 1,071,701 - 179,631,482
  Number of movies: 1666


Unnamed: 0,budget,revenue
count,"$1,666","$1,666"
mean,"$18,133,498","$32,247,757"
std,"$4,567,613","$32,164,380"
min,"$10,858,945","$1,071,701"
25%,"$14,041,772","$7,722,662"
50%,"$17,694,657","$21,187,581"
75%,"$22,187,039","$46,154,312"
max,"$26,452,123","$179,631,482"


Group 2:
  Budget range: 26,452,123 - 513,392,778
  Budget std: 52,531,729
  Normalized budget std: 0.73
  Revenue range: 2,530,555 - 2,490,472,059
  Number of movies: 3329


Unnamed: 0,budget,revenue
count,"$3,329","$3,329"
mean,"$71,575,231","$150,362,265"
std,"$52,531,729","$210,635,808"
min,"$26,452,123","$2,530,555"
25%,"$37,292,183","$32,517,605"
50%,"$52,319,012","$79,394,100"
75%,"$86,025,409","$181,016,540"
max,"$513,392,778","$2,490,472,059"
