# Set up

In [10]:
import duckdb
import pandas as pd
from IPython.display import display
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda_support_functions import *

# Suppress all warnings
import warnings

warnings.filterwarnings('ignore')

In [11]:
full__regression__no_outliers__none = pd.read_csv('../data/ml_ready_data/full__regression__no_outliers__none.csv')

In [12]:
full__regression__with_outliers__none = pd.read_csv('../data/ml_ready_data/full__regression__with_outliers__none.csv')

In [13]:
import numpy as np
from scipy.optimize import minimize

def revenue_deviation(split_points, budgets, revenues):
    # Ensure split points are sorted
    split_points = np.sort(split_points)
    
    # Add min and max budget as boundaries
    boundaries = np.concatenate(([min(budgets)], split_points, [max(budgets)]))
    
    total_deviation = 0
    for i in range(len(boundaries) - 1):
        # Get movies in this budget range
        mask = (budgets >= boundaries[i]) & (budgets < boundaries[i+1])
        group_revenues = revenues[mask]
        
        if len(group_revenues) > 0:
            # Calculate deviation as sum of squared differences from mean
            mean_revenue = np.mean(group_revenues)
            deviation = np.sum((group_revenues - mean_revenue)**2)
            total_deviation += deviation
    
    return total_deviation

In [14]:
def find_optimal_splits(budgets, revenues, n_splits):
    # Initial guess: evenly spaced split points
    initial_guess = np.linspace(min(budgets), max(budgets), n_splits + 2)[1:-1]
    
    # Define bounds for split points
    bounds = [(min(budgets), max(budgets)) for _ in range(n_splits)]
    
    # Optimize
    result = minimize(
        revenue_deviation,
        initial_guess,
        args=(budgets, revenues),
        method='L-BFGS-B',
        bounds=bounds
    )
    
    return np.sort(result.x)



In [15]:
def main(df):
    # Usage
    n_splits = 1  # for 3 groups
    optimal_splits = find_optimal_splits(df['budget'].values, df['revenue'].values, n_splits)

    # Create groups based on optimal splits
    df['group'] = pd.cut(df['budget'], 
                        bins=[-np.inf] + list(optimal_splits) + [np.inf], 
                        labels=['low', 'high'])

    for group in df['group'].unique():
        group_data = df[df['group'] == group]
        print(f"Group: {group}")
        print(f"  Budget range: {group_data['budget'].min()} - {group_data['budget'].max()}")
        print(f"  Revenue range: {group_data['revenue'].min()} - {group_data['revenue'].max()}")
        print(f"  Number of movies: {len(group_data)}")
        print(f"  Revenue standard deviation: {group_data['revenue'].std()}")
        print()

In [16]:
full__regression__no_outliers__none['budget'] = full__regression__no_outliers__none['budget_usd_adj']
full__regression__no_outliers__none['revenue'] = full__regression__no_outliers__none['revenue_usd_adj']
main(full__regression__no_outliers__none)

Group: low
  Budget range: 38086.63913969176 - 255055181.85759604
  Revenue range: 31601.469109008303 - 2148878781.951033
  Number of movies: 5259
  Revenue standard deviation: 181798562.15072462

Group: high
  Budget range: 257033895.70552143 - 513392777.597482
  Revenue range: 256291676.55371317 - 4298409951.924026
  Number of movies: 48
  Revenue standard deviation: 859028731.5824009



In [17]:
full__regression__with_outliers__none['budget'] = full__regression__with_outliers__none['budget_usd_adj']
full__regression__with_outliers__none['revenue'] = full__regression__with_outliers__none['revenue_usd_adj']
main(full__regression__with_outliers__none)

Group: low
  Budget range: 1.7205081874647092 - 255055181.85759604
  Revenue range: 1.2134349102175566 - 3898767714.998581
  Number of movies: 7585
  Revenue standard deviation: 202466403.4665562

Group: high
  Budget range: 257033895.70552143 - 513392777.597482
  Revenue range: 256291676.55371317 - 4298409951.924026
  Number of movies: 58
  Revenue standard deviation: 797470319.5153462



In [26]:
display(full__regression__no_outliers__none[['budget', 'revenue']].describe().style.format({'budget': "${:,.0f}",'revenue': "${:,.0f}"}))

Unnamed: 0,budget,revenue
count,"$5,307","$5,307"
mean,"$48,835,346","$122,355,702"
std,"$52,301,470","$223,810,968"
min,"$38,087","$31,601"
25%,"$14,545,681","$15,125,960"
50%,"$32,282,554","$48,174,229"
75%,"$62,872,163","$133,050,676"
max,"$513,392,778","$4,298,409,952"


In [25]:
import pandas as pd
import numpy as np

def flexible_budget_grouping(df, n_groups, flexibility=0.1):
    # Sort the dataframe by budget
    df_sorted = df.sort_values('budget')
    
    # Calculate the ideal size for each group
    ideal_size = len(df) / n_groups
    
    # Initialize groups
    groups = []
    current_group = []
    current_size = 0
    
    for _, movie in df_sorted.iterrows():
        current_group.append(movie)
        current_size += 1
        
        # Check if we should start a new group
        if current_size >= ideal_size * (1 - flexibility) and len(groups) < n_groups - 1:
            # If we're close to the ideal size or over it, and we're not on the last group
            groups.append(current_group)
            current_group = []
            current_size = 0
    
    # Add the last group
    groups.append(current_group)
    
    # Assign group labels
    df['group'] = 0
    for i, group in enumerate(groups):
        for movie in group:
            df.loc[movie.name, 'group'] = i
    
    # Print group statistics
    for i in range(n_groups):
        group_data = df[df['group'] == i]
        print(f"Group {i}:")
        print(f"  Budget range: {group_data['budget'].min():,.0f} - {group_data['budget'].max():,.0f}")
        print(f"  Revenue range: {group_data['revenue'].min():,.0f} - {group_data['revenue'].max():,.0f}")
        print(f"  Number of movies: {len(group_data)}")
        display(group_data[['budget', 'revenue']].describe().style.format({'budget': "${:,.0f}",'revenue': "${:,.0f}"}))


# Usage
n_groups = 3  # You can change this to 2 if you prefer
df = flexible_budget_grouping(full__regression__no_outliers__none, n_groups, flexibility=0.2)

# # Optionally, create separate datasets
# datasets = [df[df['group'] == i] for i in range(n_groups)]

Group 0:
  Budget range: 38,087 - 15,454,032
  Revenue range: 31,601 - 178,821,608
  Number of movies: 1416


Unnamed: 0,budget,revenue
count,"$1,416","$1,416"
mean,"$7,613,304","$20,311,061"
std,"$4,438,099","$26,280,585"
min,"$38,087","$31,601"
25%,"$3,823,269","$2,650,607"
50%,"$7,320,218","$9,703,978"
75%,"$11,591,924","$27,348,189"
max,"$15,454,032","$178,821,608"


Group 1:
  Budget range: 15,454,032 - 35,847,294
  Revenue range: 580,004 - 445,972,927
  Number of movies: 1416


Unnamed: 0,budget,revenue
count,"$1,416","$1,416"
mean,"$25,073,689","$62,028,373"
std,"$5,689,114","$66,120,185"
min,"$15,454,032","$580,004"
25%,"$20,082,650","$16,563,547"
50%,"$24,941,462","$40,272,437"
75%,"$29,686,022","$84,547,659"
max,"$35,847,294","$445,972,927"


Group 2:
  Budget range: 35,878,794 - 513,392,778
  Revenue range: 1,184,356 - 4,298,409,952
  Number of movies: 2475


Unnamed: 0,budget,revenue
count,"$2,475","$2,475"
mean,"$86,013,898","$215,252,151"
std,"$56,199,696","$296,410,913"
min,"$35,878,794","$1,184,356"
25%,"$47,461,371","$47,993,813"
50%,"$66,356,699","$120,028,784"
75%,"$104,847,168","$261,579,157"
max,"$513,392,778","$4,298,409,952"
