# Set up

In [32]:
import duckdb
import pandas as pd
from IPython.display import display
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda_support_functions import *

# Suppress all warnings
import warnings

warnings.filterwarnings('ignore')

In [33]:
full__regression__no_outliers__none = pd.read_csv('../data/ml_ready_data/full__regression__no_outliers__none.csv')

In [34]:
full__regression__with_outliers__none = pd.read_csv('../data/ml_ready_data/full__regression__with_outliers__none.csv')

In [35]:
import numpy as np
from scipy.optimize import minimize

def revenue_deviation(split_points, budgets, revenues):
    # Ensure split points are sorted
    split_points = np.sort(split_points)
    
    # Add min and max budget as boundaries
    boundaries = np.concatenate(([min(budgets)], split_points, [max(budgets)]))
    
    total_deviation = 0
    for i in range(len(boundaries) - 1):
        # Get movies in this budget range
        mask = (budgets >= boundaries[i]) & (budgets < boundaries[i+1])
        group_revenues = revenues[mask]
        
        if len(group_revenues) > 0:
            # Calculate deviation as sum of squared differences from mean
            mean_revenue = np.mean(group_revenues)
            deviation = np.sum((group_revenues - mean_revenue)**2)
            total_deviation += deviation
    
    return total_deviation

In [36]:
def find_optimal_splits(budgets, revenues, n_splits):
    # Initial guess: evenly spaced split points
    initial_guess = np.linspace(min(budgets), max(budgets), n_splits + 2)[1:-1]
    
    # Define bounds for split points
    bounds = [(min(budgets), max(budgets)) for _ in range(n_splits)]
    
    # Optimize
    result = minimize(
        revenue_deviation,
        initial_guess,
        args=(budgets, revenues),
        method='L-BFGS-B',
        bounds=bounds
    )
    
    return np.sort(result.x)



In [37]:
def main(df):
    # Usage
    n_splits = 1  # for 3 groups
    optimal_splits = find_optimal_splits(df['budget'].values, df['revenue'].values, n_splits)

    # Create groups based on optimal splits
    df['group'] = pd.cut(df['budget'], 
                        bins=[-np.inf] + list(optimal_splits) + [np.inf], 
                        labels=['low', 'high'])

    for group in df['group'].unique():
        group_data = df[df['group'] == group]
        print(f"Group: {group}")
        print(f"  Budget range: {group_data['budget'].min()} - {group_data['budget'].max()}")
        print(f"  Revenue range: {group_data['revenue'].min()} - {group_data['revenue'].max()}")
        print(f"  Number of movies: {len(group_data)}")
        print(f"  Revenue standard deviation: {group_data['revenue'].std()}")
        print()

In [38]:
full__regression__no_outliers__none['budget'] = full__regression__no_outliers__none['budget_usd_adj']
full__regression__no_outliers__none['revenue'] = full__regression__no_outliers__none['revenue_usd_adj']
main(full__regression__no_outliers__none)

Group: low
  Budget range: 27091.967155539947 - 255055181.85759604
  Revenue range: 28577.961554021313 - 2148878781.951033
  Number of movies: 5854
  Revenue standard deviation: 176348862.11730936

Group: high
  Budget range: 257033895.70552143 - 513392777.597482
  Revenue range: 256291676.55371317 - 4298409951.924026
  Number of movies: 48
  Revenue standard deviation: 859028731.5824009



In [39]:
full__regression__with_outliers__none['budget'] = full__regression__with_outliers__none['budget_usd_adj']
full__regression__with_outliers__none['revenue'] = full__regression__with_outliers__none['revenue_usd_adj']
main(full__regression__with_outliers__none)

Group: low
  Budget range: 1.7205081874647092 - 255055181.85759604
  Revenue range: 2.5711404667175772 - 3898767714.998581
  Number of movies: 7576
  Revenue standard deviation: 202567626.69117427

Group: high
  Budget range: 257033895.70552143 - 513392777.597482
  Revenue range: 256291676.55371317 - 4298409951.924026
  Number of movies: 58
  Revenue standard deviation: 797470319.5153462



In [41]:
import pandas as pd
import numpy as np

def flexible_budget_grouping(df, n_groups, flexibility=0.1):
    # Sort the dataframe by budget
    df_sorted = df.sort_values('budget')
    
    # Calculate the ideal size for each group
    ideal_size = len(df) / n_groups
    
    # Initialize groups
    groups = []
    current_group = []
    current_size = 0
    
    for _, movie in df_sorted.iterrows():
        current_group.append(movie)
        current_size += 1
        
        # Check if we should start a new group
        if current_size >= ideal_size * (1 - flexibility) and len(groups) < n_groups - 1:
            # If we're close to the ideal size or over it, and we're not on the last group
            groups.append(current_group)
            current_group = []
            current_size = 0
    
    # Add the last group
    groups.append(current_group)
    
    # Assign group labels
    df['group'] = 0
    for i, group in enumerate(groups):
        for movie in group:
            df.loc[movie.name, 'group'] = i
    
    # Print group statistics
    for i in range(n_groups):
        group_data = df[df['group'] == i]
        print(f"Group {i}:")
        print(f"  Budget range: {group_data['budget'].min():,.0f} - {group_data['budget'].max():,.0f}")
        print(f"  Revenue range: {group_data['revenue'].min():,.0f} - {group_data['revenue'].max():,.0f}")
        print(f"  Number of movies: {len(group_data)}")
        print(f"  Median budget: {group_data['budget'].median():,.0f}")
        print(f"  Median revenue: {group_data['revenue'].median():,.0f}")
        print()
    
    return df

# Usage
n_groups = 2  # You can change this to 2 if you prefer
df = flexible_budget_grouping(full__regression__no_outliers__none, n_groups, flexibility=0.2)

# Optionally, create separate datasets
datasets = [df[df['group'] == i] for i in range(n_groups)]

Group 0:
  Budget range: 27,092 - 21,717,890
  Revenue range: 28,578 - 229,835,047
  Number of movies: 2361
  Median budget: 9,061,770
  Median revenue: 11,032,188

Group 1:
  Budget range: 21,717,890 - 513,392,778
  Revenue range: 1,233,353 - 4,298,409,952
  Number of movies: 3541
  Median budget: 49,876,815
  Median revenue: 88,433,111

