# Set up

In [8]:
import duckdb
import pandas as pd
from IPython.display import display
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda_support_functions import *

# Suppress all warnings.
# NOTE(review): the blanket 'ignore' filter applies to every warning category
# for the rest of the session, so deprecation and library warnings raised by
# later cells are hidden as well.
import warnings

warnings.filterwarnings('ignore')

In [9]:
# Load the ML-ready regression table; per its file name this is the variant
# without outliers (the filtering itself is not visible in this notebook).
full__regression__no_outliers__none = pd.read_csv('../data/ml_ready_data/full__regression__no_outliers__none.csv')

In [10]:
# Load the companion ML-ready regression table; per its file name this variant
# keeps the outliers.
full__regression__with_outliers__none = pd.read_csv('../data/ml_ready_data/full__regression__with_outliers__none.csv')

In [11]:
# Give both frames the same short column names ('budget', 'revenue') in place.
for _frame in (full__regression__no_outliers__none, full__regression__with_outliers__none):
    _frame.rename(
        columns={'budget_usd_adj': 'budget', "revenue_usd_adj": "revenue"},
        inplace=True,
    )

In [12]:
# Summary statistics of budget and revenue, rendered as whole-dollar amounts.
display(
    full__regression__no_outliers__none
    .loc[:, ['budget', 'revenue']]
    .describe()
    .style.format({column: "${:,.0f}" for column in ('budget', 'revenue')})
)

Unnamed: 0,budget,revenue
count,"$7,053","$7,053"
mean,"$41,068,872","$97,263,313"
std,"$48,081,659","$199,139,368"
min,"$35,568","$28,233"
25%,"$10,543,322","$8,902,603"
50%,"$25,737,369","$32,610,539"
75%,"$51,422,809","$99,679,465"
max,"$513,392,778","$4,298,409,952"


In [13]:
import pandas as pd
import numpy as np

def _print_group_stats(label, group_data):
    """Print budget/revenue summary lines for one group and display its describe() table."""
    budget = group_data['budget']
    budget_std = budget.std()
    print(f"Group {label}:")
    print(f"  Budget range: {budget.min():,.0f} - {budget.max():,.0f}")
    print(f"  Budget std: {budget_std:,.0f}")
    # Std divided by the group's mean budget, so groups of different scale can be compared.
    print(f"  Normalized budget std: {budget_std / budget.mean():.2f}")
    print(f"  Revenue range: {group_data['revenue'].min():,.0f} - {group_data['revenue'].max():,.0f}")
    print(f"  Number of movies: {len(group_data)}")
    display(group_data[['budget', 'revenue']].describe().style.format({'budget': "${:,.0f}",'revenue': "${:,.0f}"}))


def flexible_budget_grouping(df, n_groups, flexibility=0.1, show_stats=True):
    """Label every row of ``df`` with a budget-ordered group number.

    Rows are ranked by ``budget`` in ascending order. Each of the first
    ``n_groups - 1`` groups takes the smallest whole number of rows that
    reaches ``len(df) / n_groups * (1 - flexibility)`` (at least one row);
    the last group takes all remaining rows, so with ``flexibility > 0`` it
    is larger than the others. If there are fewer rows than that rule needs,
    the trailing groups stay empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``budget`` and ``revenue`` columns. It is modified in
        place: an integer ``group`` column (0 .. n_groups - 1) is added or
        overwritten.
    n_groups : int
        Number of groups to create; must be at least 1.
    flexibility : float, default 0.1
        Fraction by which the size of the leading groups is reduced relative
        to the ideal equal split.
    show_stats : bool, default True
        When true, print per-group statistics and display a ``describe()``
        table for each group. Pass ``False`` to label the frame silently.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, carrying the ``group`` column.

    Raises
    ------
    ValueError
        If ``n_groups`` is smaller than 1.
    """
    if n_groups < 1:
        raise ValueError(f"n_groups must be at least 1, got {n_groups}")

    n_rows = len(df)
    # A group is closed as soon as its row count reaches this threshold; a row
    # count is a whole number, hence the ceil, and a group holds at least one row.
    size_threshold = n_rows / n_groups * (1 - flexibility)
    rows_per_group = max(1, int(np.ceil(size_threshold)))

    # Stable ranking by budget (NaN budgets sort last). Working with positions
    # instead of index labels keeps the labelling correct for duplicate indexes.
    order = np.argsort(df['budget'].to_numpy(), kind='stable')
    rank_labels = np.minimum(
        np.arange(n_rows, dtype=np.int64) // rows_per_group,
        n_groups - 1,  # everything past the last cut belongs to the final group
    )
    group_labels = np.empty(n_rows, dtype=np.int64)
    group_labels[order] = rank_labels
    df['group'] = group_labels

    if show_stats:
        for label in range(n_groups):
            _print_group_stats(label, df[df['group'] == label])

    return df

In [14]:
# Usage: split the no_outliers frame into budget groups.
# The call writes a 'group' column into the frame it is given and prints
# per-group statistics.
n_groups = 3  # You can change this to 2 if you prefer
df = flexible_budget_grouping(full__regression__no_outliers__none, n_groups, flexibility=0.1)

Group 0:
  Budget range: 35,568 - 13,079,753
  Budget std: 3,780,787
  Normalized budget std: 0.62
  Revenue range: 28,233 - 152,742,645
  Number of movies: 2116


Unnamed: 0,budget,revenue
count,"$2,116","$2,116"
mean,"$6,146,895","$13,944,135"
std,"$3,780,787","$19,656,405"
min,"$35,568","$28,233"
25%,"$2,813,243","$1,984,480"
50%,"$5,944,886","$6,259,178"
75%,"$9,346,687","$17,544,814"
max,"$13,079,753","$152,742,645"


Group 1:
  Budget range: 13,079,753 - 33,874,597
  Budget std: 6,050,905
  Normalized budget std: 0.27
  Revenue range: 1,140,106 - 392,419,542
  Number of movies: 2116


Unnamed: 0,budget,revenue
count,"$2,116","$2,116"
mean,"$22,662,014","$49,782,350"
std,"$6,050,905","$56,406,308"
min,"$13,079,753","$1,140,106"
25%,"$17,205,082","$11,429,125"
50%,"$22,496,663","$29,627,174"
75%,"$27,975,700","$67,388,407"
max,"$33,874,597","$392,419,542"


Group 2:
  Budget range: 33,874,597 - 513,392,778
  Budget std: 54,525,936
  Normalized budget std: 0.67
  Revenue range: 3,162,564 - 4,298,409,952
  Number of movies: 2821


Unnamed: 0,budget,revenue
count,"$2,821","$2,821"
mean,"$81,070,224","$195,375,010"
std,"$54,525,936","$282,777,890"
min,"$33,874,597","$3,162,564"
25%,"$44,743,319","$40,713,827"
50%,"$60,846,790","$101,763,697"
75%,"$97,320,616","$238,155,583"
max,"$513,392,778","$4,298,409,952"
