In [None]:
## How this simulated data was created

# NOTE(review): placeholder -- the expression that loads the original dataset
# appears to have been redacted from this listing. The line is not valid Python
# as written; substitute the real load call before running.
# TODO confirm the source file and its format.
capacity = *****Original file*********

def to_clean(val):
    """Return *val* trimmed, lower-cased, and with each space turned into ``_``.

    Only the space character is replaced (tabs and other whitespace are left
    alone), and consecutive spaces each become their own underscore.
    """
    lowered = val.strip().lower()
    # Splitting on a literal " " keeps empty pieces, so runs of spaces map to
    # runs of underscores when the pieces are joined back together.
    return "_".join(lowered.split(" "))

# Normalise every column header with to_clean (trimmed, lower-case, spaces -> "_").
capacity = capacity.rename(to_clean, axis='columns')

# Parse the month column into datetimes.
capacity['year_month'] = pd.to_datetime(capacity['year_month'])
# Convert hours to numeric; entries that cannot be parsed become NaN.
capacity['adj_total_hours'] = pd.to_numeric(capacity['adj_total_hours'], errors='coerce')
# Store the identifier-like columns as categoricals.
capacity[['userid', 'position', 'team_lead_id']] = capacity[
    ['userid', 'position', 'team_lead_id']
].astype('category')


# Columns with a numeric dtype; these are the metrics that get summarised.
numeric_cols = capacity.select_dtypes(include='number').columns

# Per-month summary statistics in long form: one row per (year_month, metric)
# and one column per statistic.
capacity_stats = (
    capacity.groupby(by='year_month')[numeric_cols]
    .aggregate(['mean', 'std', 'min', 'max', 'median'])
    # Move the metric names from the column index into the row index.
    .stack(level=0, future_stack=True)
    .reset_index()
    # The stacked level comes back from reset_index as 'level_1'; give it a usable name.
    .rename({'level_1': 'column'}, axis='columns')
)


def _draw_clipped_metric(rng, stats_m, metric, round_to):
    """Draw one value for *metric* from its row of per-month summary stats."""
    mu, sigma = stats_m.at[metric, 'mean'], stats_m.at[metric, 'std']
    mn, mx = stats_m.at[metric, 'min'], stats_m.at[metric, 'max']
    # A NaN spread (e.g. the sample std of a month with a single observation)
    # would turn every simulated value into NaN. Treat it as "no spread" but
    # still make the draw so the random stream advances exactly as before.
    if pd.isna(sigma):
        sigma = 0.0
    v = rng.normal(mu, sigma)
    v = float(np.clip(v, mn, mx))
    # Snap to the nearest multiple of round_to (this happens after clipping).
    return np.round(v / round_to) * round_to


def simulate_wide_fte_with_components(capacity_stats, n_per_month=50,
                                      team_lead_ids=None,
                                      position_probs=None,
                                      user_id_low=10000,
                                      user_id_high=None,
                                      round_to=2,
                                      seed=None):
    """Simulate a wide per-user, per-month table from monthly summary statistics.

    For every distinct ``year_month`` in *capacity_stats*, ``n_per_month``
    synthetic users are generated. Each metric is drawn from a normal
    distribution with that month's mean/std, clipped to the month's min/max,
    and snapped to a multiple of ``round_to``. The three filing components
    (``new_policy_count``, ``renewal_count``, ``endorsement__count``) are
    simulated individually and ``total_filing`` is computed as their sum
    rather than drawn on its own.

    Parameters
    ----------
    capacity_stats : pandas.DataFrame
        Long-form stats with columns ``year_month``, ``column`` (metric name),
        ``mean``, ``std``, ``min`` and ``max``. Every month must contain a row
        for each of the three filing components.
    n_per_month : int
        Number of simulated users per month.
    team_lead_ids : sequence of int, optional
        Pool of team-lead ids to sample from. Defaults to 4 distinct ids
        drawn from 5000..5099.
    position_probs : dict, optional
        Mapping of position name to probability (must sum to 1). Defaults to
        ``{'Tax Associate': 0.79, 'Tax Analyst': 0.21}``.
    user_id_low, user_id_high : int
        Half-open range ``[user_id_low, user_id_high)`` that user ids are drawn
        from, without replacement within a month. If ``user_id_high`` is None
        or not greater than ``user_id_low`` it is set to
        ``user_id_low + n_per_month * n_months * 2``.
    round_to : number
        Step that simulated values are snapped to (the default 2 yields
        multiples of 2). This is a step size, not a number of decimals, and
        must be non-zero.
    seed : int, optional
        Seed for reproducible output. A private ``RandomState`` is used, so
        the process-wide numpy RNG is left untouched; the values produced for
        a given seed match those of ``np.random.seed(seed)`` followed by the
        module-level functions. When None, the global numpy RNG is used.

    Returns
    -------
    pandas.DataFrame
        One row per simulated user per month with ``year_month``, ``user_id``,
        ``team_lead_id``, ``Position``, the three components, ``total_filing``
        and every other metric found in *capacity_stats*.
    """
    # Local legacy generator: same stream as the seeded global RNG, but without
    # reseeding state that other code may rely on.
    rng = np.random.RandomState(seed) if seed is not None else np.random

    # default team leads & positions
    if team_lead_ids is None:
        team_lead_ids = list(rng.choice(np.arange(5000, 5100), 4, False))
    if position_probs is None:
        position_probs = {'Tax Associate': 0.79, 'Tax Analyst': 0.21}
    position_names = list(position_probs)
    position_weights = list(position_probs.values())

    # all metrics in capacity_stats
    all_metrics = list(capacity_stats['column'].unique())
    # these three are simulated individually, then summed into total_filing
    components = ['new_policy_count', 'renewal_count', 'endorsement__count']
    # everything else is simulated independently
    derived = set(components) | {'total_filing'}
    other_metrics = [m for m in all_metrics if m not in derived]

    rows = []
    # auto-expand user_id range
    months = capacity_stats['year_month'].unique()
    if user_id_high is None or user_id_high <= user_id_low:
        user_id_high = user_id_low + n_per_month * len(months) * 2

    for ym in months:
        stats_m = capacity_stats[capacity_stats['year_month'] == ym].set_index('column')
        uids = rng.choice(np.arange(user_id_low, user_id_high),
                          size=n_per_month, replace=False)

        for uid in uids:
            row = {
                'year_month': ym,
                'user_id': int(uid),
                'team_lead_id': int(rng.choice(team_lead_ids)),
                'Position': rng.choice(position_names, p=position_weights),
            }
            # 1) simulate the three filing components
            for comp in components:
                row[comp] = _draw_clipped_metric(rng, stats_m, comp, round_to)

            # 2) compute total_filing as their sum
            row['total_filing'] = (
                row['new_policy_count']
                + row['renewal_count']
                + row['endorsement__count']
            )

            # 3) simulate any remaining metrics the same way
            for m in other_metrics:
                row[m] = _draw_clipped_metric(rng, stats_m, m, round_to)

            rows.append(row)

    return pd.DataFrame(rows)

# Generate 25 simulated users per month with a fixed seed so reruns match.
sim_df = simulate_wide_fte_with_components(
    capacity_stats,
    n_per_month=25,
    round_to=2,
    seed=42,
)

# Store the identifier-like columns as categoricals.
sim_df[['user_id', 'Position', 'team_lead_id']] = sim_df[
    ['user_id', 'Position', 'team_lead_id']
].astype('category')

# Write the simulated table to disk without the index column.
sim_df.to_csv(
    r"C:\Users\Kyle\OneDrive\Desktop\theoretical capacity data 3 months_simulated.csv",
    index=False,
)