In [11]:
import numpy as np
import pandas as pd

In [12]:
import pydisagg

In [79]:
def data():
    """Build the toy observation table used throughout the notebook.

    Returns a 10-row DataFrame (five rows per sex) with identifier columns,
    an age interval per row, and a constant value / standard deviation.
    The global NumPy RNG is re-seeded with 123 as a side effect.
    """
    np.random.seed(123)
    rows_per_sex = 5
    total_rows = 2 * rows_per_sex
    columns = {
        "uid": range(total_rows),
        "sex_id": [1] * rows_per_sex + [2] * rows_per_sex,
        "location_id": [1, 2] * rows_per_sex,
        "year_id": [2010] * total_rows,
        # The same five age intervals are repeated for each sex.
        "age_start": [0, 5, 10, 17, 20] * 2,
        "age_end": [12, 10, 22, 21, 25] * 2,
        # Scalars are broadcast to every row by the DataFrame constructor.
        "val": 5.0,
        "val_sd": 1.0,
    }
    return pd.DataFrame(columns)
def pattern():
    """Build the toy pattern table: 10 sex/age-group rows for each of two locations.

    Three columns of uniform random draws (``draw_0``..``draw_2``) are generated
    after seeding the global NumPy RNG with 123; the rows for location 2 are an
    exact copy of those for location 1 apart from ``location_id``.
    """
    np.random.seed(123)
    n_rows = 10
    # Drawn in order draw_0, draw_1, draw_2 so the random stream is consumed
    # in a fixed sequence.
    draws = {f"draw_{k}": np.random.rand(n_rows) for k in range(3)}
    location_one = pd.DataFrame(
        {
            "sex_id": [1] * 5 + [2] * 5,
            "age_start": [0, 5, 10, 15, 20] * 2,
            "age_end": [5, 10, 15, 20, 25] * 2,
            "age_group_id": list(range(5)) * 2,
            **draws,
            "year_id": [2010] * n_rows,
            "location_id": [1] * n_rows,
        }
    )
    location_two = location_one.assign(location_id=2)
    return pd.concat([location_one, location_two], ignore_index=True)
def population():
    """Build the toy population table.

    Returns one row for every combination of sex (1, 2), location (1, 2),
    age group (0-4) and year (2010), each with a constant population of 1000.
    The global NumPy RNG is re-seeded with 123 as a side effect.
    """
    np.random.seed(123)
    # One single-column frame per dimension, in the order they are crossed.
    dimensions = [
        pd.DataFrame({"sex_id": [1, 2]}),
        pd.DataFrame({"location_id": [1, 2]}),
        pd.DataFrame({"age_group_id": range(5)}),
        pd.DataFrame({"year_id": [2010]}),
    ]

    grid = dimensions[0]
    for dimension in dimensions[1:]:
        grid = grid.merge(dimension, how="cross")

    grid["population"] = 1000
    return grid


In [80]:
# Build the toy inputs for the sex-splitting examples below.
df = data()
df_pattern = pattern()
df_pop = population()

# Seed in this cell so the simulated columns are reproducible on their own,
# instead of depending on the seed that population() sets as a side effect.
np.random.seed(123)

# The observations are treated as both-sex data, so the sex column is removed
# and per-row male/female populations plus a female/male ratio (with its
# standard error) are simulated instead.
df = df.drop('sex_id', axis=1)
df['male_pop'] = np.random.uniform(0.5, 2, 10)
df['female_pop'] = np.random.uniform(0.5, 2, 10)
df['prev_ratio_2_over_1'] = np.linspace(1, 2, 10)
df['prev_ratio_se'] = np.random.uniform(0.1, 0.2, 10)

In [81]:
# Show the prepared input rows: sex_id removed, simulated male/female
# populations and the ratio columns added.
df

Unnamed: 0,uid,location_id,year_id,age_start,age_end,val,val_sd,male_pop,female_pop,prev_ratio_2_over_1,prev_ratio_se
0,0,1,2010,0,12,5.0,1.0,1.544704,1.014767,1.0,0.16344
1,1,2,2010,5,10,5.0,1.0,0.929209,1.593575,1.111111,0.184943
2,2,1,2010,10,22,5.0,1.0,0.840277,1.157858,1.222222,0.172446
3,3,2,2010,17,21,5.0,1.0,1.326972,0.589517,1.333333,0.161102
4,4,1,2010,20,25,5.0,1.0,1.579203,1.097066,1.444444,0.172244
5,5,2,2010,0,12,5.0,1.0,1.13466,1.606993,1.555556,0.132296
6,6,1,2010,5,10,5.0,1.0,1.971146,0.773738,1.666667,0.136179
7,7,2,2010,10,22,5.0,1.0,1.527245,0.763178,1.777778,0.122826
8,8,1,2010,17,21,5.0,1.0,1.221398,1.297327,1.888889,0.129371
9,9,2,2010,20,25,5.0,1.0,1.088176,1.297741,2.0,0.163098


In [82]:
from pydisagg.disaggregate import split_datapoint
from pydisagg.models import RateMultiplicativeModel

In [83]:
# Loop-based sex split: each row of `df` produces a male record (sex_id 1)
# followed by a female record (sex_id 2).
split_rows = []
for _, row in df.iterrows():
    observed_total = row['val']
    bucket_populations = np.array([row['male_pop'], row['female_pop']])
    # The first pattern entry is fixed at 1; the second carries the ratio.
    rate_pattern = np.array([1., row['prev_ratio_2_over_1']])
    model = RateMultiplicativeModel()
    observed_total_se = row['val_sd']
    # Zero variance on the first entry, squared ratio SE on the second.
    pattern_covariance = np.diag(
        np.array([0, row['prev_ratio_se']**2])
    )

    split_result, SE = split_datapoint(
        observed_total=observed_total,
        bucket_populations=bucket_populations,
        rate_pattern=rate_pattern,
        model=model,
        output_type='rate',  # type: ignore  # handled by model_mapping
        normalize_pop_for_average_type_obs=True,
        observed_total_se=observed_total_se,
        pattern_covariance=pattern_covariance,
    )

    # Position 0 of the split output maps to sex_id 1, position 1 to sex_id 2.
    for position, sex_id in enumerate((1, 2)):
        sexed_row = row.copy()
        sexed_row['sex_id'] = sex_id
        sexed_row['split_val'] = split_result[position]
        sexed_row['se'] = SE[position]
        split_rows.append(sexed_row)

split_df = pd.DataFrame(split_rows)

In [84]:
# Sanity check with values copied from the uid 1 rows of the split output:
# female value / male value should reproduce that row's prev_ratio_2_over_1
# (1.111111).
5.191206/4.672085

1.1111112062387565

In [86]:
def sex_split_row(row):
    """Sex-split a single observation row via ``split_datapoint``.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``val``, ``val_sd``, ``male_pop``, ``female_pop``,
        ``prev_ratio_2_over_1`` and ``prev_ratio_se``.

    Returns
    -------
    pandas.Series
        Entries ``split_val_male``, ``split_val_female``, ``se_male`` and
        ``se_female``, taken from positions 0 and 1 of the two arrays that
        ``split_datapoint`` returns.
    """
    observed_total = row['val']
    bucket_populations = np.array([row['male_pop'], row['female_pop']])
    # The first pattern entry is fixed at 1; the second carries the ratio.
    rate_pattern = np.array([1., row['prev_ratio_2_over_1']])
    model = RateMultiplicativeModel()
    observed_total_se = row['val_sd']
    # Zero variance on the first entry, squared ratio SE on the second.
    ratio_variance = row['prev_ratio_se']**2
    pattern_covariance = np.diag(np.array([0, ratio_variance]))

    estimates, errors = split_datapoint(
        observed_total=observed_total,
        bucket_populations=bucket_populations,
        rate_pattern=rate_pattern,
        model=model,
        output_type='rate',
        normalize_pop_for_average_type_obs=True,
        observed_total_se=observed_total_se,
        pattern_covariance=pattern_covariance,
    )

    male_position, female_position = 0, 1
    labelled = {
        'split_val_male': estimates[male_position],
        'split_val_female': estimates[female_position],
        'se_male': errors[male_position],
        'se_female': errors[female_position],
    }
    return pd.Series(labelled)

# Row-wise split of every observation; one row of male/female results per input row.
split_results = df.apply(sex_split_row, axis=1)

# One copy of the inputs per sex, each extended with that sex's split value,
# standard error and sex_id (in that column order).
split_df_male = df.assign(
    split_val=split_results['split_val_male'],
    se=split_results['se_male'],
    sex_id=1,
)
split_df_female = df.assign(
    split_val=split_results['split_val_female'],
    se=split_results['se_female'],
    sex_id=2,
)

# Stack both sexes and interleave them so each uid's male row precedes its female row.
final_split_df = (
    pd.concat([split_df_male, split_df_female], ignore_index=True)
    .sort_values(['uid', 'sex_id'])
    .reset_index(drop=True)
)


In [75]:
# Element-wise difference between the loop-based and apply-based results;
# every entry is 0.0 when the two implementations agree.
(
    split_df.loc[:, final_split_df.columns]
    .reset_index(drop=True)
    .sub(final_split_df.sort_values(['uid', 'sex_id']).reset_index(drop=True))
)

Unnamed: 0,uid,location_id,year_id,age_start,age_end,val,val_sd,male_pop,female_pop,sex_ratio_2_over_1,sex_ratio_se,split_val,se,sex_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
def sex_split_df(
    df,
    ratio_2_over_1_col,
    ratio_se_col,
    val_col,
    val_se_col,
    male_pop_col='male_pop',
    female_pop_col='female_pop',
    id_col='uid',
):
    """Split every row of ``df`` into a male (sex_id 1) and a female (sex_id 2) row.

    Each row is passed to ``split_datapoint`` with bucket populations
    ``[male_pop, female_pop]`` and rate pattern ``[1, ratio]``. The returned
    split values and standard errors overwrite ``val_col`` and ``val_se_col``
    in the output rows.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation to split. The input is not modified.
    ratio_2_over_1_col : str
        Column holding the second entry of the rate pattern (first entry is 1).
    ratio_se_col : str
        Column holding the standard error of that ratio; it is squared to form
        the second diagonal entry of the pattern covariance (first entry is 0).
    val_col : str
        Column holding the observed value to split.
    val_se_col : str
        Column holding the standard error of the observed value.
    male_pop_col, female_pop_col : str, optional
        Columns holding the two bucket populations. Default to ``'male_pop'``
        and ``'female_pop'``.
    id_col : str, optional
        Column used, together with ``sex_id``, to order the output.
        Defaults to ``'uid'``.

    Returns
    -------
    pandas.DataFrame
        Two rows per input row, sorted by ``id_col`` then ``sex_id``, with a
        fresh 0..n-1 index. All input columns are kept; ``val_col`` and
        ``val_se_col`` hold the split results and ``sex_id`` is added.

    Raises
    ------
    KeyError
        If any required column is absent from ``df``.
    """
    required_cols = [
        ratio_2_over_1_col,
        ratio_se_col,
        val_col,
        val_se_col,
        male_pop_col,
        female_pop_col,
        id_col,
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        # Fail up front with every missing name instead of a bare KeyError
        # raised from inside the row-wise apply.
        raise KeyError(
            f"sex_split_df: input is missing required column(s): {missing_cols}"
        )

    if df.empty:
        # Nothing to split: return the same schema plus an empty sex_id column
        # rather than relying on DataFrame.apply's empty-input behaviour.
        empty_result = df.copy()
        empty_result['sex_id'] = np.array([], dtype='int64')
        return empty_result.reset_index(drop=True)

    def sex_split_row(row):
        """Split one row; returns male/female values and standard errors."""
        split_result, SE = split_datapoint(
            observed_total=row[val_col],
            bucket_populations=np.array([row[male_pop_col], row[female_pop_col]]),
            rate_pattern=np.array([1., row[ratio_2_over_1_col]]),
            model=RateMultiplicativeModel(),
            output_type='rate',
            normalize_pop_for_average_type_obs=True,
            observed_total_se=row[val_se_col],
            pattern_covariance=np.diag(np.array([0, row[ratio_se_col]**2]))
        )
        return pd.Series({
            'split_val_male': split_result[0],
            'split_val_female': split_result[1],
            'se_male': SE[0],
            'se_female': SE[1]
        })

    # Row order of the result matches the row order of df.
    split_results = df.apply(sex_split_row, axis=1)

    def build_sex_frame(sex_id, val_key, se_key):
        """Copy df and overwrite value/SE with one sex's split results."""
        frame = df.copy()
        # Positional assignment: safe even when df's index has duplicates.
        frame[val_col] = split_results[val_key].to_numpy()
        frame[val_se_col] = split_results[se_key].to_numpy()
        frame['sex_id'] = sex_id
        return frame

    split_df_male = build_sex_frame(1, 'split_val_male', 'se_male')
    split_df_female = build_sex_frame(2, 'split_val_female', 'se_female')

    # Interleave so each observation's male row precedes its female row.
    final_split_df = (
        pd.concat([split_df_male, split_df_female], ignore_index=True)
        .sort_values([id_col, 'sex_id'])
        .reset_index(drop=True)
    )
    return final_split_df

In [90]:
# Run the reusable splitter on the toy data; val / val_sd come back
# overwritten with the sex-specific split value and standard error.
sex_split_df(
    df,
    ratio_2_over_1_col='prev_ratio_2_over_1',
    ratio_se_col='prev_ratio_se',
    val_col='val',
    val_se_col='val_sd',
)

Unnamed: 0,uid,location_id,year_id,age_start,age_end,val,val_sd,male_pop,female_pop,prev_ratio_2_over_1,prev_ratio_se,sex_id
0,0,1,2010,0,12,5.0,1.051178,1.544704,1.014767,1.0,0.16344,1
1,0,1,2010,0,12,5.0,1.11501,1.544704,1.014767,1.0,0.16344,2
2,1,2,2010,5,10,4.672085,1.064542,0.929209,1.593575,1.111111,0.184943,1
3,1,2,2010,5,10,5.191206,1.079993,0.929209,1.593575,1.111111,0.184943,2
4,2,1,2010,10,22,4.429597,0.968827,0.840277,1.157858,1.222222,0.172446,1
5,2,1,2010,10,22,5.413951,1.119563,0.840277,1.157858,1.222222,0.172446,2
6,3,2,2010,17,21,4.535007,0.929624,1.326972,0.589517,1.333333,0.161102,1
7,3,2,2010,17,21,6.046676,1.293448,1.326972,0.589517,1.333333,0.161102,2
8,4,1,2010,20,25,4.229445,0.882801,1.579203,1.097066,1.444444,0.172244,1
9,4,1,2010,20,25,6.109198,1.274799,1.579203,1.097066,1.444444,0.172244,2
