In [96]:
import os

import numpy as np
import pandas as pd

In [97]:
# Read the CSV, parse the 'date' column into timestamps, discard the 'day'
# column and use 'date' as the index -- built as one chain so the cell can be
# re-run without tripping over an already-modified frame.
df = (
    pd.read_csv('data/imputed/mice_df.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .drop(columns='day')
    .set_index('date')
)

In [98]:
half_day = pd.Timedelta(hours=12)

# Gap between each row's timestamp and the previous row's (NaT for the first row).
df['time_diff'] = df.index.to_series().diff()

# The first row that follows its predecessor by exactly 12 hours marks the
# switch to 12-hourly sampling; stepping back 12 hours lands on that
# predecessor, which is where the 12-hourly segment starts.
is_half_day_step = df['time_diff'] == half_day
first_12hr_idx = df.loc[is_half_day_step].index[0] - half_day

In [99]:
# Upper bound for the daily segment: 12 hours before the first 12-hourly row.
daily_segment_end = first_12hr_idx - pd.Timedelta(hours=12)

# The helper column is no longer needed; remove it once so that neither
# segment (nor df itself) carries it.
df = df.drop(columns='time_diff')

# Label slices are inclusive on both ends; .copy() keeps the segments
# independent of df.
df_24hr_segment = df.loc[:daily_segment_end].copy()
df_12hr_segment = df.loc[first_12hr_idx:].copy()

In [100]:
# Sanity check: the two segments together must account for every row of df.
len(df_24hr_segment) + len(df_12hr_segment) == len(df)

True

In [101]:
# Spot-check date, used to show that the reindex below adds a day that is
# absent from the original index.
missing_date = pd.to_datetime('2024-01-17')

print(f"Is {missing_date} in original index? --> {missing_date in df.index}")

# Every calendar day from the first to the last row of the daily segment.
# 'D' replaces the '24H' spelling: the upper-case 'H' unit alias is deprecated
# in recent pandas releases, and for this range both describe the same
# one-day step starting at midnight.
full_24hr_expected_range = pd.date_range(start=df_24hr_segment.index.min().normalize(),
                                         end=df_24hr_segment.index.max(),
                                         freq='D')

# Reindexing inserts an all-NaN row for each day that had no observation.
df_24hr_segment = df_24hr_segment.reindex(full_24hr_expected_range)

print(f"Is {missing_date} in reindex 24hr segment index? --> {missing_date in df_24hr_segment.index}")

# Inserted rows (and any other remaining NaNs) become 0.
# NOTE(review): zero-filling treats a missing day as "no activity"; confirm
# this is intended rather than imputing the gap like the rest of the data.
df_24hr_segment = df_24hr_segment.fillna(0)

Is 2024-01-17 00:00:00 in original index? --> False
Is 2024-01-17 00:00:00 in reindex 24hr segment index? --> True


In [102]:
# Tag every 12-hourly row with the midnight timestamp of its calendar day so
# the two rows of one day share a grouping key.
day_key = df_12hr_segment.index.normalize()
df_12hr_segment['date_only'] = day_key

# Column-wise sum of all rows belonging to the same day.
daily_totals_12hr_segment = df_12hr_segment.groupby('date_only').sum()

In [103]:
# Attach each row's per-day totals next to its own values. The totals are
# renamed up front ('<column>_daily_total') so they cannot collide with the
# original columns, then matched on the day key.
daily_totals_suffixed = daily_totals_12hr_segment.add_suffix('_daily_total')

df_12hr_segment_with_daily_total = pd.merge(
    df_12hr_segment,
    daily_totals_suffixed,
    left_on='date_only',
    right_index=True,
)

In [104]:
# Table of average within-day shares: one row per 12-hour slot ('00:00' and
# '12:00'), one column per column of the daily segment. It starts out all-NaN
# and is filled in column by column.
# dtype=float keeps the table numeric from the start; without it the frame is
# object-typed and later NaN filling depends on pandas silently downcasting
# object columns, which recent pandas releases deprecate.
proportion_df = pd.DataFrame(
    index=['00:00', '12:00'],
    columns=df_24hr_segment.columns,
    dtype=float,
)

In [105]:
# For every column, estimate which share of a day's total falls into the 00:00
# row and which into the 12:00 row, averaged over all days of the 12-hourly
# segment that have a non-zero total for that column.
# NOTE(review): a day with only one 12-hourly row contributes a share of 1.0
# for that slot -- confirm every day in the segment has both rows.
for col in df_24hr_segment.columns:
    total_col = col + '_daily_total'

    # Days whose total is zero would divide by zero, so they are left out.
    has_total = df_12hr_segment_with_daily_total[total_col] != 0

    if not has_total.any():
        # No usable day at all for this column: fall back to an even split.
        print("Invalid row found")
        proportion_df.loc['00:00', col] = 0.5
        proportion_df.loc['12:00', col] = 0.5
        continue

    usable_rows = df_12hr_segment_with_daily_total[has_total]
    share_of_day = usable_rows[col] / usable_rows[total_col]

    # Mean share per hour-of-day (0 and 12 are the expected keys).
    mean_share_by_hour = share_of_day.groupby(usable_rows.index.hour).mean()

    # Only write the slots that actually occurred; a slot that never appears
    # stays NaN here.
    for hour, slot_label in ((0, '00:00'), (12, '12:00')):
        if hour in mean_share_by_hour.index:
            proportion_df.loc[slot_label, col] = mean_share_by_hour[hour]

In [106]:
# Slots that never received a value default to an even 0.5 share.
filled_proportions = proportion_df.fillna(0.5)

# Rescale each column so its two shares add up to exactly 1.
column_sums = filled_proportions.sum(axis=0)
proportion_df = filled_proportions.div(column_sums, axis=1)

# The grouping key was only needed to compute daily totals; remove it so the
# 12-hourly segment has the same columns as the daily one again.
df_12hr_segment = df_12hr_segment.drop(columns='date_only')

In [107]:
# Display the normalised 00:00 / 12:00 share for every column.
proportion_df

Unnamed: 0,zone_a_mwr_patients,zone_a_mwr_cat_3,zone_a_mwr_cat_4,zone_a_mwr_sets_of_emews,zone_a_mwr_deescalations,zone_a_mwr_escalations,zone_a__patients,zone_a__cat_2,zone_a__cat_3,zone_a__sets_of_emews,zone_a__deescalations,zone_a__escalations,zone_b/c_patients,zone_b/c_cat_2,zone_b/c_cat_3,zone_b/c_sets_of_emews,zone_b/c_deescalations,zone_b/c_escalations,total_number_of_patients,total_number_of_emews
00:00,0.522461,0.518188,0.554779,0.503263,0.549394,0.575328,0.513779,0.505067,0.538833,0.49587,0.529888,0.500935,0.502299,0.493167,0.516788,0.492849,0.504438,0.515841,0.507218,0.490427
12:00,0.477539,0.481812,0.445221,0.496737,0.450606,0.424672,0.486221,0.494933,0.461167,0.50413,0.470112,0.499065,0.497701,0.506833,0.483212,0.507151,0.495562,0.484159,0.492782,0.509573


In [108]:
# Each day of the daily segment is expanded into two timestamps: the day's own
# index value (the 00:00 point) and the same value shifted by 12 hours (the
# 12:00 point). The set union removes duplicates and sorting restores
# chronological order.
midnight_stamps = df_24hr_segment.index
noon_stamps = midnight_stamps + pd.Timedelta(hours=12)
upsampled_24hr_indices = pd.DatetimeIndex(sorted(set(midnight_stamps) | set(noon_stamps)))

# Empty frame (all NaN) on the 12-hourly grid, to be filled cell by cell.
df_24hr_upsampled = pd.DataFrame(index=upsampled_24hr_indices, columns=df_24hr_segment.columns)

In [109]:
# Split every daily value into a 00:00 part and a 12:00 part using the average
# 00:00 share of the corresponding column.
share_at_midnight = proportion_df.loc['00:00']

for day_stamp, daily_row in df_24hr_segment.iterrows():
    ts_00 = day_stamp.replace(hour=0, minute=0, second=0)
    ts_12 = day_stamp.replace(hour=12, minute=0, second=0)

    # Both timestamps are expected to exist in the upsampled frame; the checks
    # keep an unexpected timestamp from silently adding a new row.
    write_00 = ts_00 in df_24hr_upsampled.index
    write_12 = ts_12 in df_24hr_upsampled.index

    for col in df_24hr_segment.columns:
        daily_total = daily_row[col]
        raw_first_half = daily_total * share_at_midnight[col]

        if pd.isna(raw_first_half):
            # Nothing to split: propagate the missing value to both halves.
            first_half = second_half = np.nan
        else:
            # Round only the first half, then give the second half whatever
            # is left of the daily total.
            rounded_first_half = int(round(raw_first_half))
            remainder = int(daily_total - rounded_first_half)

            # Neither half may go below zero.
            first_half = max(0, rounded_first_half)
            second_half = max(0, remainder)

        if write_00:
            df_24hr_upsampled.loc[ts_00, col] = first_half
        if write_12:
            df_24hr_upsampled.loc[ts_12, col] = second_half

In [110]:
# Stack the upsampled daily segment on top of the native 12-hourly segment and
# put the combined rows in chronological order.
upsampled_parts = [df_24hr_upsampled, df_12hr_segment]
df_final_upsampled = pd.concat(upsampled_parts).sort_index()

# NOTE(review): df_24hr_upsampled is created without an explicit dtype, so the
# combined columns may not be plain integers. No cast is applied here; add one
# only after confirming that no NaNs remain.

In [111]:
# Write the combined 12-hourly frame to disk, creating the output directory if
# it does not exist yet. The index is written as a 'date' column.
# (`os` is imported in the notebook's import cell rather than here, so all
# dependencies are visible at the top.)
os.makedirs("data/upsampled", exist_ok=True)
df_final_upsampled.to_csv("data/upsampled/mice_upsampled_df.csv", index=True, index_label='date')