In [26]:
import pandas as pd
import numpy as np

In [27]:
# Read the imputed dataset, parse the 'date' column into timestamps, drop the
# 'day' column and use the timestamps as the index.
df = (
    pd.read_csv('data/imputed/mean_df.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .drop(columns='day')
    .set_index('date')
)

In [28]:
# Wherever a total column is exactly 0, overwrite it with the row-wise sum of
# its three per-zone component columns.
for total_column, columns_to_sum in (
    ('total_number_of_patients',
     ['zone_a_mwr_patients', 'zone_a__patients', 'zone_b/c_patients']),
    ('total_number_of_emews',
     ['zone_a_mwr_sets_of_emews', 'zone_a__sets_of_emews', 'zone_b/c_sets_of_emews']),
):
    is_zero_total = df[total_column] == 0
    # The right-hand side is computed for every row; .loc aligns it on the
    # index, so only the masked rows are actually written.
    df.loc[is_zero_total, total_column] = df[columns_to_sum].sum(axis=1)

In [29]:
# Gap between each reading and the one before it (NaT for the first row).
half_day = pd.Timedelta(hours=12)
df['time_diff'] = df.index.to_series().diff()

# The first row that follows a 12-hour gap marks where 12-hourly sampling is
# visible; stepping back 12 hours gives the timestamp of the row before it.
is_half_day_step = df['time_diff'].eq(half_day)
first_12hr_idx = df.loc[is_half_day_step].index[0] - half_day

In [30]:
# The helper column is only needed to locate the split point; remove it first
# so both segments contain data columns only.
del df['time_diff']

# Label slices are inclusive on both ends: the daily part stops 12 hours before
# the first 12-hourly reading, and the 12-hourly part starts at that reading.
last_daily_label = first_12hr_idx - pd.Timedelta(hours=12)
df_24hr_segment = df.loc[:last_daily_label].copy()
df_12hr_segment = df.loc[first_12hr_idx:].copy()

In [31]:
# Sanity check: the two segments together account for every row of df
len(df_24hr_segment) + len(df_12hr_segment) == len(df)

True

In [32]:
# Tag every 12-hour row with the midnight timestamp of its calendar day
day_start = df_12hr_segment.index.normalize()
df_12hr_segment['date_only'] = day_start

# Add up all rows that share a day to get per-day totals for every column
daily_totals_12hr_segment = df_12hr_segment.groupby('date_only').sum()

In [33]:
# Attach each day's column totals to every 12-hour row of that day so a per-row
# share can be computed. Original columns keep their names; the matching daily
# totals get a '_daily_total' suffix.
df_12hr_segment_with_daily_total = pd.merge(
    left=df_12hr_segment,
    right=daily_totals_12hr_segment,
    how='inner',
    left_on='date_only',
    right_index=True,
    suffixes=('', '_daily_total'),
)

In [34]:
# Table of average shares: one row per half-day slot ('00:00' and '12:00') and
# one column per data column of the daily segment. It starts out all-NaN and is
# filled in by the cells that follow.
# dtype=float keeps the table numeric. Without an explicit dtype pandas creates
# object columns here, so the later fillna/sum/div would operate on Python
# objects instead of floats.
proportion_df = pd.DataFrame(
    index=['00:00', '12:00'],
    columns=df_24hr_segment.columns,
    dtype=float,
)

In [35]:
# Estimate, for every data column, the average share of a day's total that is
# recorded at 00:00 and at 12:00, using the 12-hour segment as the reference.
value_cols = list(df_24hr_segment.columns)
total_cols = [col + '_daily_total' for col in value_cols]

# Only days with two readings are informative. A day with a single reading has
# a daily total equal to that reading, i.e. a share of exactly 1.0, which would
# bias the average for that slot.
readings_per_day = df_12hr_segment_with_daily_total['date_only'].map(
    df_12hr_segment_with_daily_total['date_only'].value_counts()
)
complete_days = df_12hr_segment_with_daily_total[(readings_per_day == 2).to_numpy()]

# Per-row share of the daily total. Rows whose daily total is zero are turned
# into NaN so they are skipped by the mean instead of dividing by zero.
daily_totals = complete_days[total_cols].to_numpy(dtype=float)
has_total = daily_totals != 0
shares = complete_days[value_cols].astype(float) / np.where(has_total, daily_totals, np.nan)

# Average share per hour of day (0 and 12 are the expected groups)
avg_by_hour = shares.groupby(shares.index.hour).mean()

# Copy the averages into proportion_df; a slot that never occurs is left as is
for slot_label, hour in (('00:00', 0), ('12:00', 12)):
    if hour in avg_by_hour.index:
        proportion_df.loc[slot_label, value_cols] = avg_by_hour.loc[hour].to_numpy()

# Columns whose daily total is zero on every usable day carry no information:
# fall back to an even spread (0.5 for each 12-hour period)
for col, informative in zip(value_cols, has_total.any(axis=0)):
    if not informative:
        print("Invalid row found")
        proportion_df.loc[['00:00', '12:00'], col] = 0.5

In [36]:
# Slots that still have no estimate (NaN) get an even share of 0.5
filled_proportions = proportion_df.fillna(0.5)

# Rescale every column by its own sum so the two slots add up to exactly 1
column_sums = filled_proportions.sum(axis=0)
proportion_df = filled_proportions / column_sums

# The helper grouping column is no longer needed on the 12-hour segment
df_12hr_segment.drop(columns='date_only', inplace=True)

In [37]:
# Display the normalised share per column for the '00:00' and '12:00' slots
proportion_df

Unnamed: 0,zone_a_mwr_patients,zone_a_mwr_cat_3,zone_a_mwr_cat_4,zone_a_mwr_sets_of_emews,zone_a_mwr_deescalations,zone_a_mwr_escalations,zone_a__patients,zone_a__cat_2,zone_a__cat_3,zone_a__sets_of_emews,zone_a__deescalations,zone_a__escalations,zone_b/c_patients,zone_b/c_cat_2,zone_b/c_cat_3,zone_b/c_sets_of_emews,zone_b/c_deescalations,zone_b/c_escalations,total_number_of_patients,total_number_of_emews
00:00,0.525129,0.520211,0.545065,0.503035,0.556018,0.573245,0.51374,0.504933,0.538833,0.495449,0.529682,0.500045,0.499156,0.493693,0.509481,0.496643,0.504438,0.506052,0.496847,0.482498
12:00,0.474871,0.479789,0.454935,0.496965,0.443982,0.426755,0.48626,0.495067,0.461167,0.504551,0.470318,0.499955,0.500844,0.506307,0.490519,0.503357,0.495562,0.493948,0.503153,0.517502


In [38]:
# Every timestamp of the daily segment contributes two slots to the upsampled
# index: the timestamp itself and the point 12 hours later.
half_day_offset = pd.Timedelta(hours=12)
slot_stamps = {
    stamp
    for dt in df_24hr_segment.index
    for stamp in (dt, dt + half_day_offset)
}
upsampled_24hr_indices = pd.DatetimeIndex(sorted(slot_stamps))  # unique, ascending

# Empty (all-NaN) frame with the daily segment's columns, to be filled per slot
df_24hr_upsampled = pd.DataFrame(index=upsampled_24hr_indices, columns=df_24hr_segment.columns)

In [39]:
# Split every value of the daily segment into a first-slot part and a
# second-slot part using the per-column '00:00' share from proportion_df.
# The whole frame is processed at once instead of cell by cell, which also
# keeps the result in numeric columns rather than object columns.
half_day_offset = pd.Timedelta(hours=12)
daily_values = df_24hr_segment.astype(float)
share_00 = proportion_df.loc['00:00', daily_values.columns].astype(float)

# First slot: proportional share of the daily value, rounded to a whole count
raw_00 = daily_values.mul(share_00, axis=1)
rounded_00 = raw_00.round()

# Second slot: what is left of the daily value after the (unclamped) first
# slot, truncated to a whole count, so both slots add up to the daily value
# whenever that value is itself a whole number
remainder_12 = np.trunc(daily_values - rounded_00)

# Counts cannot be negative; NaN inputs stay NaN in both slots
values_00 = rounded_00.clip(lower=0)
values_12 = remainder_12.clip(lower=0)

if raw_00.isna().to_numpy().any():
    # Integer columns cannot hold NaN, so the result stays float in this case
    print("Found NaN raw val")
else:
    values_00 = values_00.astype(int)
    values_12 = values_12.astype(int)

# The first slot keeps the daily timestamp and the second slot is placed 12
# hours later. For daily readings stamped at midnight these are the 00:00 and
# 12:00 points of the same date.
values_12.index = values_12.index + half_day_offset
df_24hr_upsampled = pd.concat([values_00, values_12]).sort_index()

# Two rows landing on the same timestamp would mean the daily segment is not
# actually spaced 24 hours apart; stop instead of producing ambiguous rows.
if not df_24hr_upsampled.index.is_unique:
    raise ValueError("Upsampled 24hr segment contains duplicate timestamps")

In [40]:
# Stack the upsampled daily part and the native 12-hour part into one
# chronologically ordered frame.
df_final_upsampled = pd.concat([df_24hr_upsampled, df_12hr_segment]).sort_index()

# df_24hr_upsampled can hold its counts in object-dtype columns, in which case
# the combined frame mixes Python ints and floats inside a single column.
# infer_objects() gives such columns a proper numeric dtype again and leaves
# columns that are already numeric untouched. A blanket astype(int) is not used
# because it raises as soon as a column contains NaN.
df_final_upsampled = df_final_upsampled.infer_objects()

In [41]:
# Display the combined frame (upsampled daily part followed by the 12-hour part)
df_final_upsampled

Unnamed: 0,zone_a_mwr_patients,zone_a_mwr_cat_3,zone_a_mwr_cat_4,zone_a_mwr_sets_of_emews,zone_a_mwr_deescalations,zone_a_mwr_escalations,zone_a__patients,zone_a__cat_2,zone_a__cat_3,zone_a__sets_of_emews,zone_a__deescalations,zone_a__escalations,zone_b/c_patients,zone_b/c_cat_2,zone_b/c_cat_3,zone_b/c_sets_of_emews,zone_b/c_deescalations,zone_b/c_escalations,total_number_of_patients,total_number_of_emews
2024-01-04 00:00:00,34,28,6,97,6,3,11,6,5,25,5,1,7,5,5,30,1,3,50,146
2024-01-04 12:00:00,30,25,5,95,4,3,10,6,4,25,4,0,8,5,5,30,0,2,50,156
2024-01-05 00:00:00,31,31,0,45,6,0,13,7,6,30,5,0,13,6,8,35,0,3,56,106
2024-01-05 12:00:00,28,28,0,45,4,0,13,7,6,30,5,0,14,6,7,35,0,2,56,114
2024-01-06 00:00:00,19,19,0,29,2,1,9,9,0,33,3,1,7,5,5,25,0,3,34,84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-05 12:00:00,27,27,0,73,25,0,28,25,3,124,21,2,20,10,10,72,5,2,55.0,197.0
2024-11-06 00:00:00,26,26,0,86,26,0,24,23,1,154,19,4,20,10,10,72,5,2,50.0,235.0
2024-11-06 12:00:00,36,36,0,83,15,2,27,21,6,113,12,5,20,10,10,72,5,2,63.0,196.0
2024-11-07 00:00:00,33,31,2,86,11,1,42,14,28,129,30,4,20,10,10,72,5,2,95.0,287.0


In [43]:
# import os
# os.makedirs("data/upsampled", exist_ok=True)
# df_final_upsampled.to_csv("data/upsampled/knn_3_df.csv", index=True, index_label='date')