In [4]:
def prepare(df):
    import numpy as np
    import pandas as pd
    
    # define the time bin: from 0 to 240 seconds with a step of 30 seconds
    bins = np.arange(0, 241, 30)
    # columns for stations
    station_cols = ['peak_usa', 'peak_germany', 'peak_australia', 
                    'peak_italy', 'peak_iceland', 'peak_uk']
    
    # add a column to the DataFrame that determines which bin each row belongs to
    df['time_bin'] = pd.cut(df['Time'], bins=bins, right=False)
    
    # group the data by bin and sum the values for each station
    # assuming that the values in station_cols are 0 or 1
    grouped_sums = df.groupby('time_bin')[station_cols].sum()
    # grouped_sums will have shape 
    
    # calculate the threshold for each station: average value across all bins multiplied by 1.15
    means = grouped_sums.mean()
    thresholds = means * 1.15
    
    # for each bin and each station, determine if the sum exceeds the threshold
    # exceed_df is a DataFrame with boolean values where True means the threshold is exceeded
    exceed_df = grouped_sums > thresholds
    
    # for each station, create a separate column with a binary flag
    # 1 if the sum for the given bin exceeds the threshold, otherwise 0
    for station in station_cols:
        flag_col = f'bin_flag_{station}'
        # for each row in the original DataFrame using the 'time_bin' column
        # map the values from exceed_df for the corresponding station (converting the boolean value to 0 or 1)
        df[flag_col] = df['time_bin'].map(exceed_df[station].astype(int))
    
    return df


In [5]:
import os
import numpy as np
import pandas as pd
folder_path = 'grb_csv/processed'


# get the list of all files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# create a folder to save the processed files
output_folder = os.path.join(folder_path, 'processed3')
os.makedirs(output_folder, exist_ok=True)

for file_name in csv_files:
    # full path to the file
    file_path = os.path.join(folder_path, file_name)
    
    data = pd.read_csv(file_path)
    
    data2 = prepare(data)
    #print(f"peaks:{file_name}, {data['peak_australia'].sum()}")
    
    # save the processed file to the new folder
    output_file_path = os.path.join(output_folder, file_name)
    data2.to_csv(output_file_path, index=False)

print(f"file processing completed; results saved to the folder: {output_folder}")

  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[station_cols].sum()
  grouped_sums = df.groupby('time_bin')[

file processing completed; results saved to the folder: grb_csv/processed/processed3
