In [4]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import os
import glob
from tqdm import tqdm
import time

In [2]:
pd.set_option('future.no_silent_downcasting', True)

def filter_duplicates(df, time_threshold=0.1):
    """Collapse pairs of near-simultaneous picks, keeping the higher-scoring one.

    Each row is compared with the row directly after it. When the two
    ``phase_time`` values are at most ``time_threshold`` seconds apart, the row
    with the strictly lower ``phase_score`` is dropped. Pairs with equal scores
    are both kept.

    Only adjacent rows are compared, so ``df`` is expected to be sorted by
    ``phase_time``. A single pass looks at each original pair once; callers may
    run the function again to re-examine rows that became adjacent.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``phase_time`` column and a numeric
        ``phase_score`` column. It is not modified.
    time_threshold : float, default 0.1
        Maximum gap, in seconds, for two consecutive picks to count as
        duplicates of each other.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the dropped rows (same columns, original
        index labels).

    Side effects
    ------------
    Prints the number of dropped rows.
    """
    # Fewer than two rows means there is no pair to compare.
    if len(df) < 2:
        print('drop ', 0)
        return df.copy()

    phase_time = df['phase_time']
    phase_score = df['phase_score']
    next_score = phase_score.shift(-1)

    # Gap (in seconds) between each row and the following one; NaN on the last
    # row, which makes every comparison below evaluate to False there.
    gap_to_next = (phase_time.shift(-1) - phase_time).dt.total_seconds()
    close_to_next = gap_to_next <= time_threshold

    # The current row loses against a close, higher-scoring successor.
    drop_current = close_to_next & (next_score > phase_score)

    # The successor loses against a close, higher-scoring current row. The
    # whole pair condition (time AND score) is evaluated on the current row and
    # only then shifted down onto the successor. Shifting just the score
    # comparison would combine it with the wrong time gap.
    successor_loses = close_to_next & (next_score < phase_score)
    drop_next = successor_loses.shift(1, fill_value=False)

    filtered_df = df.loc[~(drop_current | drop_next)].copy()

    print('drop ', len(df) - len(filtered_df))
    return filtered_df

def remove_bad_time(traces_df, pick_df, dt, column):
    """Drop picks whose time equals a trace timestamp shifted by ``dt`` seconds.

    For every row of ``traces_df`` the value ``traces_df[column] + dt`` is
    treated as a "bad" time. Rows of ``pick_df`` whose ``phase_time`` is
    exactly equal to one of those values are removed.

    Parameters
    ----------
    traces_df : pandas.DataFrame
        Frame holding the reference timestamps in ``column``. It is not
        modified.
    pick_df : pandas.DataFrame
        Picks to filter; must contain a ``phase_time`` column whose dtype is
        compatible with ``traces_df[column]`` for an equality merge.
    dt : float
        Offset in seconds added to the reference timestamps (may be negative).
    column : str
        Name of the timestamp column in ``traces_df`` to use.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with the same columns as ``pick_df``. The index
        comes from the merge result (positions within ``pick_df``), not from
        ``pick_df``'s own index.

    Side effects
    ------------
    Prints the number of dropped rows.
    """
    # Build the lookup table as a standalone frame instead of writing a scratch
    # column into the caller's traces_df. De-duplicating keeps the left merge
    # one-to-one, so no pick row is ever multiplied.
    shifted_times = traces_df[column] + pd.to_timedelta(dt, unit='s')
    bad_times_df = shifted_times.drop_duplicates().to_frame(name='bad_phase_time')

    # Left merge with an indicator: 'left_only' marks picks that did not match
    # any bad time, 'both' marks picks that did.
    merged_df = pick_df.merge(
        bad_times_df,
        left_on='phase_time',
        right_on='bad_phase_time',
        how='left',
        indicator=True,
    )

    unmatched = merged_df['_merge'] == 'left_only'
    filtered_df = merged_df.loc[unmatched].drop(columns=['bad_phase_time', '_merge'])

    print('drop ', len(pick_df) - len(filtered_df))
    return filtered_df

In [3]:
# Build the cleaned pick table for one year. For every "*picks.txt" file:
# read the picks, keep those inside the year, apply the per-station score
# threshold, collapse near-duplicate picks and drop picks that coincide with a
# trace start/end time. One DataFrame per file is collected in `df_list`.
df_list = []
yr = 2024
# False -> phases from the non-accelerometer stations
# True  -> phases from the accelerometers (the "*_extra" directory)
extra = False
if extra:
    pick_dir = f'./phases/phase_seisbench_{yr}_extra'
else:
    pick_dir = f'./phases/phase_seisbench_{yr}'
# Sorted so the processing order does not depend on how the OS happens to list
# the directory.
file_list = sorted(glob.glob(f'{pick_dir}/*picks.txt'))

starttime = pd.to_datetime(f'{yr}-01-01').tz_localize('utc')
endtime = pd.to_datetime(f'{yr+1}-01-01').tz_localize('utc')

time_threshold = 0.5  # seconds, passed to filter_duplicates()
tolerance = 0.2       # seconds, half-width of the offset grid used with remove_bad_time()

# Minimum phase_score kept per station; stations not listed use the default.
min_score_default = 0.3
min_score_by_station = {
    **dict.fromkeys(['CROS', 'PTMR', 'V0106', 'CQUE'], 0.5),
    **dict.fromkeys(['POZA', 'POZT', 'POZS', 'NABA', 'POZM', 'NACO', 'BAN', 'NAD', 'NAFG',
                     'BAIP', 'POZL', 'POZB', 'POZU', 'BCLI', 'MPCD', 'NAP', 'NAAG'], 0.6),
    **dict.fromkeys(['V0102', 'CSTH'], 0.3),
}

# Stations that were assigned the shorter (0.2 s) per-station tolerance.
short_tolerance_stations = {
    'CAAM', 'COLB', 'CREM', 'CPOZ', 'POZA', 'POZT', 'POZS', 'CROS', 'NABA', 'POZM',
    'CSOB', 'CPIS', 'CSTH', 'V0102', 'CSFT',
}

# Offsets -tolerance ... +tolerance in 0.01 s steps, generated from integer
# step counts so the end point is always included exactly once. A float
# np.arange() can gain or lose the last element through rounding.
edge_step = 0.01
n_edge_steps = int(round(tolerance / edge_step))
edge_offsets = [round(k * edge_step, 6) for k in range(-n_edge_steps, n_edge_steps + 1)]

for file in file_list:
    print(file)
    # Space-separated file without header; column 5 is discarded.
    temp_df = (
        pd.read_csv(file, sep=' ', header=None)
        .drop_duplicates()  # the same data may have been picked more than once
        .drop(columns=[5])
    )
    temp_df.columns = ['station_full', 'phase_score', 'date', 'phase_time', 'phase_type']

    # Combine the date and time columns into one tz-aware timestamp.
    temp_df['phase_time'] = pd.to_datetime(temp_df['date'] + ' ' + temp_df['phase_time']).dt.tz_localize('utc')
    temp_df = temp_df.sort_values(by='phase_time').drop(columns=['date'])

    # Keep only picks inside the requested year. The explicit copy makes the
    # result an independent frame, so the column assignments below do not
    # trigger SettingWithCopy warnings.
    in_year = (temp_df['phase_time'] >= starttime) & (temp_df['phase_time'] <= endtime)
    temp_df = temp_df.loc[in_year].copy()
    if temp_df.empty:
        continue

    # Station code is the second dot-separated field; the first row is taken as
    # representative for the whole file.
    temp_df['station'] = temp_df['station_full'].apply(lambda x: x.split('.')[1])
    station_name = temp_df.iloc[0]['station']

    min_score = min_score_by_station.get(station_name, min_score_default)
    temp_df = temp_df.loc[temp_df['phase_score'] >= min_score].copy()

    # station_id = "<field0>.<field1>..<last two characters of field2>"
    temp_df['station_id'] = temp_df['station_full'].apply(lambda x: f"{x.split('.')[0]}.{x.split('.')[1]}..{x.split('.')[2][-2:]}")
    temp_df = temp_df.drop(columns=['station_full'])

    # NOTE(review): this per-station tolerance is computed but never used.
    # filter_duplicates() below receives the global `time_threshold` and the
    # trace-edge loop uses the global `tolerance`. One branch of the original
    # code assigned to `station_tolerance`, which looks like a typo for
    # `time_tolerance`. Confirm which step was meant to use it before wiring it in.
    time_tolerance = 0.2 if station_name in short_tolerance_stations else 0.5

    # Reorder the columns
    temp_df = temp_df[['station', 'phase_score', 'phase_time', 'phase_type', 'station_id']]

    # Two passes: each pass only compares rows that are adjacent at that time,
    # so the second pass re-checks rows that became neighbours after the first.
    temp_df = filter_duplicates(temp_df, time_threshold=time_threshold)
    temp_df = filter_duplicates(temp_df, time_threshold=time_threshold)

    # Companion trace file: same path with the trailing 'picks.txt' replaced by
    # 'trace.txt'. Columns 0+1 and 2+3 are each a date and a time that are
    # combined into the 'st' and 'et' timestamps.
    trace_path = file[:-len('picks.txt')] + 'trace.txt'
    traces_df = pd.read_csv(trace_path, sep=' ', header=None).drop_duplicates()
    traces_df['st'] = pd.to_datetime(traces_df[0] + ' ' + traces_df[1]).dt.tz_localize('utc')
    traces_df['et'] = pd.to_datetime(traces_df[2] + ' ' + traces_df[3]).dt.tz_localize('utc')
    traces_df = traces_df.drop(columns=[0, 1, 2, 3])

    # Remove picks that sit exactly on st+dt or et+dt for every offset in the grid.
    for dt in edge_offsets:
        temp_df = remove_bad_time(traces_df, temp_df, dt, column='st')
        temp_df = remove_bad_time(traces_df, temp_df, dt, column='et')

    df_list.append(temp_df)

In [2]:
# Concatenate all per-file DataFrames into a single table for the year, drop
# the station/phase combinations that are excluded by hand, and write the
# result to ./phases/output/.
if not df_list:
    # pd.concat would raise a less helpful "No objects to concatenate".
    raise ValueError(f'no picks collected for {yr}: check the input directory and the year window')

final_df = pd.concat(df_list, ignore_index=True).sort_values(by=["phase_time"])
final_df.index.name = 'phase_index'

station_id = final_df['station_id']
phase_time = final_df['phase_time']

# Stations removed entirely.
excluded_station_ids = ['IV.CSTH..EH', 'IV.CROS..EH', 'IV.CPV..EH']

rows_to_drop = (
    station_id.isin(excluded_station_ids)
    # S picks of this station are discarded, its other picks are kept.
    | ((station_id == 'IV.CASE..EH') & (final_df['phase_type'] == 'S'))
    # These two stations are only used from the given dates onwards.
    | ((station_id == 'IV.CFB1..HH') & (phase_time < pd.to_datetime('2023-6-20').tz_localize('utc')))
    | ((station_id == 'IV.CFB3..HH') & (phase_time < pd.to_datetime('2023-1-1').tz_localize('utc')))
)
final_df = final_df.loc[~rows_to_drop]

print(final_df.station_id.unique())

# The accelerometer run (`extra = True`) gets its own "_extra" file, matching
# the names read back when the years are merged. Without the suffix it would
# overwrite the regular file of the same year.
output_suffix = '_extra' if extra else ''
final_df.to_csv(f'./phases/output/{yr}_picks_2503{output_suffix}.csv')

In [3]:
# Read the per-year pick tables back from disk, in the same order as the names
# on the left-hand side; the "_extra" files come last.
(df_22, df_23, df_24, df_25, df_24_extra, df_25_extra) = map(
    pd.read_csv,
    [
        './phases/output/2022_picks_2503.csv',
        './phases/output/2023_picks_2503.csv',
        './phases/output/2024_picks_2503.csv',
        './phases/output/2025_picks_2503.csv',
        './phases/output/2024_picks_2503_extra.csv',
        './phases/output/2025_picks_2503_extra.csv',
    ],
)

In [1]:
# Stack all yearly tables, order them by phase_time and renumber the rows.
# phase_time is whatever read_csv returned (not re-parsed here), so the sort
# uses the column's values as loaded.
df = (
    pd.concat([df_22, df_23, df_24, df_24_extra, df_25, df_25_extra])
    .sort_values(by='phase_time')
    .reset_index(drop=True)
)
# Item assignment instead of `df.phase_index = ...`: attribute assignment only
# updates a column that already exists and would otherwise set a plain Python
# attribute, leaving the saved table without a renumbered phase_index.
df['phase_index'] = np.arange(len(df))

In [12]:
# Save the merged table; the row index is not written because phase_index is
# already a regular column.
df.to_csv(path_or_buf='./phases/output/2345_picks.csv', index=False)