# Convert the eventID to a time

In [22]:
import pandas as pd
import obspy

In [23]:
# read files
# All server-side inputs sit below one project directory; name the
# sub-directories explicitly instead of slicing 'miniseed/' off the raw-data
# path by character count.
path_to_project = '/data/wsd03/data_manuela/Illgraben/'
path_to_raw_data = path_to_project + 'miniseed/'
path_to_features = path_to_project + 'feature_files/'

# Feature tables of the test and training split.
feature_test = pd.read_csv(path_to_features + 'test_features_yfilt_40s_hSNR_v4_with_thunder.csv')
feature_train = pd.read_csv(path_to_features + 'train_features_yfilt_40s_hSNR_v4_with_thunder_without_bigsf.csv')

# Event catalog; 'Unnamed: 0' is presumably an index column written by an
# earlier to_csv call (TODO confirm) and is not needed here.
catalog = pd.read_csv('../catalog/catalog_40_hSNR_v4.csv')
catalog = catalog.drop(columns=['Unnamed: 0'])

# Classifier output for 2017 (columns used later: 'datetimes', 'classes').
detections = pd.read_csv('../detections/2017/2017_test_v4_bigsf_clf2.txt')

In [24]:
# add class 2 (earthquakes) to noise
def combine_classes_catalog_2(cat):
    """Relabel every row with event_class 2 as event_class 0.

    Per the original comments, class 2 is "Earthquake Illgraben" and class 0
    is noise. The frame is modified in place and the same object is returned.
    """
    is_class_two = cat['event_class'].eq(2)
    cat.loc[is_class_two, 'event_class'] = 0
    return cat

# Fold class 2 into class 0 in both feature tables.
feature_test = combine_classes_catalog_2(feature_test)
feature_train = combine_classes_catalog_2(feature_train)

# Keep only the SF rows (event_class == 1) and renumber them from 0.
is_sf_test = feature_test['event_class'].eq(1)
feature_test = feature_test.loc[is_sf_test].reset_index(drop=True)
is_sf_train = feature_train['event_class'].eq(1)
feature_train = feature_train.loc[is_sf_train].reset_index(drop=True)

## Test Data Set

In [25]:
# Pick the catalog rows whose event_idx occurs in the test feature table.
# reset_index() without drop keeps the former row labels in an 'index' column.
in_test_split = catalog['event_idx'].isin(feature_test['event_idx'])
catalog_test = catalog.loc[in_test_split].reset_index()
catalog_test

Unnamed: 0,index,event_idx,slice_idx,class,mean_time,station
0,45,17,0,3.0,2017-05-29T21:08:42.748500Z,ILL08
1,46,17,1,3.0,2017-05-29T21:08:55.748500Z,ILL08
2,47,17,2,3.0,2017-05-29T21:09:08.748500Z,ILL08
3,48,17,3,3.0,2017-05-29T21:09:21.748500Z,ILL08
4,55,20,0,3.0,2017-05-30T13:43:03.679400Z,ILL08
5,56,20,1,3.0,2017-05-30T13:43:16.679400Z,ILL08
6,57,20,2,3.0,2017-05-30T13:43:29.679400Z,ILL08
7,58,20,3,3.0,2017-05-30T13:43:42.679400Z,ILL08
8,69,24,0,3.0,2017-05-30T19:24:06.221800Z,ILL08
9,70,24,1,3.0,2017-05-30T19:24:19.221800Z,ILL08


In [26]:
# Merge the consecutive time windows (slices) of one event into a single
# labelled span with one start time, one end time and one station id.
# Author's rationale for the padding: the mean_time is 20 s after the time
# window, and another 10 s are added to cover the picking uncertainty.
pad_s = 30  # 30s = 20s(mean_time)+10s(overlap)

# Work on plain lists so the look-ahead below is purely positional and does
# not depend on catalog_test carrying a 0..n-1 index.
slice_ids = catalog_test['slice_idx'].tolist()
mean_times = catalog_test['mean_time'].tolist()
station_codes = catalog_test['station'].tolist()
last_pos = len(slice_ids) - 1

s_times = []
e_times = []
stations = []
for pos, (slice_id, mean_time, station) in enumerate(zip(slice_ids, mean_times, station_codes)):
    window_time = obspy.UTCDateTime(str(mean_time))
    # Slice 0 opens a new event: record its padded start and its station.
    if slice_id == 0:
        s_times.append(window_time - pad_s)
        stations.append('XP.{}..EHZ'.format(station))
    # An event ends on the very last row, or as soon as the slice counter of
    # the following row does not increase any more.
    if pos == last_pos or slice_id >= slice_ids[pos + 1]:
        e_times.append(window_time + pad_s)

In [27]:
# Build the test catalog: one row per event with its padded start/end time.
# The three columns must have equal length, i.e. one start and one end per
# unique event_idx.
df_test = pd.DataFrame({
    'event_idx': catalog_test['event_idx'].unique(),
    'start_time': s_times,
    'end_time': e_times,
})
df_test

Unnamed: 0,event_idx,start_time,end_time
0,17,2017-05-29T21:08:12.748500Z,2017-05-29T21:09:51.748500Z
1,20,2017-05-30T13:42:33.679400Z,2017-05-30T13:44:12.679400Z
2,24,2017-05-30T19:23:36.221800Z,2017-05-30T19:24:49.221800Z
3,25,2017-05-30T20:00:53.591400Z,2017-05-30T20:02:19.591400Z
4,26,2017-05-30T20:03:54.857500Z,2017-05-30T20:05:07.857500Z
5,27,2017-05-30T21:34:24.330100Z,2017-05-30T21:35:50.330100Z
6,29,2017-05-30T23:41:02.466200Z,2017-05-30T23:42:41.466200Z
7,52,2017-06-03T00:01:23.053900Z,2017-06-03T00:02:36.053900Z
8,61,2017-06-04T00:01:37.370500Z,2017-06-04T00:03:16.370500Z
9,64,2017-06-04T01:06:55.112300Z,2017-06-04T01:08:34.112300Z


In [28]:
# Snuffler wants date and time as separate fields: cut every timestamp string
# at the 'T' and strip the last character of the time part.
start_parts = [str(stamp).split('T') for stamp in df_test['start_time']]
end_parts = [str(stamp).split('T') for stamp in df_test['end_time']]
s_date = [parts[0] for parts in start_parts]
s_time = [parts[1][:-1] for parts in start_parts]
e_date = [parts[0] for parts in end_parts]
e_time = [parts[1][:-1] for parts in end_parts]

In [29]:
# Assemble the test labels in the column layout written to the Snuffler file.
n_test_events = len(s_times)
df_test_snuffler = pd.DataFrame({
    's_date': s_date,
    's_time': s_time,
    'e_date': e_date,
    'e_time': e_time,
    # span length: end minus start of each event
    'dt': df_test['end_time'] - df_test['start_time'],
    # empty placeholder column
    'bla': [''] * n_test_events,
    # test labels are tagged with class 0
    'class': [0] * n_test_events,
    'station': stations,
})
df_test_snuffler

Unnamed: 0,s_date,s_time,e_date,e_time,dt,bla,class,station
0,2017-05-29,21:08:12.748500,2017-05-29,21:09:51.748500,99.0,,0,XP.ILL08..EHZ
1,2017-05-30,13:42:33.679400,2017-05-30,13:44:12.679400,99.0,,0,XP.ILL08..EHZ
2,2017-05-30,19:23:36.221800,2017-05-30,19:24:49.221800,73.0,,0,XP.ILL08..EHZ
3,2017-05-30,20:00:53.591400,2017-05-30,20:02:19.591400,86.0,,0,XP.ILL08..EHZ
4,2017-05-30,20:03:54.857500,2017-05-30,20:05:07.857500,73.0,,0,XP.ILL08..EHZ
5,2017-05-30,21:34:24.330100,2017-05-30,21:35:50.330100,86.0,,0,XP.ILL08..EHZ
6,2017-05-30,23:41:02.466200,2017-05-30,23:42:41.466200,99.0,,0,XP.ILL08..EHZ
7,2017-06-03,00:01:23.053900,2017-06-03,00:02:36.053900,73.0,,0,XP.ILL07..EHZ
8,2017-06-04,00:01:37.370500,2017-06-04,00:03:16.370500,99.0,,0,XP.ILL06..EHZ
9,2017-06-04,01:06:55.112300,2017-06-04,01:08:34.112300,99.0,,0,XP.ILL07..EHZ


## Training Data Set

In [30]:
# Pick the catalog rows whose event_idx occurs in the training feature table.
# reset_index() without drop keeps the former row labels in an 'index' column.
in_train_split = catalog['event_idx'].isin(feature_train['event_idx'])
catalog_train = catalog.loc[in_train_split].reset_index()
catalog_train

Unnamed: 0,index,event_idx,slice_idx,class,mean_time,station
0,49,18,0,3.0,2017-05-29T23:16:56.569800Z,ILL08
1,50,18,1,3.0,2017-05-29T23:17:09.569800Z,ILL08
2,51,18,2,3.0,2017-05-29T23:17:22.569800Z,ILL08
3,52,19,0,3.0,2017-05-30T07:03:52.093200Z,ILL08
4,53,19,1,3.0,2017-05-30T07:04:05.093200Z,ILL08
...,...,...,...,...,...,...
77,433,156,4,3.0,2017-06-15T02:50:08.643800Z,ILL08
78,434,157,0,3.0,2017-06-15T02:49:22.643800Z,ILL07
79,435,157,1,3.0,2017-06-15T02:49:35.643800Z,ILL07
80,436,157,2,3.0,2017-06-15T02:49:48.643800Z,ILL07


In [31]:
# Merge the consecutive time windows (slices) of one event into a single
# labelled span with one start time, one end time and one station id.
# Author's rationale for the padding: the mean_time is 20 s after the time
# window, and another 10 s are added to cover the picking uncertainty.
pad_s = 30  # 30s = 20s(mean_time)+10s(overlap)

# Work on plain lists so the look-ahead below is purely positional and does
# not depend on catalog_train carrying a 0..n-1 index.
slice_ids = catalog_train['slice_idx'].tolist()
mean_times = catalog_train['mean_time'].tolist()
station_codes = catalog_train['station'].tolist()
last_pos = len(slice_ids) - 1

s_times = []
e_times = []
stations = []
for pos, (slice_id, mean_time, station) in enumerate(zip(slice_ids, mean_times, station_codes)):
    window_time = obspy.UTCDateTime(str(mean_time))
    # Slice 0 opens a new event: record its padded start and its station.
    if slice_id == 0:
        s_times.append(window_time - pad_s)
        stations.append('XP.{}..EHZ'.format(station))
    # An event ends on the very last row, or as soon as the slice counter of
    # the following row does not increase any more.
    if pos == last_pos or slice_id >= slice_ids[pos + 1]:
        e_times.append(window_time + pad_s)

In [32]:
# Build the training catalog: one row per event with its padded start/end
# time. The three columns must have equal length, i.e. one start and one end
# per unique event_idx.
df_train = pd.DataFrame({
    'event_idx': catalog_train['event_idx'].unique(),
    'start_time': s_times,
    'end_time': e_times,
})
df_train

Unnamed: 0,event_idx,start_time,end_time
0,18,2017-05-29T23:16:26.569800Z,2017-05-29T23:17:52.569800Z
1,19,2017-05-30T07:03:22.093200Z,2017-05-30T07:04:48.093200Z
2,21,2017-05-30T14:25:34.983800Z,2017-05-30T14:27:00.983800Z
3,22,2017-05-30T17:56:16.107700Z,2017-05-30T17:58:08.107700Z
4,23,2017-05-30T18:12:10.885600Z,2017-05-30T18:13:23.885600Z
5,28,2017-05-30T23:00:54.694400Z,2017-05-30T23:02:07.694400Z
6,30,2017-05-31T09:26:14.586900Z,2017-05-31T09:27:53.586900Z
7,31,2017-05-31T09:30:45.763900Z,2017-05-31T09:32:24.763900Z
8,32,2017-06-01T04:06:38.321900Z,2017-06-01T04:07:51.321900Z
9,45,2017-06-02T20:17:43.855400Z,2017-06-02T20:19:09.855400Z


In [33]:
# Snuffler wants date and time as separate fields: cut every timestamp string
# at the 'T' and strip the last character of the time part.
start_parts = [str(stamp).split('T') for stamp in df_train['start_time']]
end_parts = [str(stamp).split('T') for stamp in df_train['end_time']]
s_date = [parts[0] for parts in start_parts]
s_time = [parts[1][:-1] for parts in start_parts]
e_date = [parts[0] for parts in end_parts]
e_time = [parts[1][:-1] for parts in end_parts]

In [34]:
# Assemble the training labels in the column layout written to the Snuffler
# file.
n_train_events = len(s_times)
df_train_snuffler = pd.DataFrame({
    's_date': s_date,
    's_time': s_time,
    'e_date': e_date,
    'e_time': e_time,
    # span length: end minus start of each event
    'dt': df_train['end_time'] - df_train['start_time'],
    # empty placeholder column
    'bla': [''] * n_train_events,
    # training labels are tagged with class 2
    'class': [2] * n_train_events,
    'station': stations,
})
df_train_snuffler

Unnamed: 0,s_date,s_time,e_date,e_time,dt,bla,class,station
0,2017-05-29,23:16:26.569800,2017-05-29,23:17:52.569800,86.0,,2,XP.ILL08..EHZ
1,2017-05-30,07:03:22.093200,2017-05-30,07:04:48.093200,86.0,,2,XP.ILL08..EHZ
2,2017-05-30,14:25:34.983800,2017-05-30,14:27:00.983800,86.0,,2,XP.ILL08..EHZ
3,2017-05-30,17:56:16.107700,2017-05-30,17:58:08.107700,112.0,,2,XP.ILL08..EHZ
4,2017-05-30,18:12:10.885600,2017-05-30,18:13:23.885600,73.0,,2,XP.ILL08..EHZ
5,2017-05-30,23:00:54.694400,2017-05-30,23:02:07.694400,73.0,,2,XP.ILL08..EHZ
6,2017-05-31,09:26:14.586900,2017-05-31,09:27:53.586900,99.0,,2,XP.ILL08..EHZ
7,2017-05-31,09:30:45.763900,2017-05-31,09:32:24.763900,99.0,,2,XP.ILL08..EHZ
8,2017-06-01,04:06:38.321900,2017-06-01,04:07:51.321900,73.0,,2,XP.ILL08..EHZ
9,2017-06-02,20:17:43.855400,2017-06-02,20:19:09.855400,86.0,,2,XP.ILL06..EHZ


In [35]:
# split df column into two columns
#df_test['start_data'],df_test['start_times']=df_test['start_time'].str.split('T').str

## Detections

In [36]:
# Reduce the classifier output to the detections only: keep rows with
# classes == 1 (the noise time windows are dropped) and renumber from 0.
is_detection = detections['classes'].eq(1)
detections = detections.loc[is_detection].reset_index(drop=True)
detections

Unnamed: 0,datetimes,classes
0,2017-05-31 16:49:36.650,1
1,2017-05-31 16:49:56.650,1
2,2017-05-31 16:50:06.650,1
3,2017-05-31 16:50:16.650,1
4,2017-05-31 16:50:26.650,1
...,...,...
2088,2017-08-02 15:22:00.000,1
2089,2017-08-02 15:22:10.000,1
2090,2017-08-02 19:06:20.000,1
2091,2017-08-02 19:06:30.000,1


In [37]:
# Build events from the individual detections: detections that follow each
# other closely are merged into one event with a single start and end time.
merge_gap_s = 11   # neighbours further apart than this belong to different events
                   # (presumably the 10 s detection step plus 1 s tolerance -- TODO confirm)
window_len_s = 10  # an event ends this many seconds after its last detection

# Parse every timestamp exactly once. The neighbour comparisons below then
# compare UTCDateTime with UTCDateTime instead of relying on the comparison
# operators to coerce the raw strings of the 'datetimes' column.
detection_times = [obspy.UTCDateTime(str(stamp)) for stamp in detections['datetimes']]
last_pos = len(detection_times) - 1

s_times = []
e_times = []
for pos, current in enumerate(detection_times):
    # start time: first detection, or the previous one is more than the gap away
    if pos == 0 or current - merge_gap_s > detection_times[pos - 1]:
        s_times.append(current)
    # end time: last detection, or the next one is more than the gap away
    if pos == last_pos or current + merge_gap_s < detection_times[pos + 1]:
        e_times.append(current + window_len_s)
# sanity check: every event needs exactly one start and one end
print(len(s_times),len(e_times))

703 703


In [38]:
# Snuffler wants date and time as separate fields: cut every timestamp string
# at the 'T' and strip the last character of the time part.
start_parts = [str(stamp).split('T') for stamp in s_times]
end_parts = [str(stamp).split('T') for stamp in e_times]
s_date = [parts[0] for parts in start_parts]
s_time = [parts[1][:-1] for parts in start_parts]
e_date = [parts[0] for parts in end_parts]
e_time = [parts[1][:-1] for parts in end_parts]

In [39]:
# One row per merged detection event with its start and end time.
df_detection = pd.DataFrame({'start_time': s_times, 'end_time': e_times})
df_detection

Unnamed: 0,start_time,end_time
0,2017-05-31T16:49:36.650000Z,2017-05-31T16:49:46.650000Z
1,2017-05-31T16:49:56.650000Z,2017-05-31T16:50:36.650000Z
2,2017-05-31T18:18:16.650000Z,2017-05-31T18:18:26.650000Z
3,2017-06-01T04:07:20.000000Z,2017-06-01T04:07:30.000000Z
4,2017-06-01T04:38:20.000000Z,2017-06-01T04:39:00.000000Z
...,...,...
698,2017-08-02T10:10:30.000000Z,2017-08-02T10:11:00.000000Z
699,2017-08-02T14:42:20.000000Z,2017-08-02T14:43:00.000000Z
700,2017-08-02T15:22:00.000000Z,2017-08-02T15:22:20.000000Z
701,2017-08-02T19:06:20.000000Z,2017-08-02T19:06:40.000000Z


In [40]:
# Assemble the detection events in the column layout written to the Snuffler
# file.
n_detection_events = len(s_times)
df_detection_snuffler = pd.DataFrame({
    's_date': s_date,
    's_time': s_time,
    'e_date': e_date,
    'e_time': e_time,
    # span length: end minus start of each event
    'dt': df_detection['end_time'] - df_detection['start_time'],
    # empty placeholder column
    'bla': [''] * n_detection_events,
    # detections are tagged with class 1
    'class': [1] * n_detection_events,
    # detections carry no station id; the literal string 'None' is stored
    'station': ['None'] * n_detection_events,
})
df_detection_snuffler

Unnamed: 0,s_date,s_time,e_date,e_time,dt,bla,class,station
0,2017-05-31,16:49:36.650000,2017-05-31,16:49:46.650000,10.0,,1,
1,2017-05-31,16:49:56.650000,2017-05-31,16:50:36.650000,40.0,,1,
2,2017-05-31,18:18:16.650000,2017-05-31,18:18:26.650000,10.0,,1,
3,2017-06-01,04:07:20.000000,2017-06-01,04:07:30.000000,10.0,,1,
4,2017-06-01,04:38:20.000000,2017-06-01,04:39:00.000000,40.0,,1,
...,...,...,...,...,...,...,...,...
698,2017-08-02,10:10:30.000000,2017-08-02,10:11:00.000000,30.0,,1,
699,2017-08-02,14:42:20.000000,2017-08-02,14:43:00.000000,40.0,,1,
700,2017-08-02,15:22:00.000000,2017-08-02,15:22:20.000000,20.0,,1,
701,2017-08-02,19:06:20.000000,2017-08-02,19:06:40.000000,20.0,,1,


## Merge all DF

In [41]:
# Stack test labels, training labels and detections (in that order) into one
# table with a fresh running index.
snuffler_frames = [df_test_snuffler, df_train_snuffler, df_detection_snuffler]
df_merge = pd.concat(snuffler_frames, ignore_index=True)
#df_merge = df_test_snuffler.merge(df_detection_snuffler, how='outer')
df_merge

Unnamed: 0,s_date,s_time,e_date,e_time,dt,bla,class,station
0,2017-05-29,21:08:12.748500,2017-05-29,21:09:51.748500,99.0,,0,XP.ILL08..EHZ
1,2017-05-30,13:42:33.679400,2017-05-30,13:44:12.679400,99.0,,0,XP.ILL08..EHZ
2,2017-05-30,19:23:36.221800,2017-05-30,19:24:49.221800,73.0,,0,XP.ILL08..EHZ
3,2017-05-30,20:00:53.591400,2017-05-30,20:02:19.591400,86.0,,0,XP.ILL08..EHZ
4,2017-05-30,20:03:54.857500,2017-05-30,20:05:07.857500,73.0,,0,XP.ILL08..EHZ
...,...,...,...,...,...,...,...,...
742,2017-08-02,10:10:30.000000,2017-08-02,10:11:00.000000,30.0,,1,
743,2017-08-02,14:42:20.000000,2017-08-02,14:43:00.000000,40.0,,1,
744,2017-08-02,15:22:00.000000,2017-08-02,15:22:20.000000,20.0,,1,
745,2017-08-02,19:06:20.000000,2017-08-02,19:06:40.000000,20.0,,1,


In [42]:
# make the file readable for snuffler
# The file is written in two steps: first it is (re)created with the marker
# file header line, then the merged table is appended as space-separated rows
# without column names and without the index.
snuffler_marker_path = '/data/wsd03/data_manuela/Illgraben/snuffler_files/labels_vs_detections_for_snuffler.txt'

with open(snuffler_marker_path, 'w') as marker_file:
    marker_file.write("# Snuffler Markers File Version 0.2\n")

df_merge.to_csv(snuffler_marker_path, sep=' ', header=False, index=False, mode='a')