In [1]:
import matplotlib

# Memory leak with Matplotlib when in interactive mode and writing 1000+ plots
matplotlib.use('agg')
matplotlib.interactive(False)

import obspy, os, glob

CSV = 'Benz_catalog.csv' # 'OK_2014-2015-2016.csv' # 

# In seconds
DURATION = 20 
PRE_PADDING = 10
POST_PADDING = 10

DATA_PATH = os.path.join(os.getcwd(), 'data')
SPECTROGRAM_PATH = os.path.join(os.getcwd(), 'spectrograms')

stream_paths = glob.glob(os.path.join(DATA_PATH, 'train_mseed/*.mseed'))

In [2]:
from pathlib import Path
from datetime import date
from collections import defaultdict
from obspy import read

class StreamPath:
    def __init__(self, path):
        self.raw_path = path
        self.path = Path(path)
        self.station = self.path.parts[-1].split('_')[0]
        date_str = self.path.parts[-1].split('_')[1].replace('.mseed', "")
        month, year = date_str.split('-')
        self.date_start = date(int(year), int(month), day=1)
        
    def load(self):
        """ Loads a stream from the path """
        return read(self.raw_path)
        
    def __str__(self):
        return self.raw_path
    
paths = list(map(StreamPath, stream_paths))
s1_date_paths = {stream.date_start: stream for stream in paths if stream.station == 'GSOK029'}
s2_date_paths = {stream.date_start: stream for stream in paths if stream.station == 'GSOK027'}

In [3]:
from obspy import Stream
from operator import add
from functools import reduce
from obspy import read

# Combine all streams into one
# stream = reduce(add, map(read, stream_paths))

In [4]:
import pandas as pd

quake_csv = pd.read_csv(os.path.join(DATA_PATH, CSV))

In [5]:
def change_column_name(df, column_index, new_name):
    columns = df.columns.values
    columns[column_index] = new_name
    df.columns = columns
    
change_column_name(quake_csv, 0, 'EventNum')

quake_csv[:10]

Unnamed: 0,EventNum,Date,Time,Magnitude,Variance,origintime,utc_timestamp
0,0,02/15/2014,00:02:41,0.37,0.95,2014-02-15T00:02:41.000000Z,1392423000.0
1,1,02/15/2014,00:03:45,-0.44,0.86,2014-02-15T00:03:45.000000Z,1392423000.0
2,2,02/15/2014,00:08:07,-0.18,0.93,2014-02-15T00:08:07.000000Z,1392423000.0
3,3,02/15/2014,00:12:52,0.1,0.93,2014-02-15T00:12:52.000000Z,1392423000.0
4,4,02/15/2014,00:14:09,-0.47,0.89,2014-02-15T00:14:09.000000Z,1392423000.0
5,5,02/15/2014,00:16:37,-0.76,0.62,2014-02-15T00:16:37.000000Z,1392423000.0
6,6,02/15/2014,00:16:52,0.67,0.78,2014-02-15T00:16:52.000000Z,1392423000.0
7,7,02/15/2014,00:19:12,3.51,0.98,2014-02-15T00:19:12.000000Z,1392424000.0
8,8,02/15/2014,00:20:51,0.22,0.63,2014-02-15T00:20:51.000000Z,1392424000.0
9,9,02/15/2014,00:21:06,-0.13,0.42,2014-02-15T00:21:06.000000Z,1392424000.0


In [6]:
import sys
sys.path.append('..')
import download_waveforms.code.spectrograms as spectro
import download_waveforms.code.filter as filt
from download_waveforms.code import helpers


In [7]:
from itertools import islice
from obspy import UTCDateTime

def gen_row_date(df):
    for index, row in df.iterrows():
        year, month, day = row.origintime[:10].split('-')
        yield date(int(year), int(month), day=1), row
        
        
def gen_filter_waves(df, date_paths, pre_padding=PRE_PADDING, post_padding=POST_PADDING):
    curr_date = None
    stream = None

    for dat, row in gen_row_date(df):
        if dat != curr_date:
            try:
                stream_path = date_paths[dat]
            except KeyError:
                print(f"{dat} not in the stream_path")
                continue
                
            stream = stream_path.load()
            curr_date = dat
            
        # Start PRE_PADDING before event_time, end POST_PADDING after event_time
        yield filt.filter_waveform(stream, UTCDateTime(row.origintime), pre_padding, post_padding)
        
def gen_filter_waves_from_times(times, date_paths, pre_padding=PRE_PADDING, post_padding=POST_PADDING):
    curr_date = None
    stream = None

    for time in times:
        dat = date(int(time.year), int(time.month), day=1)
        if dat != curr_date:
            try:
                stream_path = date_paths[dat]
            except KeyError:
                print(f"{dat} not in the stream_path")
                continue
                
            stream = stream_path.load()
            curr_date = dat
            
        # Start PRE_PADDING before event_time, end POST_PADDING after event_time
        yield filt.filter_waveform(stream, time, pre_padding, post_padding)


In [8]:
date_paths = s1_date_paths
amount = 30000

In [9]:
import warnings
quake_path = os.path.join(SPECTROGRAM_PATH, 'benz/train/local')
noise_path = os.path.join(SPECTROGRAM_PATH, 'benz/train/noise')

### Write Quakes

In [11]:
quake_waves = gen_filter_waves(quake_csv, date_paths)

In [None]:
with warnings.catch_warnings():   
    warnings.simplefilter("ignore")
    spectro.async_write_spectrograms(islice(quake_waves, amount), quake_path, ignoreexceptions=True)

Writing Files...


### Write Noise

In [10]:
def get_csv_times(df):
    times = df[['origintime']]
    for row in times.iterrows():
        time = row[1].origintime
        yield UTCDateTime(time)

times = sorted(get_csv_times(quake_csv))


In [11]:
noise_times = helpers.get_noise_times(times_to_exclude=times, 
                                      startafter=times[0], 
                                      endbefore=times[-1],
                                      amount=amount, 
                                      duration=DURATION)

In [12]:
noise_times = sorted(noise_times)  # important to make lazy_loading the streams better for the next function

In [13]:
# Noise times centers around the given time by default... so pad 10 seconds in each direction
noise_waves = gen_filter_waves_from_times(noise_times, date_paths, 10, 10)

In [14]:
with warnings.catch_warnings():   
    warnings.simplefilter("ignore")
    spectro.async_write_spectrograms(noise_waves, noise_path, ignoreexceptions=True)

Writing Files...
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_path
2014-07-01 not in the stream_pa

### Clean Up

In [15]:
def clean_up(path):
    """ Removes empty dirs """
    folders = glob.glob(os.path.join(path, '*/'))
    for folder in folders:
        if not os.listdir(folder):
             os.rmdir(folder)

clean_up(quake_path)
clean_up(noise_path)