<figure>
  <IMG SRC="logoeost.png" WIDTH=100 ALIGN="right">
</figure>

# Classification of Seismic Sources - Random Forest Classifier


Based on, and with the courtesy of, the "*IA in geosciences*" practical by C. Hibert / 28 January 2020.

Adapted for the Skience2024 workshop by Thomas Lecocq.

---------

In this tutorial you will see how to implement a machine learning algorithm for a discrimination/classification problem using the Python library `scikit-learn`. This library is very comprehensive and one of the most widely used in the world for everything to do with Machine Learning.

You will be working on seismological data, with the aim of achieving the best rate of correct identification between any number of sources: signals generated by volcano-tectonic earthquakes, other types of volcano-generated signals, as well as noise samples. Having an algorithm that can make this discrimination on continuous data will make it possible to reconstruct chronicles of events on a volcano. These chronicles will potentially provide a better understanding of the volcano's dynamics.

## Extract Features from our new detections

This notebook will compute 58 attributes for each seismic trace.

In [None]:
%matplotlib inline
import matplotlib
import os
import glob
import datetime
import traceback
from obspy.core import read, UTCDateTime
from obspy import UTCDateTime, Stream, read
from obspy.geodetics.base import gps2dist_azimuth
from obspy.core.util import AttribDict
import matplotlib
import matplotlib as mpl
new_style = {'grid': False}
mpl.rc('axes', **new_style)
# mpl.rcParams['font.family'] = 'Helvetica'
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
sns.set_palette("dark")
# import tqdm
from obspy.clients.filesystem.sds import Client
from obspy.signal.cross_correlation import xcorr_pick_correction
import warnings
from collections import defaultdict
from obspy.signal.cross_correlation import correlate,xcorr_max
import sys
sys.path.append(".")
from ComputeAttributesV_MAT import calculate_all_attributes, get_attribute_names


In [None]:
# Root of the waveform archive handed to the SDS filesystem client further down.
# The location is machine-specific, so it can be overridden through the
# SDS_ROOT environment variable without editing the notebook; the original
# hard-coded value is kept as the default.
# NOTE(review): the default ends in "merapi_stationxml.xml", which looks like a
# StationXML file rather than an SDS directory tree - confirm the intended path.
sds_root = os.environ.get("SDS_ROOT", r"D:\DATA\merapi\merapi_stationxml.xml")

In [None]:
# Load the trigger list: whitespace-separated columns, no header row, read as
# "starttime" and "duration". The start time is read as plain text and
# converted explicitly, because read_csv's `date_parser` argument is deprecated
# in pandas 2.x. The path is built with os.path.join so it is not tied to
# Windows separators.
trigger_file = os.path.join("XM_trigger", "XM_trigger.txt")
df = pd.read_csv(
    trigger_file,
    sep=r"\s+",
    header=None,
    names=["starttime", "duration"],
    dtype={"starttime": str},
)
df["starttime"] = [UTCDateTime(t) for t in df["starttime"]]

# Seconds elapsed since the previous trigger (0 for the first one). The
# differences are taken on a plain numpy array of epoch seconds: subtracting
# two shifted pandas Series aligns them on their index labels instead of
# pairing each event with its predecessor, and yields one element too many.
start_seconds = np.array([t.timestamp for t in df["starttime"]], dtype=float)
df["inter_event"] = np.diff(start_seconds, prepend=start_seconds[:1])

# Index the frame by start time (as plain datetimes) for time-based plotting.
df.index = pd.DatetimeIndex([t.datetime for t in df["starttime"]])
df.head()

In [None]:
# Threshold on the inter-event time (s); drawn as the red dashed line below.
min_interevent_type = 60

# Histogram of the inter-event times, using 60 bin edges spanning 0 to 600.
hist_edges = np.linspace(0, 600, 60)
fig_hist, ax_hist = plt.subplots()
ax_hist.hist(df["inter_event"], bins=hist_edges)
ax_hist.axvline(min_interevent_type, c='r', ls='--')
ax_hist.set_xlabel("inter event time (s)")
ax_hist.set_ylabel("N")
plt.show()

# Inter-event time of every trigger plotted against the frame index.
fig_series, ax_series = plt.subplots()
ax_series.scatter(df.index, df["inter_event"], marker=".")
ax_series.axhline(min_interevent_type, c='r', ls='--')
ax_series.set_ylim(0, 2000)
plt.show()

In [None]:
# Report how many triggers fall below / at-or-above the threshold, then keep
# only the latter.
too_close = df.inter_event < min_interevent_type
well_separated = df.inter_event >= min_interevent_type
print(len(df[too_close]), len(df[well_separated]))
df = df[well_separated]

In [None]:
# Station and channel codes of the data processed in this notebook.
station, channel = "GRW0", "BHZ"
# Attribute arrays, filled further down and keyed by event type.
attributes = dict()

In [None]:
# Output folder (one per station/channel) for the attribute arrays and figures.
outfolder = os.path.join("attributes", "%s.%s" % (station, channel))
os.makedirs(outfolder, exist_ok=True)
c = Client(sds_root)

# Fetch a 60 s window starting at each trigger time, for the configured
# station and channel (any location code). Triggers for which the archive
# returns no data are dropped from `df` as well, so that row k of `df` always
# describes trace k of `st`; later cells index both by the same positions.
events = []
rows_with_data = []
for row, (row_label, event) in enumerate(df.iterrows()):
    print(event.starttime)
    st = c.get_waveforms("XM", station, "*", channel, event.starttime, event.starttime + 60)
    if len(st):
        # Merge the returned segments, interpolating over gaps, and keep the
        # first resulting trace.
        st.merge(fill_value="interpolate", method=1)
        events.append(st[0])
        rows_with_data.append(row)
df = df.iloc[rows_with_data]
st = Stream(traces=events)



In [None]:
# Compute the attribute vector of every trace in `st`, or reuse the array
# cached on disk by a previous run.
typ = "unknown"
outfile = os.path.join(outfolder, "%s.npy" % typ)
if os.path.isfile(outfile):
    cached = np.load(outfile)
    # A cache produced from a different selection of events would silently
    # misalign attributes and traces in the cells below; stop instead.
    if len(cached) != len(st):
        raise ValueError(
            "%s holds %d rows but %d traces were loaded; delete the file to recompute it"
            % (outfile, len(cached), len(st))
        )
    attributes[typ] = cached
else:
    rows = []
    for tr in st:
        print("Processing", tr.id, tr.stats.starttime)
        # Use each trace's own sampling rate rather than that of the first
        # trace. NOTE(review): the meaning of the third argument (0) and of
        # the [0] element is defined in ComputeAttributesV_MAT - confirm there.
        rows.append(calculate_all_attributes(tr.data, tr.stats.sampling_rate, 0)[0])
    attributes[typ] = np.asarray(rows)
    np.save(outfile, attributes[typ])

In [None]:
# Save one scatter plot per attribute (value against event start time) into
# the output folder.
names = get_attribute_names()
# Plain datetimes are used for the time axis so that matplotlib draws it as a
# date axis instead of having to interpret obspy UTCDateTime objects.
x = [tr.stats.starttime.datetime for tr in st]
for i in range(attributes[typ].shape[1]):
    values = attributes[typ][:, i]
    fig, ax = plt.subplots()
    ax.scatter(x, values, label=typ)
    ax.legend()
    ax.set_title(names[i])
    ax.set_xlabel("Event start time")
    ax.set_ylabel("Attribute value")
    fig.savefig(os.path.join(outfolder, "unk_attr_%02i.png" % i))
    # Figures are only written to disk; release each one once saved.
    plt.close(fig)

In [None]:
# load, no need to initialize the loaded_rf
import joblib
RF = joblib.load("./random_forest.joblib")

In [None]:
# Show the loaded model's representation as the cell output.
RF

In [None]:
# Clean the attributes: keep only the events whose attribute vector is entirely
# finite, and build the matching subsets of the trigger table and the stream.

# The three containers are indexed by the same positions below, so they must
# describe the same events in the same order.
n_events = len(attributes[typ])
if not (len(df) == len(st) == n_events):
    raise ValueError(
        "df (%d rows), st (%d traces) and attributes[%r] (%d rows) are out of sync"
        % (len(df), len(st), typ, n_events)
    )

final = []    # attribute vectors passed to the classifier
to_keep = []  # positions of the retained events
for i, attr in enumerate(attributes[typ]):
    if np.all(np.isfinite(attr)):
        final.append(attr)
        to_keep.append(i)
    else:
        print("kaput", i, attr)

# Copy after selecting: only the retained rows/traces are duplicated, and
# final_df is an independent frame that can receive new columns later on.
final_df = df.iloc[to_keep].copy()
final_st = Stream(traces=[st[i].copy() for i in to_keep])

In [None]:
# Classify every retained event from its attribute vector (one row per event).
feature_matrix = np.asarray(final)
predictions = RF.predict(feature_matrix)

In [None]:
# Print the number of retained traces next to the number of predictions.
n_traces = len(final_st)
n_predictions = len(predictions)
print(n_traces, n_predictions)

In [None]:
# Class labels; a later cell maps each prediction to a label by indexing this
# list with the predicted value.
classnames = ['VTB', 'MP', "gugu_short", "gugu_long", "NN", "ND"]

# Number of events per predicted class: one unit-wide bin per entry of
# `classnames` (instead of a fixed count of bins), each tick centred on its bin.
fig, ax = plt.subplots()
ax.hist(predictions, bins=np.arange(len(classnames) + 1))
ax.set_xticks(np.arange(len(classnames)) + 0.5)
ax.set_xticklabels(classnames, rotation="vertical")
ax.set_xlabel("Predicted class")
ax.set_ylabel("N")
plt.show()

In [None]:
# Plot the traces predicted as `classname`, stacked MAX per figure, and save
# each figure as "<classname>-<index of its last event>.png".
MAX = 50
classname = "VTB"

# Derive the numeric label from the class name so the two cannot drift apart.
class_id = classnames.index(classname)
selected = np.where(predictions == class_id)[0]

for i, v in enumerate(selected):
    k = i % MAX  # vertical slot of this trace on the current page
    if k == 0:
        fig, ax = plt.subplots(1, 1, figsize=(15, 40))

    # Work on a copy so the stored trace stays unfiltered.
    tr = final_st[v].copy()
    tr.filter("highpass", freq=1)
    tr.normalize()

    ax.plot(tr.times(), tr.data + k, c='k', lw=1)
    ax.text(0, k, str(tr.stats.starttime))
    ax.set_xlim(10, 60)

    # Finish the page when it is full, and also after the very last trace so
    # that an incomplete final page is written out too.
    if k == MAX - 1 or i == len(selected) - 1:
        ax.margins(0)
        fig.savefig('%s-%02i.png' % (classname, i))
        # Display the page, then release it so figures do not accumulate.
        plt.show()
        plt.close(fig)


In [None]:
# Translate each predicted value into its class label and store it per event.
predicted_labels = [classnames[class_id] for class_id in predictions]
final_df["event_class"] = predicted_labels

In [None]:
# Predicted class of every event against its time index.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax.scatter(final_df.index, final_df["event_class"], marker=".", s=40, alpha=0.2)
plt.xticks(rotation=90)

In [None]:
# Tag every event with its calendar day, then count the events per day and per
# predicted class.
final_df['timestep'] = final_df.index.strftime('%Y-%m-%d')
rate = pd.crosstab(index=final_df["timestep"], columns=final_df["event_class"])
rate

In [None]:
# Daily counts of the two classes of interest, as side-by-side bars.
# reindex() fills a column of zeros when one of the classes was never
# predicted (its column is then absent from the crosstab), where plain column
# selection would raise a KeyError.
rate.reindex(columns=["VTB", "MP"], fill_value=0).plot(kind='bar', stacked=False, figsize=(18, 5))