# Library Import
This section imports the libraries required for data processing and feature extraction. The Google Cloud Storage import is commented out; enable it only if you want to upload the resulting DataFrame to the cloud.

In [3]:
import pandas as pd
import numpy as np
from obspy import read
from utils.feature_extraction_mars import *
from timeit import default_timer as timer
from multiprocessing import Pool, cpu_count
import os
# from google.cloud import storage # Uncomment this line if you want to use Cloud Storage to upload the dataframe

# Loading the Data Catalog

In [2]:
# Allow up to 500 columns to be rendered when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Training catalog: one row per event, giving the data filename and the
# event's absolute and relative times.
catalog_path = './data/mars/training/catalogs/Mars_InSight_training_catalog_final.csv'
df_geral = pd.read_csv(catalog_path)

df_geral.head()

Unnamed: 0,filename,time_abs(%Y-%m-%dT%H:%M:%S.%f),time_rel(sec),evid
0,XB.ELYSE.02.BHV.2022-02-03HR08_evid0005.csv,2022-02-03T08:08:27.000000,507.0,evid0005
1,XB.ELYSE.02.BHV.2022-01-02HR04_evid0006.csv,2022-01-02T04:35:30.000000,2130.0,evid0006


# Function for Processing Events (CSV and mseed)
- All data from the CSV files and the stats (header metadata) from the mseed files are extracted.
- **Additionally, a utility module creates new event features using mathematical equations with the SciPy library; all of these new features are used to train the neural network.**

## Each catalog row represents an event from the Mars (InSight) dataset. For each event:
- CSV data is loaded and processed. **We are adding the filename, evid and a label to the dataframe**.
- Data from mseed files (header stats such as network, station, channel and sampling rate) is extracted.
- The CSV and mseed data are combined and returned as a single DataFrame.

In [4]:
# Build one DataFrame per catalog event and concatenate them ONCE at the end.
# Calling pd.concat inside the loop re-copies every previously accumulated row
# on each iteration, so the total cost grows quadratically with the number of
# events; collecting the frames in a list keeps it linear.
event_frames = []

# Header fields copied from the mseed trace stats into every row of the event.
MSEED_STAT_COLUMNS = (
    'network', 'station', 'location', 'channel',
    'sampling_rate', 'delta', 'npts', 'calib',
)

# Keys of the mapping returned by process_seismic_data that become columns.
# NOTE(review): process_seismic_data comes from utils.feature_extraction_mars;
# whether each value is a scalar or a per-sample array is not visible here.
FEATURE_COLUMNS = (
    'mean_velocity', 'std_velocity', 'max_velocity', 'min_velocity',
    'total_energy', 'rms_value', 'peak_count', 'valley_count',
    'fft_values', 'fft_freqs', 'autocorrelation',
    'acceleration', 'jerk', 'cumulative_energy',
)

for index, row in df_geral.iterrows():
    try:
        print(index)
        start = timer()

        # Catalog filenames carry a ".csv" suffix; strip it so the same stem
        # can be used for both the .csv and the .mseed file.
        filename = row['filename'].split('.csv')[0]
        time_rel_label = row['time_rel(sec)']
        data_directory = f'./data/mars/training/data/{filename}'

        # Read CSV data
        df_data_csv = pd.read_csv(f"{data_directory}.csv", parse_dates=['time(%Y-%m-%dT%H:%M:%S.%f)'])
        # Label is 1 for every sample at or after the catalogued relative
        # event time, 0 before it.
        df_data_csv['label'] = (df_data_csv['rel_time(sec)'] >= time_rel_label).astype(int)
        df_data_csv['filename'] = filename
        df_data_csv['evid'] = row['evid']

        # Add header stats from the first trace of the mseed file.
        mseed_file = f'{data_directory}.mseed'
        st = read(mseed_file)
        trace_stats = st[0].stats
        for stat_name in MSEED_STAT_COLUMNS:
            df_data_csv[stat_name] = trace_stats[stat_name]

        # Add the engineered features (the frame passed in already contains
        # the label, identifiers and mseed stats columns, as in the original).
        sampling_rate = trace_stats['sampling_rate']
        features = process_seismic_data(df_data_csv, sampling_rate)
        for feature_name in FEATURE_COLUMNS:
            df_data_csv[feature_name] = features[feature_name]

        # Only events that were fully processed are kept.
        event_frames.append(df_data_csv)
        end = timer()  # Stop the timer
        elapsed_time = end - start  # Calculate elapsed time
        print(f"Took {elapsed_time:.4f} seconds to process index {index}",end='\n')
    except Exception as e:
        # Best-effort: skip the failing event, but report which one failed and
        # the exception type so missing/corrupt files can be traced.
        print(f"Skipping index {index}: {type(e).__name__}: {e}")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no event could be processed.
df_combined = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()
    

0
Took 21.3672 seconds to process index 0
1
Took 21.2032 seconds to process index 1


In [5]:
# Preview the first five rows of the combined per-sample dataset.
df_combined.head(n=5)

Unnamed: 0,time(%Y-%m-%dT%H:%M:%S.%f),rel_time(sec),velocity(c/s),label,filename,evid,network,station,location,channel,sampling_rate,delta,npts,calib,mean_velocity,std_velocity,max_velocity,min_velocity,total_energy,rms_value,peak_count,valley_count,fft_values,fft_freqs,autocorrelation,acceleration,jerk,cumulative_energy
0,2022-02-03 08:00:00.009,0.0,0.0,0,XB.ELYSE.02.BHV.2022-02-03HR08_evid0005,evid0005,XB,ELYSE,2,BHV,20.0,0.05,72000,1.0,0.506711,141.951968,1824.427368,-2541.654297,1450844000.0,141.952872,18949,18949,3.648320e+04-0.000000e+ 00j,0.0,1450844000.0,0.002564,0.08941,0.0
1,2022-02-03 08:00:00.059,0.05,0.000128,0,XB.ELYSE.02.BHV.2022-02-03HR08_evid0005,evid0005,XB,ELYSE,2,BHV,20.0,0.05,72000,1.0,0.506711,141.951968,1824.427368,-2541.654297,1450844000.0,141.952872,18949,18949,-9.771833e+05-1.100181e+ 06j,0.000278,1352846000.0,0.007034,-0.150863,4.108762e-10
2,2022-02-03 08:00:00.109,0.1,0.000703,0,XB.ELYSE.02.BHV.2022-02-03HR08_evid0005,evid0005,XB,ELYSE,2,BHV,20.0,0.05,72000,1.0,0.506711,141.951968,1824.427368,-2541.654297,1450844000.0,141.952872,18949,18949,-1.674679e+04-5.972604e+ 05j,0.000556,1234131000.0,-0.012522,-0.539167,1.319276e-08
3,2022-02-03 08:00:00.159,0.15,-0.001124,0,XB.ELYSE.02.BHV.2022-02-03HR08_evid0005,evid0005,XB,ELYSE,2,BHV,20.0,0.05,72000,1.0,0.506711,141.951968,1824.427368,-2541.654297,1450844000.0,141.952872,18949,18949,-1.897180e+05-7.708000e+ 05j,0.000833,1149519000.0,-0.046882,0.21144,5.714996e-08
4,2022-02-03 08:00:00.209,0.2,-0.003985,0,XB.ELYSE.02.BHV.2022-02-03HR08_evid0005,evid0005,XB,ELYSE,2,BHV,20.0,0.05,72000,1.0,0.506711,141.951968,1824.427368,-2541.654297,1450844000.0,141.952872,18949,18949,1.740404e+05-7.705140e+ 04j,0.001111,1023060000.0,0.008622,0.107737,4.856965e-07


# Saving the File Locally

In [7]:
# Write the combined dataset to the working directory. The DataFrame index is
# written as the first (unnamed) column, as pandas does by default.
output_path = "./training_mars.csv"
df_combined.to_csv(output_path)