In [1]:
import pandas as pd
import h5py
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sns
import gc
import random
from datetime import datetime
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import scipy
import os
from scipy import signal as sp
import torch.nn.functional as F
from lightning import seed_everything

import my_utils_pre_post as myu
import utils_pre_post as u


dataToProcess = "NRCA" #sys.argv[1]
num_classes = 9 #int(sys.argv[2])
seed = 42 #int(sys.argv[3])
path = '/scicore/home/dokman0000/liu0003/projects/seisLM/data/wetransfer_classify_generic_norcia-py_2024-06-24_1530/'
trValTest_split_rnd = False
force_traces_in_test=[]

In [2]:
def my_preprocess():
  df_pre = pd.read_pickle(path+'dataframe_pre_'+dataToProcess+'.csv')
  df_post = pd.read_pickle(path+'dataframe_post_'+dataToProcess+'.csv')

  if num_classes % 2 == 1:
    df_visso = pd.read_pickle(path+'dataframe_visso_'+dataToProcess+'.csv')
  else:
    df_visso = None


  df_pre, df_visso, df_post = myu.equallize_dataset_length(
    df_pre, df_visso, df_post, num_classes, seed=seed
  )


  df_pre['trace_start_time'] = df_pre['trace_start_time'].apply(
    lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ'))

  if isinstance(df_visso, pd.DataFrame):
    df_visso['trace_start_time'] = df_visso['trace_start_time'].apply(
      lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ')
  )

  df_post['trace_start_time'] = df_post['trace_start_time'].apply(
    lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ'))

  if num_classes==2:
    df = pd.concat([df_pre, df_post], ignore_index=True)
  else:
    frames_pre = myu.split_df_into_class_dependent_frames(df_pre, num_classes, pre_or_post="pre")
    frames_post = myu.split_df_into_class_dependent_frames(df_post, num_classes, pre_or_post="post")
    if isinstance(df_visso, pd.DataFrame):
      frames_visso = myu.split_df_into_class_dependent_frames(df_visso, num_classes, pre_or_post="visso")
      df=pd.concat(
        [pd.concat(frames_pre),pd.concat(frames_visso),pd.concat(frames_post)],
        ignore_index=True
      )
    else:
      df=pd.concat(
        [pd.concat(frames_pre),pd.concat(frames_post)], ignore_index=True
      )

  df['source_origin_time'] = df['source_origin_time'].apply(
    lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))
  df['TTF'] = df.apply(lambda row: myu.compute_norcia_ttf(row), axis=1)
  return df


def laurenti_preprocess():
  seed_everything(seed)
  df_empty = pd.DataFrame(columns = ['E_channel', 'N_channel', 'Z_channel', 'trace_name', 'label',
      'trace_start_time', 'network_code', 'receiver_name', 'receiver_type',
      'receiver_elevation_m', 'receiver_latitude', 'receiver_longitude',
      'source_id', 'source_depth_km', 'source_latitude', 'source_longitude',
      'source_magnitude_type', 'source_magnitude', 'source_origin_time', 'p_travel_sec'])
  df_pre = df_empty.copy()
  df_visso = df_empty.copy() # if num_classes!=9 this df will remain empty
  df_post = df_empty.copy()

  df_pre = pd.read_pickle(path+'dataframe_pre_'+dataToProcess+'.csv')
  df_post = pd.read_pickle(path+'dataframe_post_'+dataToProcess+'.csv')
  if num_classes==9:
      df_visso = pd.read_pickle(path+'dataframe_visso_'+dataToProcess+'.csv')

  df_pre, df_visso, df_post=u.pre_post_equal_length(df_pre, df_visso, df_post,force_traces_in_test, num_classes)

  for i in force_traces_in_test:
      if (i not in df_pre['trace_name'].values) and (i not in df_visso['trace_name'].values) and (i not in df_post['trace_name'].values):
          print("WARNING: ", i," not in df_pre and df_post. This will cause an error.")

  df_pre['trace_start_time'] = df_pre['trace_start_time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ'))
  df_visso['trace_start_time'] = df_visso['trace_start_time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ'))
  df_post['trace_start_time'] = df_post['trace_start_time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ'))


  if num_classes==2:
      df=pd.concat([df_pre, df_post], ignore_index=True)
  else:
      frames_pre = u.frames_N_classes(df_pre,num_classes, pre_or_post="pre")
      frames_post = u.frames_N_classes(df_post,num_classes, pre_or_post="post")
      if num_classes==9:
          frames_visso = u.frames_N_classes(df_visso,num_classes, pre_or_post="visso")
          df=pd.concat([pd.concat(frames_pre),pd.concat(frames_visso),pd.concat(frames_post)], ignore_index=True)
      else:
          df=pd.concat([pd.concat(frames_pre),pd.concat(frames_post)], ignore_index=True)

  df['source_origin_time'] = df['source_origin_time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))
  df['TTF'] = df.apply (lambda row: u.add_TTF_in_sec(row), axis=1)
  return df


In [5]:
df1 = my_preprocess()
df2 = laurenti_preprocess()

df1.equals(df2)



Seed set to 42


len(df_pre): 4595 len(df_visso): 886 len(df_post): 3628
len(df_pre): 3544 len(df_visso): 886 len(df_post): 3544


Seed set to 42


len(df_pre): 4595 len(df_visso): 886 len(df_post): 3628
len(df_pre): 3544 len(df_visso): 886 len(df_post): 3544


True

In [None]:
len(df1[0])

3544

In [None]:
len(df2[0])

3544

In [None]:
len(df1)

3

In [None]:
len(df2)

3

In [None]:
df2

(                                              E_channel  \
 0     [-5488.498688773891, 3555.492691384645, 7863.4...   
 1     [-14.764081897217693, -234.4251182926355, -234...   
 2     [-67.77683746645266, -235.56564294569966, -267...   
 3     [94.19324644243034, 10.191393004812653, -7.810...   
 4     [8.327841045835612, 90.45252623401439, 75.5772...   
 ...                                                 ...   
 3539  [147.44815550997782, 152.47687975246322, 116.5...   
 3540  [13.268149814414983, -13.66365878780357, 33.40...   
 3541  [435.27932903577175, 394.2184383194443, 336.15...   
 3542  [-531.7565098660771, 863.2471203718039, 395.25...   
 3543  [-124.99925209772232, -132.89439224022635, -14...   
 
                                               N_channel  \
 0     [8442.890139867315, -9601.09437584314, -2554.0...   
 1     [332.7531817009409, -31.09784131623121, -415.9...   
 2     [244.167194513082, 166.32038054955467, 130.473...   
 3     [65.76865560730138, 37.82220237

In [None]:
# frames_pre = frames_N_classes(df_pre, num_classes, pre_or_post="pre")
# frames_post = frames_N_classes(df_post, num_classes, pre_or_post="post")


In [None]:


df, X_train, y_train, index_train, X_val, y_val, index_val, X_test, y_test, index_test=u.train_val_test_split(df, train_percentage=0.70, val_percentage=0.10, test_percentage=0.20,force_in_test=force_traces_in_test, split_random=trValTest_split_rnd)
batch_size = 32
tr_dl = u.create_dataloader(X=X_train, y=y_train, index=index_train,target_dataset="train_dataset", batch_size=batch_size)
val_dl = u.create_dataloader(X=X_val, y=y_val, index=index_val,target_dataset="val_dataset", batch_size=batch_size)
test_dl = u.create_dataloader(X=X_test, y=y_test, index=index_test,target_dataset="test_dataset", batch_size=batch_size)


NameError: name 'df' is not defined