In [5]:
import os
import datetime
from collections import defaultdict

root_dir = "/home/gridsan/mknuth/01_Seismic_Wave_Data_Prediction/01_Data/01_Seismic_Wave_Data"


def expected_dates(year):
    start_date = datetime.date(year, 1, 1)
    end_date = datetime.date(year, 12, 31)
    return set((start_date + datetime.timedelta(days=i)).strftime('%Y-%m-%d') 
               for i in range((end_date - start_date).days + 1))

def validate_waveform_files(root_dir):
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    all_channels_found = defaultdict(set)
    issues = []

    print(f"🔍 Scanning absolute directory: {root_dir}\n")
    
    for root, _, files in os.walk(root_dir):
        abs_root = os.path.abspath(root)
        parts = abs_root.replace(root_dir, "").strip(os.sep).split(os.sep)

        #print(f"📂 Exploring: {abs_root} | Depth: {len(parts)} | Parts: {parts}")

        if len(parts) != 3:
            continue  # skip folders that are not year/station/channel
        

        year, station, channel = parts
        all_channels_found[(year, station)].add(channel)

        for file in files:
            if file.endswith(".mseed") and not file.endswith("_processed.mseed"):
                try:
                    date_str = file.split('_')[-1].replace('.mseed', '')
                    results[year][station][channel].add(date_str)
                    #print(f"    📄 Found file: {file} (Date: {date_str})")
                except Exception as e:
                    print(f"    ⚠️ Failed to parse file: {file}, Error: {e}")

    for (year, station), channels in all_channels_found.items():
        year_int = int(year)
        expected = expected_dates(year_int)
        print(f"\n🗓️  Year {year}, Station {station} — expecting {len(expected)} days")

        for ch in ["BHE", "BHN", "BHZ"]:
            found_dates = results[year][station].get(ch, set())
            print(f"    🔄 Checking channel {ch}: Found {len(found_dates)} / Expected {len(expected)} days")

            missing = expected - found_dates
            if missing:
                issues.append({
                    "year": year,
                    "station": station,
                    "channel": ch,
                    "missing_days": sorted(missing)
                })
                print(f"    🚨 Missing {len(missing)} days")
            else:
                print(f"    ✅ Complete")

    if not issues:
        print("\n✅ All stations have complete channel coverage for all days.")
    else:
        print("\nSummary of Issues:")
        for issue in issues:
            print(f"🚨 {issue['year']} - {issue['station']} - {issue['channel']}: {len(issue['missing_days'])} days missing.")
            print(f"    Example missing dates: {', '.join(issue['missing_days'][:5])} ...")

validate_waveform_files(root_dir)


🔍 Scanning absolute directory: /home/gridsan/mknuth/01_Seismic_Wave_Data_Prediction/01_Data/01_Seismic_Wave_Data


🗓️  Year 2020, Station SDD — expecting 366 days
    🔄 Checking channel BHE: Found 366 / Expected 366 days
    ✅ Complete
    🔄 Checking channel BHN: Found 366 / Expected 366 days
    ✅ Complete
    🔄 Checking channel BHZ: Found 366 / Expected 366 days
    ✅ Complete

🗓️  Year 2020, Station IPT — expecting 366 days
    🔄 Checking channel BHE: Found 170 / Expected 366 days
    🚨 Missing 196 days
    🔄 Checking channel BHN: Found 170 / Expected 366 days
    🚨 Missing 196 days
    🔄 Checking channel BHZ: Found 170 / Expected 366 days
    🚨 Missing 196 days

🗓️  Year 2020, Station CYP — expecting 366 days
    🔄 Checking channel BHE: Found 366 / Expected 366 days
    ✅ Complete
    🔄 Checking channel BHN: Found 366 / Expected 366 days
    ✅ Complete
    🔄 Checking channel BHZ: Found 366 / Expected 366 days
    ✅ Complete

🗓️  Year 2020, Station BHP — expecting 366 days
    🔄 Che

In [4]:
import os
import logging
from obspy import read, Stream

# Correct logging configuration
logging.shutdown()
logging.basicConfig(filename='combine_streams.log', level=logging.INFO, format='%(asctime)s - %(message)s', filemode='w', force=True)

def combine_streams(num_days):
    """
    Combines processed daily stream files of the same channel into multi-day streams.
    Saves the combined stream files in a folder named "Combined_Processed_Streams_numberofdays".
    """
    
    base_dir = "01_Data/01_Seismic_Wave_Data"
    output_dir = os.path.join(base_dir, f"Combined_Processed_Streams_{num_days}")
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created directory: {output_dir}")

    # Define channels (BHE, BHN, BHZ)
    channels = ["BHE", "BHN", "BHZ"]
    # Define years 
    years = ["2020", "2021","2022", "2023"]

    for channel in channels:
        file_list = []
        print(f"Processing channel: {channel}")

        # Collect all processed.mseed files for the channel over all years
        for year in years:
            print(f"Processing year: {year}")
            year_dir = os.path.join(base_dir, year)
            if os.path.isdir(year_dir):
                channel_dir = os.path.join(year_dir, channel)
                if os.path.isdir(channel_dir):
                    for file in os.listdir(channel_dir):
                        if file.endswith("_processed.mseed"):
                            file_path = os.path.join(channel_dir, file)
                            file_list.append(file_path)

        # Sort the file list chronologically
        file_list.sort()

        # Combine the files in chunks of num_days
        for i in range(0, len(file_list), num_days):
            group_files = file_list[i:i + num_days]

            if len(group_files) == 0:
                continue

            try:
                # Read and merge the files with interpolation to fill gaps
                combined_stream = Stream()
                for file in group_files:
                    st = read(file)
                    combined_stream += st

                # Merge traces using interpolation for gaps
                combined_stream.merge(method=1, fill_value='interpolate')

                # Determine start and end dates for naming
                start_date = os.path.basename(group_files[0]).split('_')[2].split('.')[0]
                end_date = os.path.basename(group_files[-1]).split('_')[2].split('.')[0]

                # Save the combined stream
                output_filename = f"{channel}_{start_date}_to_{end_date}.mseed"
                output_path = os.path.join(output_dir, output_filename)
                combined_stream.write(output_path, format='MSEED')
                logging.info(f"Saved combined stream: {output_filename}")

            except Exception as e:
                logging.error(f"Error processing file group starting with {group_files[0]}: {str(e)}")

# Example usage:
combine_streams(10)  # Combine daily processed files into 7-day files

Created directory: 01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_10
Processing channel: BHE
Processing year: 2020
Processing year: 2021
Processing year: 2022
Processing year: 2023
Processing channel: BHN
Processing year: 2020
Processing year: 2021
Processing year: 2022
Processing year: 2023
Processing channel: BHZ
Processing year: 2020
Processing year: 2021
Processing year: 2022
Processing year: 2023


In [None]:
import os
import logging
from obspy import read, Stream
from typing import Optional

# ------------------------------------------------------------------
#  CONFIGURE LOGGING (fresh file on every run)
# ------------------------------------------------------------------
logging.shutdown()
logging.basicConfig(
    filename="combine_streams.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filemode="w",
    force=True,
)

def combine_streams(window_len: int, shift: Optional[int] = None) -> None:
    """
    Combine daily processed *.mseed files into multi-day streams.

    Parameters
    ----------
    window_len : int
        Number of consecutive days to merge per output file.
    shift : int, optional
        Number of days to advance the window before the next merge.
        • shift == window_len  →  non-overlapping windows (default)
        • shift  < window_len  →  overlapping / sliding windows
        • shift  > window_len  →  gapped windows
    """
    if shift is None:
        shift = window_len
    if shift <= 0 or window_len <= 0:
        raise ValueError("`window_len` and `shift` must be positive integers")

    base_dir  = "01_Data/01_Seismic_Wave_Data"
    channels  = ("BHE", "BHN", "BHZ")
    out_dir   = os.path.join(base_dir, f"Combined_Processed_Streams_{window_len}")
    os.makedirs(out_dir, exist_ok=True)
    print(f"Output directory: {out_dir}\n")

    for year in sorted(os.listdir(base_dir)):
        year_path = os.path.join(base_dir, year)
        if not os.path.isdir(year_path):
            continue

        for station in sorted(os.listdir(year_path)):
            station_path = os.path.join(year_path, station)
            if not os.path.isdir(station_path):
                continue

            for channel in channels:
                chan_path = os.path.join(station_path, channel)
                if not os.path.isdir(chan_path):
                    continue      # channel missing for this station-year

                # ------------------------------------------------------
                #  Gather daily files for this station-year-channel
                # ------------------------------------------------------
                daily_files = [
                    os.path.join(chan_path, f)
                    for f in os.listdir(chan_path)
                    if f.endswith("_processed.mseed")
                ]
                daily_files.sort()            # YYYY-MM-DD in name → lexical = chrono
                n_files = len(daily_files)
                if n_files == 0:
                    continue

                # ------------------------------------------------------
                #  Plan windows (sliding / gapped / non-overlapping)
                # ------------------------------------------------------
                start_idxs   = range(0, n_files - window_len + 1, shift)
                start_idxs_list = list(start_idxs)
                n_windows    = len(start_idxs_list)
                last_used    = (start_idxs_list[-1] + window_len) if n_windows else 0
                leftovers    = n_files - last_used


                print(
                    f"Processing {year}  {station}  {channel}"
                    f"→ {n_files} files   "
                    f"({n_windows} window(s) of {window_len} d, shift {shift} d, "
                    f"{leftovers} leftover)"
                )

                # ------------------------------------------------------
                #  Merge each window
                # ------------------------------------------------------
                for w, start in enumerate(start_idxs, 1):
                    group_files = daily_files[start : start + window_len]
                    try:
                        st_merged = Stream()
                        for fp in group_files:
                            st_merged += read(fp)

                        st_merged.merge(method=1, fill_value="interpolate")

                        first_date = os.path.basename(group_files[0]).split("_")[2]
                        last_date  = os.path.basename(group_files[-1]).split("_")[2]

                        out_name = (
                            f"{station}_{channel}_{first_date}_to_{last_date}.mseed"
                        )
                        st_merged.write(os.path.join(out_dir, out_name), format="MSEED")
                        logging.info("Saved %s", out_name)

                    except Exception as exc:
                        logging.error(
                            "Error combining %s – %s: %s",
                            group_files[0],
                            group_files[-1],
                            exc,
                        )

                # ------------------------------------------------------
                #  Summary print for this triplet
                # ------------------------------------------------------
                if n_windows:
                    print(
                        f"  ✔  Wrote {n_windows} combined streams; "
                        f"skipped {leftovers} leftover day(s)\n"
                    )
                else:
                    print("  ⚠  Not enough data for a single window; nothing saved\n")

if __name__ == "__main__":
    # Examples
    # combine_streams(window_len=10)            # original behaviour (10-day, no overlap)
    # combine_streams(window_len=10, shift=5)   # 10-day windows every 5 days
    combine_streams(window_len=20, shift=5)


Output directory: 01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_20

Processing 2020  PASC  BHE→ 366 files   (70 window(s) of 20 d, shift 5 d, 1 leftover)


In [None]:

import os
import logging
from obspy import read, Stream, Trace
from typing import Optional, Dict
import numpy as np

def combine_streams(window_len: int, shift: Optional[int] = None) -> None:
    """
    Combine daily processed *.mseed files into multi-day streams.
    Creates a single trace spanning the entire window by handling location code
    conflicts - ensures all traces have the same location code before merging.

    Parameters
    ----------
    window_len : int
        Number of consecutive days to merge per output file.
    shift : int, optional
        Number of days to advance the window before the next merge.
        • shift == window_len  →  non-overlapping windows (default)
        • shift  < window_len  →  overlapping / sliding windows
        • shift  > window_len  →  gapped windows
    """
    if shift is None:
        shift = window_len
    if shift <= 0 or window_len <= 0:
        raise ValueError("`window_len` and `shift` must be positive integers")

    base_dir  = "/home/gridsan/mknuth/01_Seismic_Wave_Data_Prediction/01_Data/01_Seismic_Wave_Data"
    channels  = ("BHE", "BHN", "BHZ")
    out_dir   = os.path.join(base_dir, f"Combined_Processed_Streams_{window_len}_new_2025_07_28")
    os.makedirs(out_dir, exist_ok=True)
    print(f"Output directory: {out_dir}\n")

    for year in sorted(os.listdir(base_dir)):
        print(year)
        year_path = os.path.join(base_dir, year)
        if not os.path.isdir(year_path):
            continue

        for station in sorted(os.listdir(year_path)):
            station_path = os.path.join(year_path, station)
            if not os.path.isdir(station_path):
                continue

            for channel in channels:
                chan_path = os.path.join(station_path, channel)
                if not os.path.isdir(chan_path):
                    continue      # channel missing for this station-year

                # ------------------------------------------------------
                #  Gather daily files for this station-year-channel
                # ------------------------------------------------------
                daily_files = [
                    os.path.join(chan_path, f)
                    for f in os.listdir(chan_path)
                    if f.endswith("_processed.mseed")
                ]
                daily_files.sort()            # YYYY-MM-DD in name → lexical = chrono
                n_files = len(daily_files)
                if n_files == 0:
                    continue

                # ------------------------------------------------------
                #  Plan windows (sliding / gapped / non-overlapping)
                # ------------------------------------------------------
                start_idxs   = range(0, n_files - window_len + 1, shift)
                n_windows    = len(list(start_idxs))
                last_used    = (list(start_idxs)[-1] + window_len) if n_windows else 0
                leftovers    = n_files - last_used

                print(
                    f"Processing {year}  {station}  {channel}  "
                    f"→ {n_files} files   "
                    f"({n_windows} window(s) of {window_len} d, shift {shift} d, "
                    f"{leftovers} leftover)"
                )

                # ------------------------------------------------------
                #  Merge each window
                # ------------------------------------------------------
                for w, start in enumerate(start_idxs, 1):
                    group_files = daily_files[start : start + window_len]
                    try:
                        # First, determine the dominant location code
                        loc_count: Dict[str, int] = {}
                        all_traces = []
                        
                        for fp in group_files:
                            st = read(fp)
                            for tr in st:
                                all_traces.append(tr)
                                # Handle empty location field (None, empty string, etc.)
                                loc = tr.stats.location if tr.stats.location else ""
                                loc_count[loc] = loc_count.get(loc, 0) + 1
                        
                        # Find most common location
                        if not loc_count:
                            raise ValueError("No traces found in window files")
                            
                        dominant_loc = max(loc_count, key=loc_count.get)
                        # Format location for logging - show empty as "(empty)"
                        display_loc = dominant_loc if dominant_loc else "(empty)"
                        logging.info(f"Location counts: {loc_count}, using dominant location: {display_loc}")
                        
                        # Set all traces to the dominant location
                        st_uniform = Stream()
                        for tr in all_traces:
                            # Create a new trace with consistent location
                            new_tr = Trace(data=tr.data)
                            # Copy all stats
                            new_tr.stats = tr.stats.copy()
                            # Override location with dominant one
                            # Empty strings are normalized to empty string
                            new_tr.stats.location = dominant_loc if dominant_loc else ""
                            st_uniform.append(new_tr)
                        
                        # Now we can merge as usual
                        st_merged = st_uniform.merge(method=1, fill_value="interpolate")

                        first_date = os.path.basename(group_files[0]).split("_")[2]
                        last_date  = os.path.basename(group_files[-1]).split("_")[2]

                        out_name = (
                            f"{station}_{channel}_{first_date}_to_{last_date}.mseed"
                        )
                        
                        st_merged.write(os.path.join(out_dir, out_name), format="MSEED")
                        logging.info(
                            "Saved %s (unified location: %s, from %d traces)", 
                            out_name, display_loc, len(all_traces)
                        )

                    except Exception as exc:
                        logging.error(
                            "Error combining %s – %s: %s",
                            group_files[0],
                            group_files[-1],
                            exc,
                        )

                # ------------------------------------------------------
                #  Summary print for this triplet
                # ------------------------------------------------------
                if n_windows:
                    print(
                        f"  ✔  Wrote {n_windows} combined streams; "
                        f"skipped {leftovers} leftover day(s)\n"
                    )
                else:
                    print("  ⚠  Not enough data for a single window; nothing saved\n")
                    
                    
                    
                    
                    
if __name__ == "__main__":
    # Examples
    # combine_streams(window_len=10)            # original behaviour (10-day, no overlap)
    # combine_streams(window_len=10, shift=5)   # 10-day windows every 5 days
    combine_streams(window_len=30, shift=7)


Output directory: /home/gridsan/mknuth/01_Seismic_Wave_Data_Prediction/01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30_new_2025_07_28

.ipynb_checkpoints
2020
2021
2022
2023
Processing 2023  AGM  BHE  → 365 files   (48 window(s) of 30 d, shift 7 d, 6 leftover)
  ✔  Wrote 48 combined streams; skipped 6 leftover day(s)

Processing 2023  AGM  BHN  → 365 files   (48 window(s) of 30 d, shift 7 d, 6 leftover)
  ✔  Wrote 48 combined streams; skipped 6 leftover day(s)

Processing 2023  AGM  BHZ  → 365 files   (48 window(s) of 30 d, shift 7 d, 6 leftover)
  ✔  Wrote 48 combined streams; skipped 6 leftover day(s)

Processing 2023  BAI  BHE  → 365 files   (48 window(s) of 30 d, shift 7 d, 6 leftover)
  ✔  Wrote 48 combined streams; skipped 6 leftover day(s)

Processing 2023  BAI  BHN  → 365 files   (48 window(s) of 30 d, shift 7 d, 6 leftover)
  ✔  Wrote 48 combined streams; skipped 6 leftover day(s)

Processing 2023  BAI  BHZ  → 365 files   (48 window(s) of 30 d, shift 7 d, 6 leftover

In [None]:
import os
import torch
import pandas as pd
from datetime import datetime
from obspy import read



def get_max_magnitude_in_next_30_days(end_date, earthquake_csv):
    """
    Get the maximum earthquake magnitude in the next 30 days.
    """
    future_end_date = end_date + pd.Timedelta(days=30)
    df = pd.read_csv(earthquake_csv, parse_dates=["Time"])
    mask = (df["Time"] >= end_date) & (df["Time"] <= future_end_date)
    future_earthquakes = df[mask]

    if future_earthquakes.empty:
        return 0.0  # No event found

    max_magnitude = future_earthquakes["Converted Magnitude"].max()
    return max_magnitude

def create_train_dataset(earthquake_csv):
    """
    Precompute the train dataset with file paths and labels.
    Returns a list of dictionaries containing file paths and labels.
    """
    train_data = []

    # Separate lists for each channel
    bhe_files = []
    bhn_files = []
    bhz_files = []

    # Access the folder and categorize files
    for file_name in os.listdir(combined_stream_dir
                               
                               
                               ):
        if file_name.startswith("BHE"):
            bhe_files.append(file_name)
        elif file_name.startswith("BHN"):
            bhn_files.append(file_name)
        elif file_name.startswith("BHZ"):
            bhz_files.append(file_name)

    # Sort each list based on the start date extracted from the filename
    # Alphabetically sort the files
    bhe_files.sort()
    bhn_files.sort()
    bhz_files.sort()

    # Combine the sorted lists to create triplets
    for bhe, bhn, bhz in zip(bhe_files, bhn_files, bhz_files):
        try:
            # Collect file paths for the current triplet
            file_paths = [
                os.path.join(combined_stream_dir, bhe),
                os.path.join(combined_stream_dir, bhn),
                os.path.join(combined_stream_dir, bhz)
            ]

            # Extract the end date from the filename
            end_date_str = bhe[-16:-6]  # Example: '2023-11-26'
            end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

            # Get the label (maximum magnitude in the next 30 days)
            max_magnitude = get_max_magnitude_in_next_30_days(end_date, earthquake_csv)

            # Append the file paths and label to the dataset
            train_data.append({
                "file_paths": file_paths,
                "label": max_magnitude
            })

            print(f"Processing triplet: {bhe}, {bhn}, {bhz}")

        except Exception as e:
            print(f"Error processing triplet: {bhe}, {bhn}, {bhz}, Error: {e}")

    print(f"Number of training samples: {len(train_data)}")
    return train_data


# Base directory for combined streams
combined_stream_dir = "/home/gridsan/mknuth/01_Seismic_Wave_Data_Prediction/01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30_new_2025_07_28"
channels = ["BHE", "BHN", "BHZ"]
earthquake_csv = "/home/gridsan/mknuth/01_Seismic_Wave_Data_Prediction/01_Data/02_Seismic_Event_Data/earthquake_features.parquet"
train_data = create_train_dataset(earthquake_csv)
print(f"Number of training samples: {len(train_data)}")
train_data


Processing triplet: BHE_2022-01-01_to_2022-01-30.mseed, BHN_2022-01-01_to_2022-01-30.mseed, BHZ_2022-01-01_to_2022-01-30.mseed
Processing triplet: BHE_2022-01-31_to_2022-03-01.mseed, BHN_2022-01-31_to_2022-03-01.mseed, BHZ_2022-01-31_to_2022-03-01.mseed
Processing triplet: BHE_2022-03-02_to_2022-03-31.mseed, BHN_2022-03-02_to_2022-03-31.mseed, BHZ_2022-03-02_to_2022-03-31.mseed
Processing triplet: BHE_2022-04-01_to_2022-04-30.mseed, BHN_2022-04-01_to_2022-04-30.mseed, BHZ_2022-04-01_to_2022-04-30.mseed
Processing triplet: BHE_2022-05-01_to_2022-05-30.mseed, BHN_2022-05-01_to_2022-05-30.mseed, BHZ_2022-05-01_to_2022-05-30.mseed
Processing triplet: BHE_2022-05-31_to_2022-06-29.mseed, BHN_2022-05-31_to_2022-06-29.mseed, BHZ_2022-05-31_to_2022-06-29.mseed
Processing triplet: BHE_2022-06-30_to_2022-07-29.mseed, BHN_2022-06-30_to_2022-07-29.mseed, BHZ_2022-06-30_to_2022-07-29.mseed
Processing triplet: BHE_2022-07-30_to_2022-08-28.mseed, BHN_2022-07-30_to_2022-08-28.mseed, BHZ_2022-07-30_to_2

[{'file_paths': ['01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHE_2022-01-01_to_2022-01-30.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHN_2022-01-01_to_2022-01-30.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHZ_2022-01-01_to_2022-01-30.mseed'],
  'label': 4.218933177022274},
 {'file_paths': ['01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHE_2022-01-31_to_2022-03-01.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHN_2022-01-31_to_2022-03-01.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHZ_2022-01-31_to_2022-03-01.mseed'],
  'label': 4.38305978898007},
 {'file_paths': ['01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHE_2022-03-02_to_2022-03-31.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHN_2022-03-02_to_2022-03-31.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_30/BHZ_2022-03-02_to_2022-03-31.mse

In [5]:
def extract_start_date(filename):
    """Extract the start date from the file name."""
    try:
        # Example filename: BHE_2022-01-01_to_2022-03-01.mseed
        start_date_str = filename.split('_')[1]
        start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
        print(start_date)
        return start_date
    except Exception as e:
        print(f"Error extracting date from filename {filename}: {e}")
        return None

def create_train_dataset(earthquake_csv):
    """
    Precompute the train dataset with file paths and labels.
    Returns a list of dictionaries containing file paths and labels.
    """
    train_data = []

    # Separate lists for each channel
    bhe_files = []
    bhn_files = []
    bhz_files = []

    # Access the folder and categorize files
    for file_name in os.listdir(combined_stream_dir):
        if file_name.startswith("BHE"):
            bhe_files.append(file_name)
        elif file_name.startswith("BHN"):
            bhn_files.append(file_name)
        elif file_name.startswith("BHZ"):
            bhz_files.append(file_name)

    # Sort each list based on the start date extracted from the filename
    bhe_files.sort(key=lambda x: extract_start_date(x))
    bhn_files.sort(key=lambda x: extract_start_date(x))
    bhz_files.sort(key=lambda x: extract_start_date(x))
    
    print(len(bhe_files))
    print(len(bhn_files))
    print(len(bhz_files))

    # Combine the sorted lists to create triplets
    for bhe, bhn, bhz in zip(bhe_files, bhn_files, bhz_files):
        try:
            # Collect file paths for the current triplet
            file_paths = [
                os.path.join(combined_stream_dir, bhe),
                os.path.join(combined_stream_dir, bhn),
                os.path.join(combined_stream_dir, bhz)
            ]

            # Extract the end date from the filename
            end_date_str = bhe[-16:-6]  # Example: '2023-11-26'
            end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

            # Get the label (maximum magnitude in the next 30 days)
            max_magnitude = get_max_magnitude_in_next_30_days(end_date, earthquake_csv)

            # Append the file paths and label to the dataset
            train_data.append({
                "file_paths": file_paths,
                "label": max_magnitude
            })

            print(f"Processing triplet: {bhe}, {bhn}, {bhz}")

        except Exception as e:
            print(f"Error processing triplet: {bhe}, {bhn}, {bhz}, Error: {e}")

    print(f"Number of training samples: {len(train_data)}")
    return train_data

earthquake_csv = "01_Data/02_Seismic_Event_Data/earthquake_event_data_PASC_2020_2024_preprocessed.csv"
train_data = create_train_dataset(earthquake_csv)
print(f"Number of training samples: {len(train_data)}")
train_data

2023-02-25 00:00:00
2023-10-23 00:00:00
2023-12-22 00:00:00
2022-10-28 00:00:00
2023-04-26 00:00:00
2022-01-01 00:00:00
2022-03-02 00:00:00
2022-08-29 00:00:00
2023-08-24 00:00:00
2022-12-27 00:00:00
2022-06-30 00:00:00
2023-06-25 00:00:00
2022-05-01 00:00:00
2023-10-23 00:00:00
2022-01-01 00:00:00
2022-03-02 00:00:00
2022-10-28 00:00:00
2023-08-24 00:00:00
2023-06-25 00:00:00
2023-02-25 00:00:00
2023-12-22 00:00:00
2022-06-30 00:00:00
2022-08-29 00:00:00
2023-04-26 00:00:00
2022-05-01 00:00:00
2022-12-27 00:00:00
2022-03-02 00:00:00
2022-10-28 00:00:00
2023-10-23 00:00:00
2022-05-01 00:00:00
2022-01-01 00:00:00
2023-08-24 00:00:00
2022-06-30 00:00:00
2023-06-25 00:00:00
2023-02-25 00:00:00
2022-12-27 00:00:00
2023-12-22 00:00:00
2023-04-26 00:00:00
2022-08-29 00:00:00
13
13
13
Processing triplet: BHE_2022-01-01_to_2022-03-01.mseed, BHN_2022-01-01_to_2022-03-01.mseed, BHZ_2022-01-01_to_2022-03-01.mseed
Processing triplet: BHE_2022-03-02_to_2022-04-30.mseed, BHN_2022-03-02_to_2022-04-30

[{'file_paths': ['01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHE_2022-01-01_to_2022-03-01.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHN_2022-01-01_to_2022-03-01.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHZ_2022-01-01_to_2022-03-01.mseed'],
  'label': 4.38305978898007},
 {'file_paths': ['01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHE_2022-03-02_to_2022-04-30.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHN_2022-03-02_to_2022-04-30.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHZ_2022-03-02_to_2022-04-30.mseed'],
  'label': 4.547186400937867},
 {'file_paths': ['01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHE_2022-05-01_to_2022-06-29.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHN_2022-05-01_to_2022-06-29.mseed',
   '01_Data/01_Seismic_Wave_Data/Combined_Processed_Streams_60/BHZ_2022-05-01_to_2022-06-29.mse