# SORT SUNSET BAY HYDROPHONE CHUNKS FROM AWS
## Isabelle Brandicourt, 10-31-2024

In [11]:
from datetime import datetime, timedelta
import pytz
import os
import boto3
from botocore.config import Config
from botocore import UNSIGNED
import pandas as pd
import openpyxl
import subprocess
import re
import ffmpeg
import glob
from pathlib import Path
import shutil

### Step 1: open portal to AWS

Connects anonymously to the Orcasound AWS S3 bucket and the Sunset Bay hydrophone node, then lists all of the sub-folders (called "buckets" here) whose Unix-timestamp names are on or after a chosen starting value (`start_buck`).

In [12]:
# Set up the S3 client with unsigned configuration (anonymous access, no
# AWS credentials are sent with the requests).
s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))

bucket_name = 'audio-orcasound-net'
prefix = 'rpi_sunset_bay/hls/'
directories, num_directories = [], []
files, buckets = [], []
# Only sub-folders whose numeric name is >= this value are kept in `buckets`.
start_buck = 1728111618

# Walk every page of the listing in a single pass. One list_objects_v2 call
# returns a limited number of entries, so the paginator is needed to see all
# of the sub-folders and files under the prefix.
paginator = s3_client.get_paginator('list_objects_v2')
operation_parameters = {
    'Bucket': bucket_name,
    'Prefix': prefix,
    'Delimiter': '/',
}
for page in paginator.paginate(**operation_parameters):
    # Pseudo-folders directly under the prefix.
    for prefix_info in page.get('CommonPrefixes', []):
        dir_name = prefix_info['Prefix']
        # The folder name is the path component before the trailing '/'.
        folder = dir_name.split('/')[-2]
        if not folder.isdigit():
            # Skip folders that are not named with a plain integer instead of
            # crashing on int(); directories/num_directories stay parallel.
            continue
        dir_number = int(folder)
        directories.append(dir_name)
        num_directories.append(dir_number)
        if dir_number >= start_buck:
            buckets.append(dir_number)

    # Objects stored directly under the prefix (not inside a sub-folder).
    for obj in page.get('Contents', []):
        files.append(obj['Key'])


### Step 2: Read your spreadsheet to pull the specific chunks of time that you're interested in looking at.

Your spreadsheet (an Excel `.xlsx` file, read with `pd.read_excel`) needs to have a column titled: date_time_pstpdt

That date_time_pstpdt column must have this format: YYYY-MM-DD_HH.MM.SS

In [19]:
file_path = 'orcasound_entries_visuals.xlsx'
orcasound_entries = pd.read_excel(file_path)
#buckets = orcasound_entries['aws_bucket'].unique()
#sorted_by_bucket = {name: group for name, group in orcasound_entries.groupby('aws_bucket')}

# Parse the date_time column. The strings are parsed as naive wall-clock
# times (no time-zone conversion is applied here), so the derived 'unix'
# column is a whole-second sort key rather than a true UTC epoch value.
parsed_times = pd.to_datetime(orcasound_entries['date_time_pstpdt'], format='%Y-%m-%d_%H.%M.%S')

# Whole seconds since 1970-01-01, computed with Timestamp/Timedelta arithmetic
# so the result does not depend on the platform's default integer width or on
# the datetime resolution (ns/us/s) that pandas chose when parsing.
orcasound_entries['unix'] = (parsed_times - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

# Sort the whole table chronologically by that key.
orcasound_entries = orcasound_entries.sort_values(by='unix', ascending=True)

### Step 3: Get all of your functions sorted that you will need later!

#### choose_bucket(date, dur):
Needs the specific date_time starting value and will then calculate the end time based on the input duration variable. You can either add a duration column into the CSV file to get a different duration for each start time (will need to adjust the code to pull that and save it as "duration" in the next code chunk) or use the pre-set duration variable.

Returns the bucket that you need to pull from, the start and end times, and the numbers of the first and last liveXXX.ts files to download from that bucket.

#### pull_ts_files(use_bucket, start_time, start_live, end_live)
A continuation of the choose_bucket function, this one needs all of those outputs to download the selected range of ts files that correspond with the given date and time.

Returns the name of the temporary folder created to hold the downloaded ts files.

#### concate_ts_files(local_folder, chunk_name):
Reads all of the files that were just downloaded, concatenates them into one file to store for processing, and then deletes the temporary download folder.

#### convert_ts_to_wav(chunk):
Takes the concatenated ts file and converts it to a wav file. The ts file is read from the `ts` folder and the wav file is saved in the `wavs` folder in your working directory.


In [17]:
# takes the date and time in pst and returns the bucket that contains that time
def choose_bucket(date, dur):
    """Find the bucket and live-segment range that cover a requested time window.

    Args:
        date: Window start as a 'YYYY-MM-DD_HH.MM.SS' string, interpreted as
            America/Los_Angeles (PST/PDT) wall-clock time.
        dur: Window length in seconds.

    Returns:
        A tuple ``(use_bucket, start_time, end_time, start_live, end_live)``:
        the chosen bucket number, the timezone-aware start and end datetimes,
        and the first and last ``live<N>.ts`` segment numbers to download.

    Raises:
        ValueError: If ``date`` does not match the expected format, or if no
            entry in the module-level ``buckets`` list is earlier than the
            requested start time.
    """
    # Localize the parsed start time to PST/PDT.
    pst = pytz.timezone('America/Los_Angeles')
    start_time = pst.localize(datetime.strptime(date, '%Y-%m-%d_%H.%M.%S'))

    # Derive the end from the aware start time so that windows crossing
    # midnight (or a month/year boundary) land on the correct calendar day.
    # normalize() corrects the UTC offset if the window crosses a DST change.
    end_time = pst.normalize(start_time + timedelta(seconds=dur))
    print('Starting time:', start_time, '\nEnding time:', end_time)

    # Identify the bucket from the unix start time: the latest bucket whose
    # number is strictly below the start time.
    start_unix_time = int(start_time.timestamp())
    end_unix_time = int(end_time.timestamp())
    earlier_buckets = [val for val in buckets if val < start_unix_time]
    if not earlier_buckets:
        raise ValueError(
            f'No bucket starts before {date} (unix {start_unix_time}); '
            'check the date or lower start_buck.'
        )
    use_bucket = int(max(earlier_buckets))

    # Segment numbers are computed as elapsed seconds / 10 (assumes ~10 s per
    # liveN.ts segment - TODO confirm), padded by one segment on each side.
    # The lower bound is clamped at 0 so a negative segment is never requested.
    start_live = max(0, round((start_unix_time - use_bucket)/10)-1)
    end_live = round((end_unix_time - use_bucket)/10)+1
    print(f'Using the bucket {use_bucket} with a live range of .{start_live} to .{end_live}')

    return use_bucket, start_time, end_time, start_live, end_live


def pull_ts_files(use_bucket, start_time, start_live, end_live):
    """Download a range of live<N>.ts segments from one bucket into a new local folder.

    Args:
        use_bucket: Bucket (sub-folder) number under 'rpi_sunset_bay/hls/'.
        start_time: Start of the chunk; only used to name the local folder.
        start_live: Number of the first segment to download (inclusive).
        end_live: Number of the last segment to download (inclusive).

    Returns:
        The name of the local folder that holds the downloaded segments.
    """
    # str(datetime) contains ':' and ' ', which are not valid in Windows
    # folder names, so swap them for filesystem-safe characters.
    stamp = str(start_time).replace(':', '.').replace(' ', '_')
    loc_fol = f'{use_bucket}_{stamp}'
    os.makedirs(loc_fol, exist_ok=True)

    # pulls the live files identified and saves them in a folder named with the bucket and the start time of the chunk
    for s in range(start_live, end_live + 1):
        aws_filename = f'live{s}.ts'
        s3_key = f'rpi_sunset_bay/hls/{use_bucket}/{aws_filename}'
        download_path = os.path.join(loc_fol, aws_filename)
        try:
            s3_client.download_file(bucket_name, s3_key, download_path)
        except Exception as e:
            # Best effort: a missing segment is reported and skipped so the
            # remaining segments in the range are still downloaded.
            print(f"An error occurred in downloading the audio files from AWS: {e}")

    return loc_fol


def _segment_sort_key(path):
    """Sort key that orders '<name><N>.ts' paths by the integer N instead of as text."""
    match = re.search(r'(\d+)\.ts$', os.path.basename(path))
    if match:
        return (0, int(match.group(1)), path)
    # Files without a trailing number sort after the numbered ones, by name.
    return (1, 0, path)


def concate_ts_files(local_folder, chunk_name):
    """Join every .ts file in ``local_folder`` into ``ts/<chunk_name>.ts``.

    Segments are appended in numeric order of their trailing number, so
    live9.ts comes before live10.ts. After the combined file is written,
    ``local_folder`` and its contents are deleted.

    Args:
        local_folder: Folder containing the downloaded .ts segments.
        chunk_name: Base name (without extension) for the combined file.
    """
    output_file = os.path.join("ts", f"{chunk_name}.ts")
    # Make sure the output folder exists so the function works on its own.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # A plain text sort would put 'live10.ts' before 'live9.ts' and scramble
    # the audio whenever the range crosses a digit-count boundary.
    ts_files = sorted(glob.glob(os.path.join(local_folder, "*.ts")), key=_segment_sort_key)

    with open(output_file, 'wb') as outfile:
        for ts_file in ts_files:
            with open(ts_file, 'rb') as infile:
                # Stream the bytes across rather than loading each file whole.
                shutil.copyfileobj(infile, outfile)
    shutil.rmtree(local_folder)


def convert_ts_to_wav(chunk):
    """Convert ``ts/<chunk>.ts`` into ``wavs/<chunk>.wav`` with ffmpeg.

    The output is 16-bit PCM, 2 channels, 44100 Hz. Nothing is done (apart
    from printing a message) when the input is missing or the output already
    exists.

    Args:
        chunk: Base name (without extension) shared by the .ts and .wav files.

    Returns:
        None.
    """
    in_ts = f"ts/{chunk}.ts"
    out_wav = os.path.join("wavs", f"{chunk}.wav")

    if not os.path.isfile(in_ts):
        print(f"File not found: {in_ts}. Skipping...")
        return

    if os.path.isfile(out_wav):
        print(f"Output file already exists: {out_wav}. Skipping...")
        return

    try:
        ffmpeg.input(in_ts).output(out_wav, acodec='pcm_s16le', ac=2, ar='44100').run(quiet=True, overwrite_output=True)
    except ffmpeg.Error as e:
        # quiet=True captures ffmpeg's stderr, and the exception text alone
        # only points at it, so print the captured output when there is any.
        detail = e.stderr.decode(errors='replace').strip() if e.stderr else e
        print(f"Error converting {in_ts}: {detail}")
        # Remove a partially written wav so a later run is not skipped by the
        # "already exists" check above.
        if os.path.isfile(out_wav):
            os.remove(out_wav)

### Step 4a: Pull a singular file, manually identifying the start date and time, as well as the duration. Use this for testing only.

In [15]:
# Enter the desired time in PST/PDT, as listed on Orcasound's online interface.
year = "2025"
month = "01"
day = "12"
start_hour = "21"
start_min = "07"
start_sec = "38"
# Assemble the 'YYYY-MM-DD_HH.MM.SS' string that choose_bucket expects.
date = '-'.join([year, month, day]) + '_' + '.'.join([start_hour, start_min, start_sec])
duration = 30

# Output folders for the concatenated .ts files and the converted .wav files.
for out_dir in ("ts", "wavs"):
    Path(out_dir).mkdir(exist_ok=True)

# Locate the bucket and segment range, then download the segments.
use_bucket, start_time, end_time, start_live, end_live = choose_bucket(date, duration)
local_folder = pull_ts_files(use_bucket, start_time, start_live, end_live)

# Name the chunk '<start date and time>_to_<end time>', then build and convert it.
chunk_name = f"{start_time.strftime('%Y-%m-%d_%H.%M.%S')}_to_{end_time.strftime('%H.%M.%S')}"
concate_ts_files(local_folder, chunk_name)
convert_ts_to_wav(chunk_name)

Starting time: 2025-01-12 21:07:38-08:00 
Ending time: 2025-01-12 21:08:08-08:00
Using the bucket 1736668818 with a live range of .7603 to .7608


### Step 4b: Use your orcasound_entries spreadsheet to pull files from a list of start times. Make sure the file has a date_time_pstpdt column in the YYYY-MM-DD_HH.MM.SS format.

In [20]:
# Every chunk pulled in this batch has the same length, in seconds.
duration = 30

# Output folders for the concatenated .ts files and the converted .wav files.
for out_dir in ("ts", "wavs"):
    Path(out_dir).mkdir(exist_ok=True)

# Process the start times in the (already sorted) order of the table.
for date in orcasound_entries['date_time_pstpdt']:
    print(date)

    # Locate the bucket and segment range, then download the segments.
    use_bucket, start_time, end_time, start_live, end_live = choose_bucket(date, duration)
    local_folder = pull_ts_files(use_bucket, start_time, start_live, end_live)

    # Name the chunk '<start date and time>_to_<end time>', then build and convert it.
    chunk_name = f"{start_time.strftime('%Y-%m-%d_%H.%M.%S')}_to_{end_time.strftime('%H.%M.%S')}"
    concate_ts_files(local_folder, chunk_name)
    convert_ts_to_wav(chunk_name)


2024-10-07_17.55.48
Starting time: 2024-10-07 17:55:48-07:00 
Ending time: 2024-10-07 17:56:18-07:00
Using the bucket 1728284418 with a live range of .6452 to .6457
2024-10-20_18.26.26
Starting time: 2024-10-20 18:26:26-07:00 
Ending time: 2024-10-20 18:26:56-07:00
Using the bucket 1729407621 with a live range of .6635 to .6641
2024-10-20_18.32.53
Starting time: 2024-10-20 18:32:53-07:00 
Ending time: 2024-10-20 18:33:23-07:00
Using the bucket 1729407621 with a live range of .6674 to .6679
2024-10-20_18.37.54
Starting time: 2024-10-20 18:37:54-07:00 
Ending time: 2024-10-20 18:38:24-07:00
Using the bucket 1729407621 with a live range of .6704 to .6709
2024-10-20_18.44.03
Starting time: 2024-10-20 18:44:03-07:00 
Ending time: 2024-10-20 18:44:33-07:00
Using the bucket 1729407621 with a live range of .6741 to .6746
2024-10-20_18.48.13
Starting time: 2024-10-20 18:48:13-07:00 
Ending time: 2024-10-20 18:48:43-07:00
Using the bucket 1729407621 with a live range of .6766 to .6771
2024-10-20