In [None]:
# Load packages
import os
import glob
from pathlib import Path
import shutil
import pandas as pd
from dotenv import load_dotenv

In [None]:
# Read the local file paths from the .env file into notebook-level variables
load_dotenv()

# Every value is None when its key is absent from the environment
data_dirs = os.environ.get('data_dirs')                # passed to list_files(); also prefixed to the .txt entries
data_dirs_parent = os.environ.get('data_dirs_parent')  # prefix used to rebuild source paths when copying
data_paths_txt = os.environ.get('data_paths_txt')      # newline separated .txt of file paths
prev_sample_dirs = os.environ.get('prev_sample_dirs')  # passed to list_files() to find earlier samples
new_sample_dir = os.environ.get('new_sample_dir')      # folder the new sample is copied into

In [None]:
# Define function to get file names from comma-separated list of directories
def list_files(dirs):
    """Return the paths of all .WAV files directly inside the given directories.

    Parameters
    ----------
    dirs : str
        One or more directory paths separated by commas. Whitespace around an
        entry and empty entries (e.g. from a trailing comma) are ignored.

    Returns
    -------
    list of str
        Paths matching ``<directory>/*.WAV`` (not recursive), grouped in the
        order the directories were given and sorted within each directory.
    """

    # Split directories into separate file paths where there are multiple.
    # Empty entries are dropped: an empty entry would otherwise turn the
    # pattern into "/*.WAV" and search the filesystem root
    dirs_list = [entry.strip() for entry in dirs.split(",") if entry.strip()]

    # Get list of .WAV audio file paths from all specified folders.
    # glob returns matches in OS-dependent order, so sort them for a stable result
    file_paths = [file for x in dirs_list for file in sorted(glob.glob(f"{x}/*.WAV"))]

    # Return list
    return file_paths

In [None]:
# Collect every .WAV path in the configured data folders as the sampling frame
file_list = list_files(data_dirs)

# Reduce each path to "<parent folder>-<file name>": the parent folder carries the
# metadata (cage number, audiomoth number) and the rest of the path is dropped
path_parts = [x.rsplit('/', 2) for x in file_list]
file_list = [parts[1] + '-' + parts[2] for parts in path_parts]

In [None]:
# For sampling from a newline separated .txt of file paths
# NOTE: Only runs when data_paths_txt is set in .env. Leave it unset (or comment this
# cell out) if you're selecting the sample from a folder specified in .env instead
if data_paths_txt:
    with open(data_paths_txt, 'r') as file:
        # Skip blank lines (e.g. a trailing empty line) so they don't turn into
        # bogus "<folder>-" entries in the sampling frame
        data_paths = [line.strip() for line in file if line.strip()]

    # Entries are prefixed with data_dirs
    # NOTE(review): this assumes data_dirs holds a single directory in this mode - confirm
    data_paths = [data_dirs + '/' + x for x in data_paths]

    # Same "<parent folder>-<file name>" form as the folder-based listing
    file_list = [x.rsplit('/', 2)[1] + '-' + x.rsplit('/', 2)[2] for x in data_paths]

In [None]:
# Find the files picked in earlier rounds so they can be excluded
# (i.e. the new sample is taken without replacement)
prev_sample_paths = list_files(prev_sample_dirs)

# Strip the directory part and keep only the file name. Previous samples already
# carry the metadata I need (cage number, audiomoth number) in the file name
prev_samples = [path.rsplit('/', 1)[1] for path in prev_sample_paths]

In [None]:
# Candidate files for this round: everything in file_list that was not sampled before
file_list_sampling = set(file_list).difference(prev_samples)

In [None]:
# Make dataframe with 1 row per file
# Sort first: iteration order of a set of strings changes between kernel sessions
# (hash randomisation), and the row order determines which rows the seeded
# stratified sample picks from this frame
file_df = pd.DataFrame({'file_path': sorted(file_list_sampling)})

In [None]:
# Derive the stratification variables from fixed character positions in the name
# NOTE(review): the offsets assume every name has the same layout - confirm
file_df = file_df.assign(
    cage=lambda df: df['file_path'].str[0:2],
    date=lambda df: df['file_path'].str[32:40],
    start_time=lambda df: df['file_path'].str[41:47],
    # First two characters of the start time are the hour
    start_hour=lambda df: df['start_time'].str[0:2],
    file_name=lambda df: df['file_path'].str.replace("/", "_"),
)

In [None]:
# Bin the start hour into three parts of the night
# My AudioMoths were configured to record from 20:00 to 03:30 which makes these roughly equal
hour_bins = {
    'early': ['20', '21', '22'],
    'mid': ['23', '00', '01'],
    'late': ['02', '03', '04'],
}

# Rows whose hour is in none of the lists keep their start_hour value as the bin,
# because case_when leaves unmatched rows unchanged
file_df['time_bin'] = file_df['start_hour'].case_when(
    [(file_df['start_hour'].isin(hours), label) for label, hours in hour_bins.items()]
)

In [None]:
# Check whether door was open or closed based on date
# A single comparison gives a True/False column directly; the two complementary
# case_when branches were equivalent to this one test
# NOTE(review): assumes 'date' holds YYYYMMDD strings, so string order is date order - confirm
DOOR_OPEN_DATE = '20230624'  # door counts as open from this date onwards
file_df['door_open'] = file_df['date'] >= DOOR_OPEN_DATE

In [None]:
# Stratify by cage, time bin and door state, then draw 20 files from every stratum
# (fixed random_state so the draw is repeatable for the same input rows)
# NOTE: pandas raises a ValueError if any stratum has fewer than 20 files
strata = file_df.groupby(['cage', 'time_bin', 'door_open'])
sample_df = strata.sample(n=20, random_state=1)

In [None]:
# Pull the sampled names out of the dataframe as a plain Python list
sample = list(sample_df['file_path'])

In [None]:
# Copy selected files to a new folder
save_dir = new_sample_dir
os.makedirs(save_dir, exist_ok=True)

for file_name in sample:
    # Sample names have the form "<folder>-<file>". Only the last hyphen is turned
    # back into a path separator, so hyphens inside the folder name survive
    # NOTE(review): assumes the audio file's own name contains no hyphen - confirm
    relative_path = "/".join(file_name.rsplit("-", 1))
    # os.path.join inserts a separator only if data_dirs_parent lacks a trailing one
    file_path = os.path.join(data_dirs_parent, relative_path)
    # The copy keeps the combined "<folder>-<file>" name
    save_path = os.path.join(save_dir, file_name)
    shutil.copyfile(file_path, save_path)