# Filter a list of AudioMoth files by their dates and start times

In [1]:
import datetime
from opensoundscape.audiomoth import audiomoth_start_time
import pytz
import numpy as np
import pandas as pd
from pathlib import Path
from glob import glob

### Define the local timezone
- all date ranges, time ranges, and comparisons will use this time zone
- AudioMoth start time from file name will be converted to this time zone automatically

To list all timezones available, print `pytz.all_timezones`

In [2]:
# Local time zone used for every date range, time range, and comparison in this notebook.
# Any name listed in `pytz.all_timezones` can be used here.
time_zone = pytz.timezone('US/Eastern')

## Select date and time ranges
Note that these are the allowed recording **start times** (e.g., if a recording starts within a period and extends beyond it, it will be included). Both endpoints of each range are inclusive.

In this example, we filter to files starting between 8 and 9pm Eastern time on dates from April 15 to May 5 from any of 3 years: 2020, 2021, or 2022

In [3]:
# Allowed recording start times, one [earliest, latest] pair of datetime.time per entry.
time_ranges = [
    # can list multiple time periods to include
    [datetime.time(hour=20), datetime.time(hour=21)],
]

# Allowed recording start dates, one [first_day, last_day] pair of datetime.date per entry.
# Here: April 15 through May 5 of each listed year.
# can list multiple date ranges to include (add years below, or extend the list with more pairs)
date_ranges = [
    [datetime.date(year, 4, 15), datetime.date(year, 5, 5)]
    for year in (2020, 2021, 2022)
]

Get a list of all audio files in your dataset
 - this globbing pattern assumes audio files are .WAV and located in subfolders of the `dataset_path`. For instance, audio files would have paths like, `/path/to/dataset/folder1/file1.WAV`

In [18]:
# folder containing your audio data
dataset_path = '/path/to/dataset/'

# Build the pattern with Path so a trailing slash in `dataset_path` does not
# produce a doubled separator in every returned file path.
# glob returns matches in arbitrary order, so sort them to make runs reproducible.
files = sorted(glob(str(Path(dataset_path) / "*" / "*.WAV")))

# if you don't have subfolders, use this line instead:
# files = sorted(glob(str(Path(dataset_path) / "*.WAV")))

# one row per audio file; later cells add date/time columns and filter the rows
df = pd.DataFrame({'file': files})
print(f"Found {len(df)} files")

Found 2 files


## Filter files
This cell filters the dataframe to include only files starting in one of the specified date and time ranges

In [21]:
## FILTER FILES ##

# Parse each recording's start time from its file name, then express it in the
# user-selected `time_zone`.
# NOTE(review): the 'datetime_utc' column name suggests audiomoth_start_time returns
# UTC-aware datetimes -- confirm against the opensoundscape version in use.
start_parsed = df["file"].apply(lambda path: audiomoth_start_time(Path(path).name))
df['datetime_utc'] = start_parsed

# The column is called 'datetime_edt', but it holds times in whatever `time_zone` is set to.
start_local = df['datetime_utc'].apply(lambda stamp: stamp.astimezone(time_zone))
df['datetime_edt'] = start_local

# Split the local start time into the calendar date and time-of-day used for filtering.
df['date'] = df['datetime_edt'].apply(lambda stamp: stamp.date())
df['time'] = df['datetime_edt'].apply(lambda stamp: stamp.time())

def in_range(x, r):  # helper function
    """Return True if `x` lies within the inclusive range `r`.

    Args:
        x: value to test, e.g. a `datetime.date` or a `datetime.time`.
        r: two-element sequence `[start, end]`; both endpoints are inclusive.

    Returns:
        bool: True when `x` falls inside the range, otherwise False.

    For `datetime.time` values, a range whose start is later than its end
    (e.g. 22:00 to 02:00) is treated as a window spanning midnight, so it
    matches times at or after `start` OR at or before `end`. For every other
    type, a reversed range matches nothing.
    """
    start, end = r[0], r[1]
    if isinstance(x, datetime.time) and start > end:
        # overnight window: late evening or early morning
        return bool(x >= start or x <= end)
    return bool(start <= x <= end)

#filter to files that *start in* one of the date ranges
#note that the start time and date are converted to the user-specified timezone before
#checking if they fall within the date and time range
# `any` returns False for an empty list of ranges (where `max` would raise ValueError),
# and `.astype(bool)` guarantees a boolean row mask even when the dataframe has no rows,
# so the columns are kept instead of the mask being interpreted as a list of column labels.
date_mask = df['date'].apply(
    lambda d: any(in_range(d, date_range) for date_range in date_ranges)
).astype(bool)
df = df[date_mask]
print(f"Filtered by date: now has {len(df)} files")

#filter to files *starting in* one of the time_ranges
time_mask = df['time'].apply(
    lambda t: any(in_range(t, time_range) for time_range in time_ranges)
).astype(bool)
df = df[time_mask]
print(f"Filtered by start times: now has {len(df)} files")

Filtered by date: now has 0 files
Filtered by start times: now has 0 files
