# preliminaries

## import packages

In [1]:
# Clone the project repository into the current working directory (shell escape).
!git clone https://github.com/nickjourjine/peromyscus-pup-vocal-evolution.git
# Make the cloned repository the working directory for the rest of the notebook.
%cd peromyscus-pup-vocal-evolution
import sys
# Put the repository root on the module search path so that the `src` package can be imported.
sys.path.append("/content/peromyscus-pup-vocal-evolution")
# Project modules from the repository's `src` package.
from src import features, annotation, parameters, segmentation, spectrogramming

Cloning into 'peromyscus-pup-vocal-evolution'...
remote: Enumerating objects: 1051, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 1051 (delta 75), reused 83 (delta 37), pack-reused 925 (from 1)[K
Receiving objects: 100% (1051/1051), 37.50 MiB | 11.23 MiB/s, done.
Resolving deltas: 100% (709/709), done.
/content/peromyscus-pup-vocal-evolution


In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

#file system
import os
import glob

#data handling
import numpy as np
import pandas as pd

#plotting
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# custom code
from src import features, annotation, parameters, segmentation, spectrogramming


## load path variables

In [9]:
#root of the cloned repository - every path below is built from it
repo_root = '/content/peromyscus-pup-vocal-evolution'

#paths to raw unprocessed recordings for each of the four datasets (from Dryad: https://doi.org/10.5061/dryad.g79cnp5ts)
all_wav_development = repo_root + '/data/audio/raw/development/'
all_wav_bw_po_cf = repo_root + '/data/audio/raw/bw_po_cf/'
all_wav_bw_po_f1 = repo_root + '/data/audio/raw/bw_po_f1/'
all_wav_bw_po_f2 = repo_root + '/data/audio/raw/bw_po_f2/'

#several later cells refer to the development recordings as `all_wav_raw`;
#define it here so those cells do not raise a NameError on a fresh kernel
all_wav_raw = all_wav_development

#paths to vocalization clips for each dataset
all_clips_development = repo_root + '/data/audio/clips/development'
all_clips_bw_po_cf = repo_root + '/data/audio/clips/bw_po_cf/'
all_clips_bw_po_f1 = repo_root + '/data/audio/clips/bw_po_f1/'
all_clips_bw_po_f2 = repo_root + '/data/audio/clips/bw_po_f2/'

#root directory for all of the segments (start and stop times)
#NOTE(review): 'clips/audio/segments' looks like it may be a typo for 'audio/segments' - confirm
segments_root = repo_root + '/data/audio/clips/audio/segments/'

#directory to save wav files of each vocalization
clips_root = repo_root + '/data/audio/clips/'

#directory to save spectrograms and umap embeddings
specs_root = repo_root + '/data/features/spectrograms/'

#directory to save data on background noise levels for each recording
noise_root = repo_root + '/data/audio/noise/'

#directory to save data on clipping levels for each vocalization
clipping_root = repo_root + '/data/features/clipping/amplitude_segmented/'

#directory to save SNR and clipping data
snr_clipping_root = repo_root + '/data/features/snr_clipping/'

# segment the audio using amplitude thresholding

## set the segmenting parameters

In [10]:
import os
import numpy as np
from datetime import datetime
from scipy.ndimage import gaussian_filter1d
from src import parameters  # assuming you're in the peromyscus repo

# -----------------------------
# 🔧 Define the vocalization onset/offset detection function
# -----------------------------
def get_onsets_offsets(spec, seg_params):
    """
    Find paired onset/offset indices in the smoothed amplitude trace of a spectrogram.

    The spectrogram is averaged along axis 0, the resulting trace is smoothed with a
    Gaussian filter, and every position where the trace exceeds any of the three
    thresholds is marked as "on". Onsets and offsets are the positions where that
    mark switches on and off.

    Arguments:
        spec (numpy array): spectrogram; presumably frequency x time, since it is
            averaged along axis 0 - TODO confirm against the caller.
        seg_params (dict): must contain 'smoothing_timescale', 'fs', 'th_1', 'th_2'
            and 'th_3'.

    Returns:
        onsets (numpy array): indices i where the mark is off at i and on at i+1.
        offsets (numpy array): indices i where the mark is on at i and off at i+1.
            Both arrays have the same length and onsets[k] precedes offsets[k].
    """
    # NOTE(review): sigma is smoothing_timescale * fs, i.e. a number of audio samples,
    # but it is applied along the columns of `spec` - confirm the intended units.
    sigma = seg_params["smoothing_timescale"] * seg_params["fs"]
    smoothed = gaussian_filter1d(np.mean(spec, axis=0), sigma=sigma)

    # a position is "on" when it exceeds at least one of the three thresholds
    above = np.logical_or.reduce(
        [smoothed > seg_params[name] for name in ("th_1", "th_2", "th_3")]
    )

    # +1 marks an off->on step, -1 marks an on->off step
    steps = np.diff(above.astype(int))
    onsets = np.where(steps == 1)[0]
    offsets = np.where(steps == -1)[0]

    # the trace started "on": the first offset has no onset before it, so discard it
    leading_offset_is_orphan = offsets.size > 0 and (
        onsets.size == 0 or onsets[0] > offsets[0]
    )
    if leading_offset_is_orphan:
        offsets = offsets[1:]

    # keep only complete onset/offset pairs
    n_pairs = min(onsets.size, offsets.size)
    return onsets[:n_pairs], offsets[:n_pairs]

# -----------------------------
# 📦 Segmenting parameters
# -----------------------------
seg_params = dict(
    min_freq=20e3,                 # minimum frequency
    max_freq=125e3,                # maximum frequency
    nperseg=1024,                  # FFT
    noverlap=1024 // 2,            # FFT
    spec_min_val=0.8,              # minimum log-spectrogram value
    spec_max_val=6,                # maximum log-spectrogram value
    fs=250000,                     # audio samplerate
    th_1=0.3,                      # segmenting threshold 1
    th_2=0.3,                      # segmenting threshold 2
    th_3=0.35,                     # segmenting threshold 3
    min_dur=0.015,                 # minimum syllable duration
    max_dur=1,                     # maximum syllable duration
    min_intersyllable=0.004,
    smoothing_timescale=0.00025,   # amplitude
    softmax=False,                 # apply softmax to the frequency bins to calculate amplitude
    temperature=0.01,              # softmax temperature parameter
    thresholds_path=None,
    algorithm=get_onsets_offsets,  # the function itself (it is removed again before saving)
)

# -----------------------------
# 🗂️ Prepare the save location
# -----------------------------
dataset = 'bw_po_cf'
# NOTE(review): this replaces the `segments_root` defined with the other path variables
# by a path relative to the working directory - change it if needed
segments_root = 'peromyscus_output/segments'
# the timestamp doubles as the unique ID of this segmenting iteration
iteration = datetime.now().strftime("%Y%m%d_%H%M%S")
params_save_name = '_'.join([dataset, iteration])
params_save_dir = os.path.join(
    segments_root, dataset, 'amplitude_segmented', iteration, '00_params'
)

# create the directory (and any missing parents)
os.makedirs(params_save_dir, exist_ok=True)

# -----------------------------
# 💾 Save the parameters (without 'algorithm')
# -----------------------------
import copy
seg_params_for_save = copy.deepcopy(seg_params)
del seg_params_for_save["algorithm"]  # a function cannot be saved

# write the parameters file
parameters.save(
    params=seg_params_for_save,
    save_dir=params_save_dir,
    save_name=params_save_name,
)

# confirmation output
print("✅ パラメータ保存完了")
for label, value in (
    ("📝 保存先:", params_save_dir),
    ("📁 ファイル名:", params_save_name),
    ("🔁 iteration:", iteration),
):
    print(label, value)


making a new params file...
saved the params file to:
 peromyscus_output/segments/bw_po_cf/amplitude_segmented/20250527_113136/00_params/bw_po_cf_20250527_113136
✅ パラメータ保存完了
📝 保存先: peromyscus_output/segments/bw_po_cf/amplitude_segmented/20250527_113136/00_params
📁 ファイル名: bw_po_cf_20250527_113136
🔁 iteration: 20250527_113136


In [11]:
#choose the parameters for MZ
#NOTE(review): this rebinds the global `seg_params` to the MZ-specific values
seg_params = dict(
    min_freq=20e3,                 # minimum frequency
    max_freq=125e3,                # maximum frequency
    nperseg=1024,                  # FFT
    noverlap=1024 // 2,            # FFT
    spec_min_val=2,                # minimum log-spectrogram value
    spec_max_val=6,                # maximum log-spectrogram value
    fs=250000,                     # audio samplerate
    th_1=.3,                       # segmenting threshold 1
    th_2=.3,                       # segmenting threshold 2
    th_3=.35,                      # segmenting threshold 3
    min_dur=0.015,                 # minimum syllable duration
    max_dur=1,                     # maximum syllable duration
    min_intersyllable=.004,
    smoothing_timescale=0.00025,   # amplitude
    softmax=False,                 # apply softmax to the frequency bins to calculate amplitude
    temperature=0.01,              # softmax temperature parameter
    thresholds_path=None,
    # NOTE(review): this is the printed representation of the function, not the function
    # itself - confirm what the segmenting code expects to find under 'algorithm'
    algorithm='<function get_onsets_offsets at 0x7f944cc355f0>',  # (defined above)
)

#name them
params_save_name = '_'.join(('MZ', dataset, iteration))
params_save_dir = os.path.join(
    segments_root, dataset, 'amplitude_segmented', iteration, '00_params'
) + '/'

#save them (the params directory must already exist)
assert os.path.exists(params_save_dir)
assert params_save_dir.endswith('/')
parameters.save(
    params=seg_params,
    save_dir=params_save_dir,
    save_name=params_save_name,
)



making a new params file...
saved the params file to:
 peromyscus_output/segments/bw_po_cf/amplitude_segmented/20250527_113136/00_params/MZ_bw_po_cf_20250527_113136


## segment

In [12]:
#get raw audio to segment

#change `dataset` and `iteration` (set when the segmenting parameters were saved) if you want to
#keep segmenting from a previous iteration

#directory of raw recordings for each dataset
raw_dirs = {'development': all_wav_development,
            'bw_po_cf': all_wav_bw_po_cf,
            'bw_po_f1': all_wav_bw_po_f1,
            'bw_po_f2': all_wav_bw_po_f2}

#species labels to segment for each dataset
#NOTE(review): the bw_po_f1 and bw_po_f2 labels are the species values this notebook checks for
#when it writes wav clips - confirm they are what segmentation.get_amplitude_segments expects
species_lists = {'bw_po_cf': ['BW', 'PO', 'CF-BW', 'CF-PO'],
                 'bw_po_f1': ['cross-BW', 'cross-PO', 'BW-PO-cross-F1'],
                 'bw_po_f2': ['ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8'],
                 'development': ['NB', 'PO', 'MZ', 'BK', 'LL', 'MU', 'BW', 'GO', 'SW', 'LO', 'IS']}

assert dataset in raw_dirs, 'unknown dataset: ' + str(dataset)
raw_dir = raw_dirs[dataset]
species_list = species_lists[dataset]

#fail early, with a useful message, if the recordings have not been downloaded
if not os.path.isdir(raw_dir):
    raise FileNotFoundError('raw audio directory not found: ' + raw_dir +
                            ' - download the recordings (https://doi.org/10.5061/dryad.g79cnp5ts) first')

#ensure that the corrupted unsegmentable bw_po_f2 files aren't in raw_dir
unsegmentable = ['ch8_BWxPO-cross-F2_26878x27490_fam-D3_ltr6_pup4_ch8_4700_f_333_298_fr0_p9_2021-04-16_15-26-07.wav',
                 'ch8_BWxPO-cross-F2_26878x27490_fam-D3_ltr6_pup7_ch8_3800_f_338_285_fr0_p7_2021-04-14_17-27-18.wav',
                 'ch8_BWxPO-cross-F2_27404x27407_fam-A7_ltr1_pup7_ch8_4200_m_328_275_fr1_p7_2021-01-05_13-47-10.wav']

#list the directory once rather than once per file
raw_files = set(os.listdir(raw_dir))
for i in unsegmentable:
    assert i not in raw_files

#specify the directory where segment csvs will be saved
save_dir = os.path.join(segments_root,dataset,'amplitude_segmented',iteration)

#load the params and make sure everything looks ok
print('\ndata set is:\n\t',dataset, '\n')

# load the parameters
params_save_dir = os.path.join(segments_root,dataset,'amplitude_segmented',iteration,'00_params')+'/'
params_save_name = ('_').join([dataset,iteration])
seg_params = parameters.load(save_dir = params_save_dir, save_name = params_save_name)

#MZ has its own parameters, which only exist for the development dataset
MZ_seg_params = None
if dataset == 'development':
    MZ_seg_params = parameters.load(save_dir = params_save_dir, save_name = ('_').join(['MZ',dataset,iteration]))

print('\nthey are:\n')
for key in seg_params.keys():
    print('\t',key,':',seg_params[key])

if MZ_seg_params is not None:
    print('\nMZ specific params are:\n')
    for key in MZ_seg_params.keys():
        print('\t',key,':',MZ_seg_params[key])

print('\nstart and stop times will be identified in raw wav files here:\n\t', raw_dir)
print('\nand saved here:\n\t', save_dir)

val = input('everything look ok for segmenting? y/n')
assert val in ['y','n']
if val == 'n':
    print('ok - doing nothing')
elif val == 'y':

    #iterate through each species you want and segment
    for species in species_list:

        #use the MZ specific parameters for MZ only - choosing per species (instead of
        #overwriting seg_params) keeps them from leaking into the species processed after MZ
        species_params = MZ_seg_params if species == 'MZ' else seg_params

        segmentation.get_amplitude_segments(audio_dir = raw_dir,
                                            save_dir = save_dir,
                                            seg_params = species_params,
                                            species = species,
                                            thresholds_path = species_params['thresholds_path'],
                                            intersyll_threshold = species_params['min_intersyllable'],
                                            duration_threshold = species_params['min_dur'])



FileNotFoundError: [Errno 2] No such file or directory: '/content/peromyscus-pup-vocal-evolution/data/audio/raw/bw_po_cf/'

## aggregate the segments files

In [None]:
#choose the dataset and segmenting iteration if you want (otherwise the existing values of dataset
#and iteration will be used) - these are the only things you have to change in this cell to process
#a new dataset
dataset = 'bw_po_f2'
iteration = '20220921_040238'

#path to the dir containing one csv with segment start and stop times per raw audio file
segments_path = os.path.join(segments_root,dataset,'amplitude_segmented',iteration)

#directory of raw recordings for the chosen dataset
raw_dirs = {'development': all_wav_development,
            'bw_po_cf': all_wav_bw_po_cf,
            'bw_po_f1': all_wav_bw_po_f1,
            'bw_po_f2': all_wav_bw_po_f2}
assert dataset in raw_dirs, 'unknown dataset: ' + str(dataset)
all_wav_raw = raw_dirs[dataset]

#the per-recording segment csvs: skips hidden files, the aggregated file and anything that is not
#a csv (for example the 00_params directory); sorted so the combined file is reproducible
segment_csvs = sorted(i for i in os.listdir(segments_path)
                      if i.endswith('.csv') and 'all' not in i and not i.startswith('.'))

#make sure you segmented every wav file
raw_wavs = [i for i in os.listdir(all_wav_raw) if not i.startswith('.')]
segmented_wavs = [i.split('.')[0]+'.wav' for i in segment_csvs]
assert sorted(raw_wavs) == sorted(segmented_wavs), "You haven't segmented all of the raw wav files..."
assert len(segment_csvs) > 0, 'no segment csvs found in ' + segments_path

#make a list of the segments files for each pup
pup_segments = [os.path.join(segments_path, i) for i in segment_csvs]

#combine them
all_combined = pd.concat([pd.read_csv(path) for path in pup_segments])

#fix up the source_file column (keep the file name only) and add species column info
#(the species is the part of the file name before the first underscore)
all_combined['source_file'] = [os.path.basename(i) for i in all_combined['source_file']]
all_combined['species'] = [i.split('_')[0] for i in all_combined['source_file']]

#check for na and duplications, make sure source file is formatted correctly, then save
assert all_combined.isna().sum().sum() == 0
assert all_combined.duplicated().sum() == 0
assert set([i.split('_')[0] for i in all_combined['source_file']]) == set(all_combined['species'])
all_combined.to_csv(os.path.join(segments_path, 'all_combined.csv'), index=False)
print('saved a combined file to:\n\t', os.path.join(segments_path, 'all_combined.csv'))

#preview to make sure column names look ok
all_combined.head()


# generate wav clips from amplitude segmented segments

## write wav files for vocalizations

In [None]:
#uses the dataset and iteration chosen when the segments were aggregated

#load the combined segments csv
segments_path = os.path.join(segments_root,dataset,'amplitude_segmented',iteration)
source_data_path = os.path.join(segments_path, 'all_combined.csv')
source_data = pd.read_csv(source_data_path)

#make sure it's what you expect and that you cleaned up any na and duplications
assert source_data.isna().sum().sum() == 0
assert source_data.duplicated().sum() == 0
assert set(source_data.columns) == set(['start_seconds', 'stop_seconds', 'source_file', 'duration', 'species'])

observed_species = set(source_data['species'].unique())
if dataset == 'bw_po_cf':
    assert observed_species == set(['BW', 'PO', 'CF-BW', 'CF-PO'])
elif dataset == 'bw_po_f1':
    assert observed_species == set(['cross-BW', 'cross-PO', 'BW-PO-cross-F1'])
elif dataset == 'bw_po_f2':
    assert observed_species == set(['ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8'])
    #drop the unsegmentable bw_po_f2
    unsegmentable = ['ch8_BWxPO-cross-F2_26878x27490_fam-D3_ltr6_pup4_ch8_4700_f_333_298_fr0_p9_2021-04-16_15-26-07.wav',
                     'ch8_BWxPO-cross-F2_26878x27490_fam-D3_ltr6_pup7_ch8_3800_f_338_285_fr0_p7_2021-04-14_17-27-18.wav',
                     'ch8_BWxPO-cross-F2_27404x27407_fam-A7_ltr1_pup7_ch8_4200_m_328_275_fr1_p7_2021-01-05_13-47-10.wav']
    source_data = source_data.loc[~source_data['source_file'].isin(unsegmentable)]
elif dataset == 'development':
    #check the data that was just loaded rather than a species list left over from another cell
    #NOTE(review): only checks that no unexpected species is present - tighten to an exact
    #match once the expected set (with or without 'IS') is confirmed
    assert observed_species <= set(['BW', 'BK', 'NB', 'SW', 'PO', 'LO', 'GO', 'LL', 'MU', 'MZ', 'IS'])

#get the path to the raw unsegmented wavs
raw_dirs = {'bw_po_cf': all_wav_bw_po_cf,
            'bw_po_f1': all_wav_bw_po_f1,
            'bw_po_f2': all_wav_bw_po_f2,
            'development': all_wav_development}
assert dataset in raw_dirs, 'unknown dataset: ' + str(dataset)
wavs_dir = raw_dirs[dataset]

bar = '######################################################################################################'
#set the directory for saving and make it (with any missing parents)
save_root = os.path.join(clips_root,'amplitude_segmented',dataset, iteration)
if not os.path.isdir(save_root):
    os.makedirs(save_root)
    print('made a directory to save vocalization clips:', save_root)
assert os.path.exists(save_root)

print(bar)
print('segmenting wavs from this directory:\n\n',wavs_dir,'\n')
print(bar)
print('using start and stop times from this file:\n\n',source_data_path,'\n')
print(bar)
print('saving them to individual species directories here:\n\n',save_root,'\n')
print(bar)
print('those species are:\n\n',sorted(source_data['species'].unique()),'\n')
print(bar)
print(len(source_data), 'vocalization clips will be written to wav files')

val = input("continue? y/n")
assert val in ['y', 'n']
if val == 'n':
    print('ok - doing nothing.')
elif val == 'y':
    #columns holding the start and stop time of each vocalization, in seconds
    start_column = 'start_seconds'
    end_column = 'stop_seconds'

    #iterate through the species
    species_list = list(source_data['species'].unique())
    for species in species_list:
        print(species)

        #one directory of clips per species
        save_location = os.path.join(save_root,species)
        if not os.path.isdir(save_location):
            print('making a directory to store', species, 'vocalization clips...')
            os.makedirs(save_location)

        segmentation.get_wav_clips(wavs_dir = wavs_dir,
                                   save_location = save_location,
                                   source_data = source_data,
                                   start_column = start_column,
                                   end_column = end_column,
                                   label_column = None,
                                   species = species,
                                   margin = 0,
                                   units = 's')

## write wav files for inter-vocalization intervals

In [None]:
#segment background - useful for finding wav clips that don't have any vocalizations in them
# dataset = 'bw_po_f2'
# iteration = '20220921_040238'

#choose the species and directories where things will be saved
species =['MZ']

#name the columns that will record start and stop time of background in seconds
start_column = 'start_seconds'
stop_column = 'stop_seconds'

#raw recordings for the current dataset
#NOTE(review): this used to be built from an undefined name (`all_wavs_raw`) joined with the
#dataset and iteration; it is now resolved from the dataset like in the other cells - confirm this
#is the directory segmentation.get_background_clips expects
raw_dirs = {'development': all_wav_development,
            'bw_po_cf': all_wav_bw_po_cf,
            'bw_po_f1': all_wav_bw_po_f1,
            'bw_po_f2': all_wav_bw_po_f2}
assert dataset in raw_dirs, 'unknown dataset: ' + str(dataset)
raw_wavs_dir = raw_dirs[dataset]

#directory for the background clips, inside the current save_dir; kept under its own name so
#that re-running this cell does not nest the directory one level deeper each time
background_dir = os.path.join(save_dir, '01_background_clips')
if not os.path.exists(background_dir):
    os.makedirs(background_dir)
    print('made a directory at', background_dir)

#path to the aggregated segments csv (all_combined.csv) in segments_path
all_segments_df = os.path.join(segments_path, 'all_combined.csv')


for s in species:
    #one directory per species (the trailing slash is kept for the clip-writing function)
    save_location = os.path.join(background_dir, s) + '/'

    #make a directory for the species
    if not os.path.isdir(save_location):
        print('making a directory to store', s, 'background clips...')
        os.makedirs(save_location)

    segmentation.get_background_clips(raw_wavs_dir=raw_wavs_dir ,
                                      save_location=save_location,
                                      all_segments_df=all_segments_df,
                                      start_column=start_column,
                                      stop_column=stop_column,
                                      label_column = None,
                                      species = s,
                                      units = 's')

## choose noise clips

In [None]:
#best to run this cell one species at a time and save clips to species specific directories
species = 'BK'

#the segments to use (from the "aggregate the segments files" step above)
dataset = 'development'
iteration='20230118_083823'
seg_df = os.path.join(segments_root,dataset,'amplitude_segmented',iteration, 'all_combined.csv')

#path to directory containing raw audio - named explicitly because this cell always works on the
#development dataset, whatever dataset an earlier cell was last run with
audio_dir = all_wav_development

#path to save wav clips
save_dir = os.path.join(noise_root, 'test_20230120', species)

#create the directory together with any missing parents
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    print('made the directory', save_dir)

#recordings of this species: the species is the part of the file name before the first underscore
pups = sorted([i.split('.')[0] for i in os.listdir(audio_dir)
               if not i.startswith('.') and i.split('_')[0] == species])

for pup in pups:
    annotation.get_noise_clip(pup=pup,
                              audio_dir=audio_dir,
                              seg_csv=seg_df,
                              save_dir=save_dir,
                              margin=0,
                              min_dur=2,
                              max_dur=3,
                              units = 's')

## calculate noise floors

In [None]:
#use the background clips in cell 3.3 above to calculate "noise floors", ie the spectrogram value
#below which you will consider a pixel "background"

save_dir = os.path.join(noise_root, 'test_20230120')
species_list = ['BW', 'BK', 'NB', 'SW', 'PO', 'LO', 'GO', 'LL', 'MU', 'MZ', 'IS']

#spectrogram parameters used when measuring the noise floors
noise_spec_params = {
    'min_freq': 5000, # minimum frequency
    'max_freq': 125000, # maximum frequency
    'nperseg': 512, # FFT
    'noverlap': 512 // 4, # FFT
    'fs': 250000, # audio samplerate

}

#keep a record of the parameters next to the results: saved with parameters.save (the same call
#used for the segmenting parameters) into this cell's own save_dir
parameters.save(params = noise_spec_params,
                save_dir = save_dir,
                save_name = 'noise_spec_params')

noise_floors_df = []
for species in species_list:
    #NOTE(review): the noise clips above were written to noise_root/test_20230120/<species> but
    #are read from noise_root/<species> here - confirm which directory is intended
    df =  annotation.get_noise_floor(noise_dir = os.path.join(noise_root, species),
                                     thresh = 2,
                                     species = species,
                                     save_dir = save_dir,
                                     spec_params = noise_spec_params,
                                     verbose=False,
                                     save = False)

    noise_floors_df.append(df)

#one table with the noise floors of every species
new = pd.concat(noise_floors_df)
print('done.')

done.


# get clipping

## set spectrogramming parameters

In [None]:
#choose params that match the ones you will use for getting features

#NOTE(review): max_freq is 12500 here but 125000 in the noise-floor cell above - confirm that this
#is not a dropped zero
noise_spec_params = dict(
    min_freq=5000,        # minimum frequency
    max_freq=12500,       # maximum frequency
    nperseg=1024,         # FFT
    noverlap=1024 // 4,   # FFT
    fs=250000,            # audio samplerate
)

spec_params = dict(
    min_freq=5000,                    # minimum frequency
    max_freq=124999,                  # maximum frequency
    nperseg=256,                      # FFT
    noverlap=256 // 4,                # FFT
    spec_min_val=0.7,                 # minimum log-spectrogram value
    fs=250000,                        # audio samplerate
    downsample_by=2,                  # 2 means take every other pixel from the original spectrogram
    log_resize_scaling_factor=None,
)


## get noise floors

In [None]:
#csv of noise floors; built from noise_root so that it lives under the same repository root as
#every other path in this notebook
noise_floors_path = os.path.join(noise_root, 'all_noise_floors.csv')
nfdf = pd.read_csv(noise_floors_path)

## get clipping

In [None]:
clipping_threshold = 0.95
dataset = 'bw_po_cf'
iteration = '20230206_050454'
#iteration = '20230206_99thresh'
save = True

#directory for the per-species clipping csvs (created with any missing parents)
save_dir = os.path.join(snr_clipping_root,dataset,iteration)
if save:
    os.makedirs(save_dir, exist_ok=True)

################################################################################################################

#directory of vocalization clips to process and the species directories expected inside it
if dataset == 'bw_po_cf':
    to_process_dir = os.path.join(all_clips_bw_po_cf, iteration)
    expected_species = ['BW', 'PO', 'CF-BW', 'CF-PO']

elif dataset == 'bw_po_f1':
    to_process_dir = all_clips_bw_po_f1
    expected_species = ['cross-BW', 'cross-PO', 'BW-PO-cross-F1']

elif dataset == 'bw_po_f2':
    to_process_dir = all_clips_bw_po_f2
    expected_species = ['ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8']

elif dataset == 'development':
    print('dataset is:', dataset)
    to_process_dir = all_clips_development
    expected_species = ['BW', 'BK', 'NB', 'SW', 'PO', 'LO', 'GO', 'LL', 'MU', 'MZ', 'IS']

else:
    raise ValueError('unknown dataset: ' + str(dataset))

#one directory per species; hidden entries such as .DS_Store are ignored
species_list = sorted(i for i in os.listdir(to_process_dir) if not i.startswith('.'))
assert species_list == sorted(expected_species)

#get clipping percents corresponding to each wav file

clipping_dfs = []
for species in species_list:
    wav_dir = os.path.join(to_process_dir, species)
    print('calculating clipping for each vocalization clip in:\n\t', wav_dir)
    clipping_df = features.get_clipping_batch(wav_dir = wav_dir,
                                              threshold = clipping_threshold,
                                              species = species)
    clipping_dfs.append(clipping_df)

    if save:
        save_name = ('_').join([species,'clipping.csv'])
        save_path = os.path.join(save_dir,save_name)

        #never overwrite an existing csv
        if not os.path.exists(save_path):
            clipping_df.to_csv(save_path, index=False)
            print('\tsaved clipping csv to:', save_path)
        else:
            print('\tclipping csv already exists...')

print('done.')



In [None]:
## aggregate clipping csvs

save = True

#`iteration` is reused from the clipping cell above
dataset = 'bw_po_cf'

#the aggregated file is written next to the per-species files
save_dir = os.path.join(snr_clipping_root,dataset,iteration)
save_name = ('_').join(['all', dataset, 'clipping.csv'])
aggregated_path = os.path.join(save_dir,save_name)

#every per-species clipping csv, leaving out a previously aggregated file
to_aggregate = []
for path in glob.glob(os.path.join(save_dir, '*clipping.csv')):
    if os.path.basename(path) != save_name:
        to_aggregate.append(path)

all_clipping = [pd.read_csv(path) for path in to_aggregate]
all_clipping_df = pd.concat(all_clipping)

#write the aggregated csv unless it already exists
if save and not os.path.exists(aggregated_path):
    all_clipping_df.to_csv(aggregated_path, index=False)
    print('saved a file:', aggregated_path)

print('done.')



# UMAP embedding

## set species and paths

In [None]:
#set the species to process
species_list = ['PO','BW', 'BK', 'NB', 'SW', 'LO', 'GO', 'LL', 'MU', 'MZ']

#the dataset for umap
dataset = 'development'
segment_iteration = '20230118_083823' #this is the segmenting iteration for the segments in the preprint

#unique iteration ID for each time you do an embedding for a dataset
#(the next cell sets umap_iteration to a fixed ID instead)
umap_iteration = parameters.get_date_time()

#all predicted start and stop times for this dataset
aggregated_segments_path = os.path.join(segments_root,dataset,'amplitude_segmented',segment_iteration,'all_combined.csv')
seg_df = pd.read_csv(aggregated_segments_path)

#path to wav files for each segmented vocalization
all_wav_dir = os.path.join(clips_root,'amplitude_segmented',dataset,segment_iteration)
assert os.path.exists(all_wav_dir)

#path to directory where spectrograms and umap embedding coordinates will be saved
#(created together with any missing parent directories)
all_spec_dir = os.path.join(specs_root,'amplitude_segmented',dataset, segment_iteration)
if not os.path.isdir(all_spec_dir):
    os.makedirs(all_spec_dir)
    print('made a directory to store spectrograms for umap embeddings :\t\n', all_spec_dir, '\n')
assert os.path.exists(all_spec_dir)

#path to the noise floors csv (noise_floors_path is set in the "get noise floors" cell above)
assert os.path.exists(noise_floors_path)


## choose and save spectrogram parameters

In [None]:
#write the spec params

umap_iteration = '20230120_105045'

#make necessary directories for spectrograms
specs_dir = os.path.join(all_spec_dir,umap_iteration)+'/'
if not os.path.isdir(specs_dir):
    os.makedirs(specs_dir)
    print('made a root directory to store umap embeddings from this version_name:\t\n', specs_dir, '\n')

params_dir = os.path.join(specs_dir,'00_params')
if not os.path.isdir(params_dir):
    os.mkdir(params_dir)
    print('made a params directory to store umap embeddings from this version_name:\t\n', specs_dir+'00_params\n')

#write the params dictionaries for each species
for species in species_list:
    species_param_name = ('_').join([species,'spec_params',umap_iteration])
    #NOTE(review): the clip-writing cell above saves clips to <iteration>/<species>, without a
    #'vocalization_clips' level - confirm that this directory exists
    species_wav_clips_dir = os.path.join(clips_root, 'amplitude_segmented', dataset, segment_iteration, 'vocalization_clips', species)

    #the longest predicted vocalization of this species
    species_durations = seg_df.loc[seg_df['species'] == species, 'duration']
    assert len(species_durations) > 0, 'no segments found for species ' + species
    max_dur = float(np.max(species_durations))
    print('longest predicted voc from', species, 'is', max_dur, 'seconds')

    spec_params = {
        'species': species,
        'min_freq': 5000, # minimum frequency
        'max_freq': 125000, # maximum frequency
        'nperseg': 512, # FFT
        'noverlap': 512 // 4, # FFT
        'spec_min_val': .5, # minimum log-spectrogram value - update from noise floors dataframe if noise_floors_path provided
        'fs': 250000, # audio samplerate
        'fill_value': .5,
        'max_duration':max_dur,
        'num_time_bins':128,
        'num_freq_bins':128,
        'spec_max_val':10,
        'wav_clips_source':species_wav_clips_dir,
        'noise_floors_path': noise_floors_path
    }

    #save the spec params for this species with parameters.save, the same call used for the
    #segmenting parameters
    parameters.save(params = spec_params, save_dir = params_dir, save_name = species_param_name)


## find UMAP embeddings for each species

In [None]:
#choose what to embed (TODO - get the date from datetime)
dataset = 'development'
species_list = ['PO','BW', 'BK', 'NB', 'SW', 'LO', 'GO', 'LL', 'MU', 'MZ']
segment_iteration = '20230118_083823'
umap_iteration = '20230120_105045'

#directory holding the saved spec params (specs_dir ends with a slash)
params_dir = specs_dir+'00_params/'

#preview what will be processed for each species
for species in species_list:
    print('##########################################################################')
    #NOTE(review): the clip-writing cell above saves clips to <iteration>/<species>, without a
    #'vocalization_clips' level - confirm that this directory exists
    species_wav_clips_dir = os.path.join(clips_root, 'amplitude_segmented', dataset, segment_iteration, 'vocalization_clips', species)
    species_param_name = ('_').join([species,'spec_params',umap_iteration])
    print(species)
    print('##########################################################################')
    print('getting umap embedding from wav clips here.......\n\n',species_wav_clips_dir ,'\n')
    print('using these parameters..............\n\n', os.path.join(params_dir, species_param_name), '\n')
    print('and saving umap coordinates here.......\n\n',specs_dir,'\n')
    print(len(seg_df.loc[seg_df['species'] == species]), 'vocalization clips will be processed')

val = input('continue?y/n')

assert val in ['y', 'n']

if val == 'n':
    print('ok - doing nothing.')

elif val == 'y':

    for species in species_list:
        params_name = ('_').join([species,'spec_params',umap_iteration])

        #load the spec params with parameters.load, the same call used for the segmenting parameters
        print('loading parameters...')
        spec_params = parameters.load(save_dir = params_dir, save_name = params_name)
        print('done.')

        #get the clips for the embedding (excluding noise)
        print('getting paths to wav clips...')
        clips_to_process = glob.glob(os.path.join(spec_params['wav_clips_source'],'*.wav'))
        print('done.')

        #make the umap
        print('getting umap embeddings...')
        spectrogramming.wavs_to_umap(clips_dir=None,
                                     noise_floors_path = spec_params['noise_floors_path'],
                                     species = None,
                                     noise_floor = None,
                                     spec_params = spec_params,
                                     num_to_process = 'all',
                                     filtered_clips = clips_to_process,
                                     version=umap_iteration,
                                     save_root = specs_dir)