# TEXTGRID HANDLING
This notebook is all the process from the textgrids to the creation of the sound snippets.
It works from the moment when all the sound files and textgrids were matched, and renamed and were put in one single directory. (This process was semi-manual, so I don’t have a clean notebook for that, because I was using both python and bash commands and some file explorer manual work.)

## Imports

In [1]:
# Import packages
from praatio import textgrid
import pandas as pd
import numpy as np
import os

## Getting files list

<span style="color:red;font-weight:bold">PICK TEST OR LIVE PATH WISELY</span>

In [2]:
# TEST FOLDER PATH
#folder = "/Users/rblc/code/canine_vocalization/sound_samples/"

# LIVE FOLDER PATH
folder = "/Volumes/LaCie/laica_sounds/all_sounds/"

In [3]:
# Create file_names list
def get_files(folder = folder):
    files = os.listdir(folder)
    return files

In [4]:
# Test file_names function
files = get_files()
files

['bark_001.TextGrid',
 'snippets',
 'bark_001.wav',
 'bark_002.TextGrid',
 'bark_002.wav',
 'bark_003.TextGrid',
 'bark_003.wav',
 'bark_004.TextGrid',
 'bark_004.wav',
 'bark_005.TextGrid',
 'bark_005.wav',
 'bark_006.TextGrid',
 'bark_006.wav',
 'bark_007.TextGrid',
 'bark_007.wav',
 'bark_008.TextGrid',
 'bark_008.wav',
 'bark_009.TextGrid',
 'bark_009.wav',
 'bark_010.TextGrid',
 'bark_010.wav',
 'bark_011.TextGrid',
 'bark_011.wav',
 'bark_012.TextGrid',
 'bark_012.wav',
 'bark_013.TextGrid',
 'bark_013.wav',
 'bark_014.TextGrid',
 'bark_014.wav',
 'bark_015.TextGrid',
 'bark_015.wav',
 'bark_016.TextGrid',
 'bark_016.wav',
 'bark_017.TextGrid',
 'bark_017.wav',
 'bark_018.TextGrid',
 'bark_018.wav',
 'bark_019.TextGrid',
 'bark_019.wav',
 'bark_020.TextGrid',
 'bark_020.wav',
 'bark_021.TextGrid',
 'bark_021.wav',
 'bark_022.TextGrid',
 'bark_022.wav',
 'bark_023.TextGrid',
 'bark_023.wav',
 'bark_024.TextGrid',
 'bark_024.wav',
 'bark_025.TextGrid',
 'bark_025.wav',
 'bark_026.T

In [5]:
# Create a list of wav files and textgrid files
def wavs_and_textgrids(files = files):
    wavs = sorted([x for x in files if x.endswith(".wav")])
    textgrids = sorted([x for x in files if x.endswith(".TextGrid")])
    return wavs, textgrids
    

In [6]:
wavs, textgrids = wavs_and_textgrids()

In [7]:
# Quick check to make sure the lists are the same length
print(f'Wavs: {len(wavs)}, Textgrids: {len(textgrids)}')

# If there are many files, comment this out
'''for x in wavs:
    print(x)

for x in textgrids:
    print(x)'''

Wavs: 922, Textgrids: 922


'for x in wavs:\n    print(x)\n\nfor x in textgrids:\n    print(x)'

## Raw dataframe from textgrids

In [8]:
def textgrid_to_raw_df(textgrids = textgrids, folder = folder):
    entry_dicts = []
    
    for i in range(len(textgrids)):
    
        textgrid_name = textgrids[i]
        file_path = os.path.join(folder, textgrid_name)
        try:
            tg = textgrid.openTextgrid(file_path, False)
            # loop through all possible tier levels
            for tier_name in tg.tierNames:
                tier = tg.getTier(tier_name)
                # append entry_dicts with a dictionary for each entry
                for entry in tier.entries:
                    entry_dict = {
                                'file_name': textgrid_name,
                                'tier_name': tier_name,
                                'start': entry.start,
                                'end': entry.end,
                                'label': entry.label}
                    entry_dicts.append(entry_dict)
        except:
            print(f'Error with {textgrid_name}')

    # create a pandas DataFrame from the list of dictionaries
    df = pd.DataFrame(entry_dicts)
    
    # convert all labels to lowercase
    df = df.apply(lambda x: x.astype(str).str.lower())
    
    # convert start and end columns to float type
    df['start'] = df['start'].astype(float)
    df['end'] = df['end'].astype(float)

    return df

In [9]:
raw_df = textgrid_to_raw_df()

In [10]:
# rename label content: whining to whine
raw_df['label'] = raw_df['label'].replace('whining', 'whine')
raw_df['label'] = raw_df['label'].replace('barking/yelping', 'yelp')
raw_df['label'] = raw_df['label'].replace('sound', 'bark')
raw_df['label'] = raw_df['label'].replace('growlgrowl', 'growl')
raw_df['label'] = raw_df['label'].replace('soundsound', 'bark')
raw_df['label'] = raw_df['label'].replace('soun', 'bark')
raw_df['label'] = raw_df['label'].replace('soundsoundsound', 'bark')
raw_df['label'] = raw_df['label'].replace('panting', 'pant')
raw_df['label'] = raw_df['label'].replace('sounds', 'bark')
raw_df['label'] = raw_df['label'].replace('other voclaization', 'other vocalization')
raw_df['label'] = raw_df['label'].replace('other vocalization', 'generic_dog_sound')

In [11]:
# Check value counts in label column
print(raw_df['label'].value_counts())

bark                 8609
f0                   3762
whine                2647
growl                2033
pant                 1073
x                     851
yelp                  849
generic_dog_sound     356
start                  86
end                    86
other                   1
Name: label, dtype: int64


## Creating the splicer dataframe

<span style="color:red;font-weight:bold">ONE CELL HAS TO BE ACTIVATED</span>


In [12]:
feature_wishlist = ['bark', 'whine', 'growl', 'pant', 'yelp']

In [13]:
def splicer_df_maker(raw_df = raw_df, feature_wishlist = feature_wishlist):
    slicer_df = raw_df[raw_df['label'].isin(feature_wishlist)]
    return slicer_df

In [14]:
splicer_df = splicer_df_maker()

In [15]:
# Check value counts in label column
print(splicer_df['label'].value_counts())

bark     8609
whine    2647
growl    2033
pant     1073
yelp      849
Name: label, dtype: int64


In [16]:
splicer_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15211 entries, 0 to 20352
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   file_name  15211 non-null  object 
 1   tier_name  15211 non-null  object 
 2   start      15211 non-null  float64
 3   end        15211 non-null  float64
 4   label      15211 non-null  object 
dtypes: float64(2), object(3)
memory usage: 713.0+ KB


In [19]:
#############################
# RUN ONLY IF REALLY NEEDED #
#############################


splicer_df.to_pickle('splicer_df.pkl')

In [22]:
#############################
# RUN ONLY IF REALLY NEEDED #
#############################


#splicer_df.to_csv('splicer_df.csv')

# CREATING SNIPPETS

## Imports

In [23]:
from pydub import AudioSegment
import math

## Checking the splicer dataframe

In [17]:
splicer_df

Unnamed: 0,file_name,tier_name,start,end,label
0,bark_001.textgrid,silences,0.362390,0.594390,bark
1,bark_001.textgrid,silences,0.922390,1.618390,bark
2,bark_001.textgrid,silences,2.026390,2.266390,bark
3,bark_001.textgrid,silences,2.490390,2.698390,bark
4,bark_001.textgrid,silences,3.730390,4.282390,bark
...,...,...,...,...,...
20348,threat_194.textgrid,silences,69.711196,71.743516,growl
20349,threat_194.textgrid,silences,73.173805,75.378593,growl
20350,threat_194.textgrid,silences,76.909300,77.822912,growl
20351,threat_194.textgrid,silences,78.782833,79.962138,growl


In [18]:
splicer_df.label.value_counts()

bark     8609
whine    2647
growl    2033
pant     1073
yelp      849
Name: label, dtype: int64

In [26]:
splicer_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15211 entries, 0 to 20352
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   file_name  15211 non-null  object 
 1   tier_name  15211 non-null  object 
 2   start      15211 non-null  float64
 3   end        15211 non-null  float64
 4   label      15211 non-null  object 
dtypes: float64(2), object(3)
memory usage: 713.0+ KB


## Auido splicing happens here

<span style="color:red;font-weight:bold">ONE CELL HAS TO BE ACTIVATED</span>

In [27]:
def export_audio_snippets(df = splicer_df, folder = folder):
    # Create folder if it doesn't exist
    os.makedirs(os.path.join(folder, 'snippets'), exist_ok=True)
    
    # Group the dataframe by label
    label_groups = df.groupby('label')
    
    # Loop over label groups
    for label, group in label_groups:
        # Loop over rows in the group
        for i, row in group.iterrows():
            # Extract information from the row
            file_name = row['file_name']
            start = row['start']
            end = row['end']
            
            # Create wav file name from file_name
            wav_file_name = os.path.splitext(file_name)[0]+'.wav'
            
            # Create the path to the input audio file
            path = os.path.join(folder, wav_file_name)
            
            # Read the input audio file
            audio = AudioSegment.from_wav(path)
            
            # Compute the start and end times in milliseconds
            t1 = int(start * 1000)
            t2 = int(end * 1000)
            
            # Extract the audio snippet
            snippet = audio[t1:t2]
            
            # Create the output file name
            output_name = f"{label}_{i:05d}.wav"
            
            # Export the audio snippet to a file
            output_path = os.path.join(folder, 'snippets', output_name)
            snippet.export(output_path, format="wav")

In [28]:
######################## 
## RUN THIS ONLY ONCE ##
## THIS CREATES FILES ##
##   MANY MANY FILES  ##
########################

#export_audio_snippets()

## Check the audiosnippets

In [29]:
print(f'Now go and check your snippets folder here: {folder}snippets' )

Now go and check your snippets folder here: /Volumes/LaCie/laica_sounds/all_sounds/snippets
