# Processing for White Hmong research project
Macalester College, updated 08/06/2025 by Henry Heyden

In [1]:
import parselmouth
import tgt # TextGridTools, helps parsing text grids https://github.com/hbuschme/TextGridTools
import pandas as pd
from pandas import isnull
from numpy import nan
from math import log
from functools import cache

# Table of reference values used for semitone normalization, loaded once from a CSV
# that sits next to this notebook. get_ref selects rows through the 'Speaker' column
# and reads columns named '<method>F0' / '<method>F1' (e.g. 'AvgF0', 'AvgF1').
SEMITONE_REFERENCES = pd.read_csv('./hmong_semitone_references.csv')

In [None]:
# This is the newest code block; it adds the semitone normalization function and its helpers.

def normalize_semitones(value: float, ref: float) -> float:
    """
    Express a value in Hz as a semitone distance from a reference in Hz, following Zhang 2018.

    The result is (12 / log10(2)) * log10(value / ref): 0 when value equals ref,
    positive when value is above ref, and negative when it is below.

    :param value: the measurement to normalize, in Hz
    :param ref: the reference the measurement is compared against, in Hz
    :returns: the distance between value and ref in semitones
    """
    ratio = value / ref
    # Converts a base-10 logarithm of the ratio into semitones (12 per doubling).
    semitones_per_log10_unit = 12 / log(2, 10)
    return semitones_per_log10_unit * log(ratio, 10)

@cache
def get_ref(speaker: str, method: str, formantSwitch: bool) -> float:
    """
    Look up the reference value that normalize_semitones should use for one speaker.

    Results are memoized per (speaker, method, formantSwitch) combination, so the
    reference table is only searched once for each combination.

    :param speaker: value to match against the 'Speaker' column of SEMITONE_REFERENCES
    :param method: prefix of the reference column (e.g. Avg)
    :param formantSwitch: True to read the '<method>F1' column, False to read '<method>F0'
    :returns: the reference value from the selected column; if several rows match the speaker, the last one is used
    """
    # Column names are the method followed by F1 or F0, e.g. 'AvgF1'.
    column = f'{method}F{1 if formantSwitch else 0}'
    speaker_rows = SEMITONE_REFERENCES.loc[SEMITONE_REFERENCES['Speaker'] == speaker]
    return speaker_rows.iloc[-1][column]

In [3]:
def textPitch(file):
    """
    Load the recording and the annotation for one file and build the analysis objects used for processing.

    The audio is read from ./wav/{file}.wav and the annotation from ./textgrid/{file}.TextGrid,
    so the .wav and .TextGrid files have to share the same base name.

    :param file: base name of the recording, without directory or extension
    :returns: tgt TextGrid object, parselmouth Pitch object, and parselmouth Formant object, in that order
    """
    wav_path = f"./wav/{file}.wav"
    textgrid_path = f"./textgrid/{file}.TextGrid"

    sound = parselmouth.Sound(wav_path) # Parselmouth Sound object
    annotation = tgt.read_textgrid(textgrid_path) # tgt TextGrid object

    # TODO consider changing the default variables here, e.g. octave jump cost
    pitch_track = sound.to_pitch_cc(
        time_step = 0.001,
        pitch_floor = 50,
        pitch_ceiling = 400,
        very_accurate = True,
    )
    formant_track = sound.to_formant_burg()
    return annotation, pitch_track, formant_track

In [4]:
"""
This cell defines the two methods we use to sample F0 and F1 from a target token.
Given a start time, a duration, and a pitch/formant object, each method returns a list of num_points (default: 10) values.
"""

def samplePoints(start_time, duration, sourceObject, ref, formantSwitch = False, num_points = 10):
    """
    This is the method that just takes the values at num_points (default: 10) equidistant points
    and normalizes each of them to semitones.

    :param start_time: time at which the token starts
    :param duration: length of the token
    :param sourceObject: object queried through get_value_at_time; called with (time) when sampling pitch and with (1, time) when sampling the first formant
    :param ref: reference value handed to normalize_semitones
    :param formantSwitch: True to sample the first formant, False to sample pitch
    :param num_points: how many samples to take
    :returns: list of num_points normalized values; the first is taken at start_time and, when num_points > 1, the last at start_time + duration
    """
    # num_points equidistant points bound num_points - 1 equal subintervals (the points are
    # the subinterval edges, including the beginning and the end). A single point has no
    # subinterval, which used to divide by zero; it is now simply taken at start_time.
    if num_points > 1:
        subinterval_duration = duration / (num_points - 1)
    else:
        subinterval_duration = 0.0

    samples = []
    for i in range(num_points):
        sample_time = start_time + (i * subinterval_duration)
        if formantSwitch:
            next_sample = sourceObject.get_value_at_time(1, sample_time)
        else:
            next_sample = sourceObject.get_value_at_time(sample_time)
        # Unlike sampleInterval, values are not filtered here before normalizing.
        samples.append(normalize_semitones(next_sample, ref))
    return samples



def sampleAveragedIntervals(start_time, duration, sourceObject, ref, formantSwitch = False, num_points = 10):
    """
    This is the method that cuts the token into num_points (default: 10) equal, back-to-back
    windows and reports one averaged value per window.

    :param start_time: time at which the token starts
    :param duration: length of the token
    :param sourceObject: pitch or formant object forwarded to sampleInterval
    :param ref: reference value forwarded to sampleInterval
    :param formantSwitch: True to sample the first formant, False to sample pitch
    :param num_points: how many windows to average over
    :returns: list with one sampleInterval result per window, in time order
    """
    window = duration / num_points
    return [
        sampleInterval(
            start_time + (k * window),
            start_time + ((k + 1) * window),
            sourceObject,
            ref,
            formantSwitch,
        )
        for k in range(num_points)
    ]

def sampleInterval(start_time, end_time, sourceObject, ref, formant_switch):
    """
    Helper for sampleAveragedIntervals: averages the normalized values found at 1-millisecond steps over the given interval.

    :param start_time: time of the first probe
    :param end_time: probing stops once the probe time reaches this value (exclusive)
    :param sourceObject: object queried through get_value_at_time; called with (time) for pitch and with (1, time) for the first formant
    :param ref: reference value handed to normalize_semitones
    :param formant_switch: True to sample the first formant, False to sample pitch
    :returns: mean of the normalized non-null samples, or nan when the interval yields none
    """
    normalized = []
    probe_time = start_time
    while probe_time < end_time:
        if formant_switch:
            raw_value = sourceObject.get_value_at_time(1, probe_time)
        else:
            raw_value = sourceObject.get_value_at_time(probe_time)

        #! as of right now, null (voiceless) values are *ignored* rather than counted. could change that!
        if not isnull(raw_value):
            normalized.append(normalize_semitones(raw_value, ref))

        probe_time += 0.001

    # Nothing usable in the interval: report nan rather than dividing by zero.
    if not normalized:
        return nan
    return sum(normalized) / len(normalized)



def createDataPoints(start_time, duration, sourceObject, ref: float, formantSwitch = False, num_points = 10, method = "samplePoints"):
    """
    Run the sampling method selected by name and return its list of values.

    :param start_time: time at which the token starts
    :param duration: length of the token
    :param sourceObject: pitch or formant object forwarded to the sampling method
    :param ref: reference value forwarded to the sampling method
    :param formantSwitch: True to sample the first formant, False to sample pitch
    :param num_points: how many values the sampling method should produce
    :param method: either "samplePoints" or "sampleAveragedIntervals"
    :returns: the selected method's result; for any other method name a message is printed and a list of num_points zeros is returned
    """
    if method == "samplePoints":
        return samplePoints(start_time, duration, sourceObject, ref, formantSwitch, num_points)
    if method == "sampleAveragedIntervals":
        return sampleAveragedIntervals(start_time, duration, sourceObject, ref, formantSwitch, num_points)

    # Unknown method name: report it and hand back placeholder zeros.
    print('Invalid method :/')
    return [0] * num_points
        

In [5]:
# Function to arrange the information into a data line
def arrangeDataLine(speaker_info, tone, interval_text, previous_tone, f0_values, f1_values):
    """
    Build one output row (without its token_id) for a single token.

    :param speaker_info: size-2 tuple of strings containing (file, speaker)
    :param tone: the target tone the token was selected for
    :param interval_text: TextGrid text of the token; the pieces are separated by '_', with the word first and any phrase information (e.g. ui) after it
    :param previous_tone: tone of the preceding token, or None
    :param f0_values: list of sampled F0 values
    :param f1_values: list of sampled F1 values
    :returns: [filename, speaker, tone, interval_text, ui, uf, p, df, previous_tone, f0_values, f1_values], where ui/uf/p/df are 1 if that label is one of the '_'-separated pieces and 0 otherwise
    """
    filename, speaker = speaker_info

    # Every '_'-separated piece is checked, the word in position 0 included.
    pieces = interval_text.split('_')

    # Phrase info encoded as binary 0/1, in the column order ui, uf, p, df.
    phrase_flags = [int(label in pieces) for label in ('ui', 'uf', 'p', 'df')]

    # Column order matches the headers written in the write-to-csv cell (minus token_id).
    return [filename, speaker, tone, interval_text, *phrase_flags, previous_tone, f0_values, f1_values]

In [6]:
def processFile(text, pitch, formants, tone, speaker_info, method = 'samplePoints'):
    """
    Walk through the first tier of a TextGrid and build one data line for every token that carries the target tone.

    A token's tone is the last letter of its word (the text before the first '_') when that
    letter is one of b, s, j, v, m, g, d, and '0' otherwise. The previous tone is tracked
    across all tokens and cleared whenever a token is labelled 'ui'.

    :param text: tgt TextGrid object; only text.tiers[0] is read
    :param pitch: parselmouth Pitch object used for the F0 samples
    :param formants: parselmouth Formant object used for the F1 samples
    :param tone: the target tone, '0' or one of the tone letters
    :param speaker_info: is a size-2 tuple of strings containing (file, speaker), TO BE EXPANDED LATER WITH: age, gender, etc.
    :param method: either samplePoints or sampleAveragedIntervals. thats the sampling method over the f0 and f1 metrics
    :returns: list of data lines as produced by arrangeDataLine, in tier order
    """
    tone_letters = ('b', 's', 'j', 'v', 'm', 'g', 'd')
    rows = []
    tone_before = None

    for token in text.tiers[0]:
        label = token.text
        label_parts = label.split('_')

        # The tone is read off the final letter of the word; anything else counts as '0'.
        final_letter = label_parts[0][-1:]
        token_tone = final_letter if final_letter in tone_letters else '0'

        # A 'ui' label clears the previous tone before this token is recorded.
        if 'ui' in label_parts:
            tone_before = None

        if token_tone == tone:
            speaker = speaker_info[1]
            onset = token.start_time
            length = token.duration()

            f0_ref = get_ref(speaker, 'Avg', False)
            f0_vals = createDataPoints(onset, length, pitch, f0_ref, method = method)
            f1_ref = get_ref(speaker, 'Avg', True)
            f1_vals = createDataPoints(onset, length, formants, f1_ref, formantSwitch = True, method = method)

            rows.append(arrangeDataLine(speaker_info, tone, label, tone_before, f0_vals, f1_vals))

        # Every token, matching or not, becomes the "previous tone" of the next one.
        tone_before = token_tone

    return rows

## The actual code that runs the important process is right here:

In [None]:
# Code that works through every given file for each speaker and story

tone = 'v' # Target tone, '0' for mid-level, otherwise use orthographic representation
# speakers = ['Cha', 'Chingla', 'Ellina', 'Gozong', 'Long', 'MaiXee', 'MaiXor', 'Ma']
speakers = ['Cha']
stories = ['1', '2', '3']

# TODO in this section, we can construct lists of sociological factors
# As of right now, I'm not sure where to get that information
age = []
gender = []

# Rows for the target tone across all speakers and stories; token_id numbers them in order.
data = []
token_id = 0
for speaker in speakers:
    for story in stories:
        # File base names follow the pattern <speaker>Story<story>, e.g. ChaStory1.
        file = f'{speaker}Story{story}'

        text, pitch, formants = textPitch(file)

        # See write to csv section for data headers :)
        speaker_info = (file, speaker)
        for fileLine in processFile(text, pitch, formants, tone, speaker_info, method = 'sampleAveragedIntervals'):
            # Each stored row is the running token_id followed by the fields from processFile.
            data.append([token_id, *fileLine])
            token_id += 1

# Write to csv

In [None]:
import csv
def hmongCSV(data, tone):
    """
    Write the collected rows for one tone to hmongData-{tone}-ST_AVG.csv in the current working directory.

    The file is opened in 'w' mode, so an existing file with the same name is overwritten.
    It is written as UTF-8 so that rows containing non-ASCII text produce the same file on
    every platform, instead of depending on the platform's default encoding.

    :param data: iterable of rows, each laid out in the order of the header row written below
    :param tone: the tone the rows belong to; only used to build the file name
    """
    with open(f'hmongData-{tone}-ST_AVG.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)

        # Headers: token_id, FileName, Speaker, tone, TextGrid text (w/ phrase info e.g. _ui), ui?, uf?, p?, df?, PreviousTone, F0_values (list), F1_values (list)
        writer.writerow(['token_id', 'Filename', 'Speaker', 'Tone', 'Word', 'ui', 'uf', 'p', 'df', 'PreviousTone', 'F0_values', 'F1_values'])
        writer.writerows(data)

# Write the rows currently held in `data` to the CSV file for `tone`.
hmongCSV(data, tone)

This block processes all tones in bulk. It is NOT the most efficient approach (every file is loaded and analyzed again for each tone), but it fits well with the single-tone workflow above.

In [None]:
tones = ['b', 's', 'j', 'v', 'm', 'g', 'd', '0']

# Code that works through every given file for each speaker and story.
# The speaker and story lists do not depend on the tone, so they are set up once here.
speakers = ['Cha', 'Chingla', 'Ellina', 'Gozong', 'Long', 'MaiXee', 'MaiXor', 'Ma']
stories = ['1', '2', '3']

# TODO in this section, we can construct lists of sociological factors
# As of right now, I'm not sure where to get that information
age = []
gender = []

for tone in tones:

    # Each tone gets its own rows, its own token numbering starting at 0, and its own CSV file.
    data = []
    token_id = 0
    for speaker in speakers:
        print(f'Doing speaker {speaker} for tone {tone}')
        for story in stories:
            file = f'{speaker}Story{story}'

            # NOTE: textPitch runs again for every tone, so each file is analyzed len(tones) times.
            text, pitch, formants = textPitch(file)

            # See write to csv section for data headers :)
            speaker_info = (file, speaker)
            for fileLine in processFile(text, pitch, formants, tone, speaker_info):
                # Each stored row is the running token_id followed by the fields from processFile.
                data.append([token_id, *fileLine])
                token_id += 1

    hmongCSV(data = data, tone = tone)

Doing speaker Cha for tone b
Doing speaker Chingla for tone b
Doing speaker Ellina for tone b
Doing speaker Gozong for tone b
Doing speaker Long for tone b
Doing speaker MaiXee for tone b
Doing speaker MaiXor for tone b
Doing speaker Ma for tone b
Doing speaker Cha for tone s
Doing speaker Chingla for tone s
Doing speaker Ellina for tone s
Doing speaker Gozong for tone s
Doing speaker Long for tone s
Doing speaker MaiXee for tone s
Doing speaker MaiXor for tone s
Doing speaker Ma for tone s
Doing speaker Cha for tone j
Doing speaker Chingla for tone j
Doing speaker Ellina for tone j
Doing speaker Gozong for tone j
Doing speaker Long for tone j
Doing speaker MaiXee for tone j
Doing speaker MaiXor for tone j
Doing speaker Ma for tone j
Doing speaker Cha for tone v
Doing speaker Chingla for tone v
Doing speaker Ellina for tone v
Doing speaker Gozong for tone v
Doing speaker Long for tone v
Doing speaker MaiXee for tone v
Doing speaker MaiXor for tone v
Doing speaker Ma for tone v
Doing sp