In [1]:
from pathlib import Path
import re
from typing import List, Tuple

from colorama import Fore, Style
import pandas as pd

In [2]:
# Inputs: word-level and phoneme-level presentation annotations (CSV) and the
# story text as a single space-separated sequence of tokenizer tokens.
presentation_words_path = "./lw1.01.0.word.csv"
presentation_phonemes_path = "./lw1.01.0.phoneme.csv"
tokenized_path = "./lw1.tokenized.txt"
# Story identifier: checked against the `story` column of both annotation
# frames and used to select story-specific patches.
story_name = "lw1"
# Directory that receives the aligned word.csv / phoneme.csv outputs.
output_dir = "."

In [3]:
# Decode explicitly as UTF-8: the token file contains non-ASCII characters
# (e.g. the "Ġ" prefix carried by many tokens), so relying on the platform's
# default encoding would garble tokens on machines with a non-UTF-8 locale.
tokens = Path(tokenized_path).read_text(encoding="utf-8").split(" ")

In [4]:
# TODO: the em dash shows up below as the token 'ĠâĢĶ' instead of '—'. This looks
# like the tokenizer rendering the dash's UTF-8 bytes one symbol per byte
# (presumably -- confirm against the tokenizer); it should be mapped back to the
# real character upstream.
tokens[95:105]

['.',
 'ĠHarmon',
 "'s",
 'Ġface',
 'Ġslowly',
 'Ġanimated',
 'ĠâĢĶ',
 'Ġjoy',
 'Ġsweeping',
 'Ġin']

In [5]:
# Load both annotation tables the same way: the first CSV column is the row index.
words_df, phonemes_df = (
    pd.read_csv(annotation_path, index_col=0)
    for annotation_path in (presentation_words_path, presentation_phonemes_path)
)

In [6]:
# Inspect the raw word-level presentation annotations before any patching.
words_df

Unnamed: 0,story,story_uid,sound_id,kind,meg_file,start,sound,word,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample
0,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.000000,stimuli/audio/lw1_0.wav,Tara,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.30,697,23506
1,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.310000,stimuli/audio/lw1_0.wav,stood,0.0,sentence,1.0,205.0,Allison,1.0,23.816,0.24,698,23816
2,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.550000,stimuli/audio/lw1_0.wav,stock,0.0,sentence,2.0,205.0,Allison,1.0,24.056,0.37,699,24056
3,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,1.080000,stimuli/audio/lw1_0.wav,still,0.0,sentence,3.0,205.0,Allison,1.0,24.586,0.40,700,24586
4,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,1.630000,stimuli/audio/lw1_0.wav,waiting,0.0,sentence,4.0,205.0,Allison,1.0,25.136,0.41,701,25136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.070000,stimuli/audio/lw1_3.wav,end,52.0,sentence,15.0,205.0,Allison,1.0,361.097,0.17,3119,361097
664,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.250000,stimuli/audio/lw1_3.wav,for,52.0,sentence,16.0,205.0,Allison,1.0,361.277,0.14,3120,361277
665,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.460000,stimuli/audio/lw1_3.wav,project,52.0,sentence,18.0,205.0,Allison,1.0,361.487,0.58,3121,361487
666,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,51.179999,stimuli/audio/lw1_3.wav,and,52.0,sentence,19.0,205.0,Allison,1.0,362.207,0.15,3122,362207


In [7]:
def patch_story(words_df, story_name):
    """
    Filter and patch the word-level annotations of a single story.

    Rows whose `condition` is "word_list" or "pseudo_words" are removed, then
    story-specific corrections are applied to the `word` column.

    Args:
        words_df: word-level annotation frame with at least the columns
            `condition` and `word`.
        story_name: identifier of the story; selects which patches to apply.

    Returns:
        A new, independent DataFrame; the frame passed in is never modified.

    Raises:
        ValueError: if no patches are defined for `story_name`.
        AssertionError: if the row to be patched does not hold the expected typo.
    """
    # Ignore word list content and pseudowords.
    # TODO maybe also ignore following N?
    excluded_conditions = ["word_list", "pseudo_words"]
    # Take an explicit copy: the patches below assign into the filtered frame,
    # and writing into a filtered selection of the caller's frame triggers
    # pandas' SettingWithCopyWarning.
    words_df = words_df[~words_df.condition.isin(excluded_conditions)].copy()

    if story_name == "lw1":
        # Typo fix
        assert words_df.loc[571, "word"] == "It's"
        words_df.loc[571, "word"] = "Its"
    else:
        raise ValueError(f"Unknown story {story_name}.")

    return words_df

In [8]:
# Apply the filtering and story-specific patches, then inspect the result.
words_df = patch_story(words_df, story_name)
words_df

Unnamed: 0,story,story_uid,sound_id,kind,meg_file,start,sound,word,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample
0,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.000000,stimuli/audio/lw1_0.wav,Tara,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.30,697,23506
1,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.310000,stimuli/audio/lw1_0.wav,stood,0.0,sentence,1.0,205.0,Allison,1.0,23.816,0.24,698,23816
2,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.550000,stimuli/audio/lw1_0.wav,stock,0.0,sentence,2.0,205.0,Allison,1.0,24.056,0.37,699,24056
3,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,1.080000,stimuli/audio/lw1_0.wav,still,0.0,sentence,3.0,205.0,Allison,1.0,24.586,0.40,700,24586
4,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,1.630000,stimuli/audio/lw1_0.wav,waiting,0.0,sentence,4.0,205.0,Allison,1.0,25.136,0.41,701,25136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.070000,stimuli/audio/lw1_3.wav,end,52.0,sentence,15.0,205.0,Allison,1.0,361.097,0.17,3119,361097
664,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.250000,stimuli/audio/lw1_3.wav,for,52.0,sentence,16.0,205.0,Allison,1.0,361.277,0.14,3120,361277
665,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.460000,stimuli/audio/lw1_3.wav,project,52.0,sentence,18.0,205.0,Allison,1.0,361.487,0.58,3121,361487
666,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,51.179999,stimuli/audio/lw1_3.wav,and,52.0,sentence,19.0,205.0,Allison,1.0,362.207,0.15,3122,362207


In [9]:
# (story, sequence_id, word_index) is expected to act as a key for presented words.
# `duplicated` checks this directly on the columns instead of building one string
# per row with a Python-level `apply`.
word_key_columns = ["story", "sequence_id", "word_index"]
assert not words_df.duplicated(subset=word_key_columns).any(), \
    "(story, sequence_id, word_index) should identify a unique word in the presentation"

# Both annotation tables must describe the story being processed and nothing else.
assert (words_df.story == story_name).all(), \
    "Word annotations should only contain rows of the configured story"
assert (phonemes_df.story == story_name).all(), \
    "Phoneme annotations should only contain rows of the configured story"

In [10]:
# Matches tokens made up only of characters that are neither ASCII letters, digits,
# nor in the accented-Latin range À-ž. Such tokens are treated as punctuation and
# skipped over when advancing through the token stream.
# NB the ASCII letters are spelled `A-Za-z`: the shorthand `A-z` also spans the
# punctuation characters [ \ ] ^ _ ` that sit between `Z` and `a`, which would make
# a token such as "_" count as word-like.
only_punct_re = re.compile(r"^[^A-Za-zÀ-ž0-9]+$")


class Aligner:
    """
    Greedily align rows of a word-level annotation frame with a token sequence.

    Words (the `word` column of `words_df`, lowercased) and tokens (with the "Ġ"
    marker removed, lowercased) are walked left to right. A word is matched either
    by a single identical token or by a run of consecutive tokens that concatenate
    to the word. Tokens that do not match the current word are skipped, up to
    `max_skip_patience` consecutive skips; punctuation-only and empty tokens are
    stepped over without using up patience.

    Calling the instance runs the alignment and returns a list of
    `(word index label, token position, flags)` tuples, with one entry per token
    that was matched to a word.
    """

    flag_types = {
        "recap": 0,  # the word was repeated one or more times in the FA
    }

    def __init__(self, tokens, words_df,
                 max_skip_patience: int = 20):
        """
        Args:
            tokens: sequence of token strings, in text order.
            words_df: frame with one row per word and a string column `word`.
            max_skip_patience: number of consecutive unmatched tokens that may be
                skipped before the alignment is declared failed.
        """
        self.tokens = tokens
        self.words_df = words_df

        # Tracks alignment between indices in FA corpus (original, prior to filtering)
        # and indices in tokens_flat. Third element indicates various metadata about
        # alignment (see `flag_types`); it is currently always None because nothing
        # in this class passes flags to `stage`.
        self.alignment: List[Tuple[int, int, int]] = []
        # Track current transaction for modifying alignment
        self.transaction = []

        self.tok_cursor = 0
        self.word_cursor = 0
        # Cursor snapshot taken by `start_transaction`; initialised here so that
        # `drop_transaction` is safe even if no transaction was ever started.
        self._orig_tok_cursor = 0
        self._orig_word_cursor = 0

        self.max_skip_patience = max_skip_patience
        self.skip_patience = self.max_skip_patience

    def process_token(self, token):
        """Normalise a raw token for comparison: drop the "Ġ" marker and lowercase."""
        return token.replace("Ġ", "").lower()

    @property
    def tok_el(self) -> str:
        """Normalised text of the token under the token cursor."""
        return self.process_token(self.tokens[self.tok_cursor])

    @property
    def word_row(self) -> pd.Series:
        """Row of `words_df` under the word cursor (positional lookup)."""
        return self.words_df.iloc[self.word_cursor]

    @property
    def word_el(self) -> str:
        """Lowercased text of the word under the word cursor."""
        return self.word_row.word.lower()

    @property
    def word_index(self) -> int:
        """Index label (not position) of the word under the word cursor."""
        return self.words_df.index[self.word_cursor]

    def advance(self, first_delta=1):
        """
        Move the token cursor forward to the next word-like token.

        The first step moves by `first_delta`; afterwards the cursor keeps moving
        one token at a time while it rests on a punctuation-only or empty token.
        The cursor never moves past the last token: if it is already there, this
        is a no-op.
        """
        next_token = None
        # `not next_token` is true both on the first pass (None) and for tokens
        # that are empty after normalisation (e.g. produced by repeated spaces).
        while self.tok_cursor + 1 < len(self.tokens) and \
            (not next_token or only_punct_re.match(next_token)):
            self.tok_cursor += first_delta if next_token is None else 1

            next_token = self.tok_el

    def start_transaction(self):
        """Snapshot both cursors so that a failed multi-token match can be undone."""
        self._orig_tok_cursor = self.tok_cursor
        self._orig_word_cursor = self.word_cursor

    def commit_transaction(self):
        """Append the staged pairs to the alignment and reset skip patience."""
        self.skip_patience = self.max_skip_patience
        self.alignment += self.transaction
        self.transaction = []

    def drop_transaction(self):
        """Discard the staged pairs and restore the cursors from the snapshot."""
        self.skip_patience = self.max_skip_patience
        self.tok_cursor = self._orig_tok_cursor
        self.word_cursor = self._orig_word_cursor
        self.transaction = []

    def stage(self, flags=None, do_advance=True):
        """
        Stage the (current word, current token) pair in the open transaction.

        Args:
            flags: metadata stored as the third element of the alignment tuple.
            do_advance: when true, move the token cursor on afterwards.

        Raises:
            StopIteration: if advancing the cursor raises IndexError.
        """
        print(f"{self.word_el} -- {self.tok_el}")
        self.transaction.append((self.word_index, self.tok_cursor, flags))

        # Reset skip patience
        self.skip_patience = self.max_skip_patience

        # Advance cursor
        if do_advance:
            try:
                self.advance()
            except IndexError:
                raise StopIteration

    def attempt_match(self) -> bool:
        """
        Try to match the current word against the token(s) at the token cursor.

        Returns:
            True if the word was fully covered by one token or by a run of
            consecutive tokens (the pairs are committed to `alignment`); False
            otherwise, in which case both cursors are where they were on entry.
        """
        fa_el = self.word_el
        tok_el = self.tok_el

        # An empty token carries no text. Without this guard `startswith("")`
        # would accept it as a prefix of every word.
        if not tok_el:
            return False

        if fa_el == tok_el:
            self.start_transaction()
            self.stage()
            self.commit_transaction()
            return True
        elif fa_el.startswith(tok_el):
            self.start_transaction()
            while self.tok_el and fa_el.startswith(self.tok_el):
                fa_el = fa_el[len(self.tok_el):]
                staged_cursor = self.tok_cursor
                self.stage()
                if self.tok_cursor == staged_cursor:
                    # The cursor could not move, i.e. the last token was just
                    # staged. Stop so it is not consumed a second time.
                    break

            if len(fa_el) > 0:
                self.err(f"Residual FA el {fa_el} not covered by token {self.tok_el}. Drop transaction.")
                print(self.skip_patience)
                self.drop_transaction()
                return False
            else:
                self.commit_transaction()
                return True
        else:
            return False

    def err(self, msg):
        """Print `msg` in red followed by the words and tokens around both cursors."""
        print(f"{Fore.RED}{msg}{Style.RESET_ALL}")
        # Clamp the window start at 0: a negative start would be interpreted
        # relative to the end of the sequence and yield an empty window whenever
        # a cursor is within five positions of the beginning.
        word_start = max(self.word_cursor - 5, 0)
        tok_start = max(self.tok_cursor - 5, 0)
        print(self.words_df.iloc[word_start : self.word_cursor + 5])
        print(self.tokens[tok_start : self.tok_cursor + 5])

    def _tokens_exhausted(self) -> bool:
        """
        Report whether no unconsumed token is left under the token cursor.

        This is the case when the cursor is out of range (empty token list) or when
        it still rests on the token of the most recently committed alignment pair,
        which happens only once `advance` can no longer move.
        """
        if self.tok_cursor >= len(self.tokens):
            return True
        return bool(self.alignment) and self.alignment[-1][1] == self.tok_cursor

    def __call__(self):
        """
        Run the alignment from scratch.

        Returns:
            The list of `(word index label, token position, flags)` tuples if every
            word was aligned. If the alignment fails (skip patience used up, or the
            tokens run out before the words do), diagnostics are printed and None
            is returned.
        """
        # Reset state
        self.tok_cursor = 0
        self.word_cursor = 0
        self.skip_patience = self.max_skip_patience
        self.alignment = []
        self.transaction = []

        while True:
            if self.word_cursor >= len(self.words_df):
                return self.alignment

            if self._tokens_exhausted():
                # Words remain but every token has been used. Matching again
                # would pair the already-consumed last token with another word.
                self.err("Failed to find alignment. Stop.")
                break

            result = self.attempt_match()
            if result:
                self.word_cursor += 1
            elif self.skip_patience > 0 and self.tok_cursor + 1 < len(self.tokens):
                # Try skipping this token and see if we find success in the near future.
                print(f"{Fore.YELLOW}Skipping token {self.tok_el}, didn't match with {self.word_el}{Style.RESET_ALL}")
                self.advance()
                self.skip_patience -= 1
            else:
                # Out of patience, or nothing left to skip to. The second condition
                # matters because a failed partial match on the last token resets
                # the patience on every pass, which would otherwise loop forever.
                self.err("Failed to find alignment. Stop.")
                break

        return None

In [11]:
alignment = Aligner(tokens, words_df, max_skip_patience=30)()
# The aligner returns None when it gives up. `pd.DataFrame(None, ...)` would turn
# that into an empty frame and every later cell would silently produce empty
# output, so fail loudly here instead.
if alignment is None:
    raise RuntimeError(
        "Aligner stopped before every word was matched to a token; "
        "see the diagnostics printed above."
    )
alignment = pd.DataFrame(alignment, columns=["word_idx", "token_idx", "flags"])

tara -- t
tara -- ara
stood -- stood
stock -- stock
still -- still
waiting -- waiting
for -- for
the -- the
first -- first
tiny -- tiny
gleam -- gle
gleam -- am
from -- from
the -- the
scout -- scout
craft -- craft
to -- to
appear -- appear
in -- in
the -- the
darkness -- darkness
of -- of
the -- the
[33mSkipping token worm, didn't match with the[0m
[33mSkipping token hole, didn't match with the[0m
the -- the
gentle -- gentle
constant -- constant
breeze -- breeze
of -- of
recycled -- recycled
air -- air
from -- from
the -- the
vent -- vent
above -- above
blew -- blew
an -- an
annoying -- annoying
hair -- hair
against -- against
her -- her
nose -- nose
but -- but
she -- she
ignored -- ignored
it -- it
a -- a
gasp -- gasp
from -- from
the -- the
psychic -- psychic
broke -- broke
her -- her
silent -- silent
vigil -- vigil
and -- and
she -- she
turned -- turned
results -- results
harmon -- harmon
she -- she
suppressed -- suppressed
the -- the
surge -- surge
of -- of
annoyance -- annoya

In [12]:
# Attach the aligned token index to every word row.
# NB this is an inner join: a word matched to several tokens is repeated once per
# token, and only words present in `alignment` are kept.
words_df = (
    words_df
    .rename_axis("word_idx")
    .reset_index()
    .merge(alignment.drop(columns=["flags"]), on="word_idx")
)
words_df

Unnamed: 0,word_idx,story,story_uid,sound_id,kind,meg_file,start,sound,word,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,token_idx
0,0,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.000000,stimuli/audio/lw1_0.wav,Tara,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.30,697,23506,0
1,0,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.000000,stimuli/audio/lw1_0.wav,Tara,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.30,697,23506,1
2,1,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.310000,stimuli/audio/lw1_0.wav,stood,0.0,sentence,1.0,205.0,Allison,1.0,23.816,0.24,698,23816,2
3,2,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.550000,stimuli/audio/lw1_0.wav,stock,0.0,sentence,2.0,205.0,Allison,1.0,24.056,0.37,699,24056,3
4,3,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,1.080000,stimuli/audio/lw1_0.wav,still,0.0,sentence,3.0,205.0,Allison,1.0,24.586,0.40,700,24586,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,663,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.070000,stimuli/audio/lw1_3.wav,end,52.0,sentence,15.0,205.0,Allison,1.0,361.097,0.17,3119,361097,918
635,664,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.250000,stimuli/audio/lw1_3.wav,for,52.0,sentence,16.0,205.0,Allison,1.0,361.277,0.14,3120,361277,919
636,665,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.460000,stimuli/audio/lw1_3.wav,project,52.0,sentence,18.0,205.0,Allison,1.0,361.487,0.58,3121,361487,921
637,666,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,51.179999,stimuli/audio/lw1_3.wav,and,52.0,sentence,19.0,205.0,Allison,1.0,362.207,0.15,3122,362207,923


In [13]:
# Asof merge: each phoneme is joined to the word with the most recent onset at or
# before the phoneme's own onset. `end_word` (word onset + duration) is carried
# along so that phonemes falling after the end of that word can be identified.
# NB the merged token idx will be the last subword token of the corresponding word
phonemes_df = pd.merge_asof(
    left=phonemes_df,
    right=words_df[["onset", "duration", "token_idx", "word_idx"]].assign(
        end_word=lambda word_spans: word_spans.onset + word_spans.duration
    ),
    on="onset",
    direction="backward",
    suffixes=("", "_word"),
)

In [17]:
# But this means phonemes of words unassociated with elements of words_df will
# be matched with onsets of the most recent word that is indeed in words_df.
# That's no good. Find these and drop.
# Phonemes that start before the first word onset get no match at all: their
# `end_word` is NaN, for which the `>=` comparison is False, so they have to be
# flagged explicitly or they would survive with a missing word_idx.
phonemes_to_drop = phonemes_df.end_word.isna() | (phonemes_df.onset >= phonemes_df.end_word)
print(f"Dropping {phonemes_to_drop.sum()} phoneme instances from words not matched with tokens")

phonemes_df = phonemes_df[~phonemes_to_drop]
print(f"{len(phonemes_df)} phonemes remain.")

Dropping 255 phoneme instances from words not matched with tokens
2207 phonemes remain.


In [18]:
# Verify there is a reasonable word length distribution: number of phoneme rows
# per aligned word, sorted from shortest to longest.
phonemes_df.groupby("word_idx").size().sort_values()

word_idx
253     1
602     1
43      1
212     1
561     1
       ..
359    10
88     12
438    12
69     12
508    16
Length: 613, dtype: int64

In [19]:
# Keep only the words that still have at least one phoneme row attached.
words_df = words_df.loc[words_df["word_idx"].isin(set(phonemes_df["word_idx"]))]

In [20]:
# Final consistency check before export: every word id must appear in both tables.
assert set(words_df.word_idx) == set(phonemes_df.word_idx), \
    "Word and phoneme level annotations should cover the same set of word IDs"

In [21]:
# Create the output directory on demand so that pointing `output_dir` at a new
# location does not fail at the very last step of the notebook.
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)

words_df.to_csv(output_path / "word.csv")
phonemes_df.to_csv(output_path / "phoneme.csv")