In [1]:
# Symbol-to-ID table. Multi-character keys (e.g. 'uː', '<unk>') are single
# tokens; IDs run from 0 to 49 in the order listed.
vocab = {
    'ɖ': 0, 'ɭ': 1, 'ɳ': 2, 'ʂ': 3, 'ʈ': 4,
    'a': 5, 'ɑː': 6, 'b': 7, 'd': 8, 'e': 9,
    'ə': 10, 'eː': 11, 'f': 12, 'ɡ': 13, 'h': 14,
    'ɪ': 15, 'iː': 16, 'j': 17, 'k': 18, 'l': 19,
    'm': 20, 'n': 21, 'ŋ': 22, 'ʊ': 23, 'uː': 24,
    'p': 25, 'r': 26, 's': 27, 'ɧ': 28, 't': 29,
    'ɕ': 30, 'ɵ': 31, 'ʉː': 32, 'v': 33, 'ʏ': 34,
    'yː': 35, 'ɛ': 36, 'æː': 37, 'æ': 38, 'ɛː': 39,
    'œ': 40, 'œ̞ː': 41, 'œ̞': 42, 'øː': 43, 'ɔ': 44,
    'oː': 45, '|': 46, '<v>': 47, '~': 48, '<unk>': 49,
}

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel

In [15]:
class LongestMatchTokenizer:
    """Greedy longest-match-first tokenizer over a fixed vocabulary.

    At each position the longest vocabulary entry that prefixes the remaining
    text is consumed; if no entry matches, a single character is consumed and
    reported as the unknown token.
    """

    def __init__(self, vocab, unk_token="<unk>"):
        """Store the vocabulary and resolve the unknown-token ID.

        Args:
            vocab: Dictionary mapping token string -> integer ID.
            unk_token: Token reported for characters that match no entry.
        """
        self.vocab = vocab  # Dictionary: token -> ID
        self.unk_token = unk_token  # Handle unknown tokens
        # If unk_token has no entry in vocab, fall back to len(vocab) as its ID.
        self.unk_id = vocab.get(unk_token, len(vocab))

    def encode(self, text):
        """Tokenize text using longest-match-first approach.

        Args:
            text: String to tokenize.

        Returns:
            A ``(tokenized_str, token_ids)`` tuple: the matched tokens joined
            by single spaces, and the list of their IDs in the same order.
        """
        # No candidate longer than the longest vocab key can ever match, so
        # cap the search window at that length instead of scanning to the end
        # of the text at every position. Recomputed per call so later changes
        # to self.vocab are respected.
        max_token_len = max(
            (len(token) for token in self.vocab if isinstance(token, str)),
            default=0,
        )

        tokens = []
        text_len = len(text)
        i = 0
        while i < text_len:
            match = None

            # Try candidates from longest to shortest; the first hit wins.
            for j in range(min(text_len, i + max_token_len), i, -1):
                sub = text[i:j]
                if sub in self.vocab:
                    match = sub
                    break  # Longest match found

            if match is not None:
                tokens.append((match, self.vocab[match]))
                i += len(match)  # Move forward in the text
            else:
                tokens.append((self.unk_token, self.unk_id))
                i += 1  # Move forward by 1 character if no match

        tokenized_str = " ".join(token for token, _ in tokens)
        token_ids = [token_id for _, token_id in tokens]

        return tokenized_str, token_ids

In [16]:
# Build the tokenizer over `vocab`; characters matching no entry are reported as "<unk>".
tokenizer = LongestMatchTokenizer(vocab=vocab, unk_token="<unk>")

In [17]:
# Sample transcription to tokenize.
text = "apstrakʂuːn"

In [19]:
# encode() returns a tuple: (space-joined tokens, list of token IDs).
tokenizer.encode(text)

('a p s t r a k ʂ uː n', [5, 25, 27, 29, 26, 5, 18, 3, 24, 21])

In [13]:
# Second sample, with space-separated groups. ' ' is not a key in `vocab`,
# so LongestMatchTokenizer.encode reports each space as the unknown token.
text = "ɖɭɳʂʈ aɑː bd eə eːfɡ hɪiː j"

In [20]:
%pip install panphon

Collecting panphon
  Downloading panphon-0.21.2-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 5.8 MB/s eta 0:00:01
Collecting unicodecsv
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
Collecting munkres
  Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Collecting editdistance
  Downloading editdistance-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl (80 kB)
[K     |████████████████████████████████| 80 kB 11.9 MB/s eta 0:00:01
Building wheels for collected packages: unicodecsv
  Building wheel for unicodecsv (setup.py) ... [?25ldone
[?25h  Created wheel for unicodecsv: filename=unicodecsv-0.14.1-py3-none-any.whl size=10768 sha256=b2ba466b3ff3319ff1e7e73cb98bd7ed6722ddd96a30cff6482edd296917169d
  Stored in directory: /Users/joregan/Library/Caches/pip/wheels/9c/ea/66/8e45247b09052a933eb1a680b7c64802298faba58aac9b346b
Successfully built unicodecsv
Installing collected packages: unicodecsv, munkres, editdistance, panphon
Successfully installed editdistanc

In [21]:
# Sample transcription for the panphon feature lookup.
sample = 'ɛːɡandə'

In [22]:
from panphon.segment import Segment

In [24]:
import panphon
# Feature table whose word_fts() method is used for the lookups in later cells.
ft = panphon.FeatureTable()

In [25]:
# word_fts() returns a list of Segment feature bundles for the string.
ft.word_fts(sample)

[<Segment [+syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, 0ant, -cor, 0distr, -lab, -hi, -lo, -back, -round, -velaric, -tense, +long, 0hitone, 0hireg]>,
 <Segment [-syl, -son, +cons, -cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, -ant, -cor, 0distr, -lab, +hi, -lo, +back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>,
 <Segment [+syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, 0ant, -cor, 0distr, -lab, -hi, +lo, +back, -round, -velaric, +tense, -long, 0hitone, 0hireg]>,
 <Segment [-syl, +son, +cons, -cont, -delrel, -lat, +nas, -strid, +voi, -sg, -cg, +ant, +cor, -distr, -lab, -hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>,
 <Segment [-syl, -son, +cons, -cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, +ant, +cor, -distr, -lab, -hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>,
 <Segment [+syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, 0ant, -cor, 0distr, -lab, -hi, -l

In [27]:
# Same lookup for a word containing the diacritic-marked long vowel 'œ̞ː'.
ft.word_fts("fœ̞ːrjʊʈ")

[<Segment [-syl, -son, +cons, +cont, -delrel, -lat, -nas, +strid, -voi, -sg, -cg, +ant, -cor, 0distr, +lab, -hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>,
 <Segment [+syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, 0ant, -cor, 0distr, -lab, -hi, -lo, -back, +round, -velaric, -tense, -long, 0hitone, 0hireg]>,
 <Segment [-syl, +son, +cons, +cont, 0delrel, -lat, -nas, -strid, +voi, -sg, -cg, +ant, +cor, -distr, -lab, 0hi, 0lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>,
 <Segment [-syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, -ant, -cor, 0distr, -lab, +hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>,
 <Segment [+syl, +son, -cons, +cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, 0ant, -cor, 0distr, -lab, +hi, -lo, +back, +round, -velaric, -tense, -long, 0hitone, 0hireg]>,
 <Segment [-syl, -son, +cons, -cont, -delrel, -lat, -nas, -strid, -voi, -sg, -cg, -ant, +cor, -distr, -lab, -hi, -l

In [33]:
# Lookup for a two-symbol string; the output lists one Segment per symbol.
ft.word_fts("db")

[<Segment [-syl, -son, +cons, -cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, +ant, +cor, -distr, -lab, -hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>,
 <Segment [-syl, -son, +cons, -cont, -delrel, -lat, -nas, -strid, +voi, -sg, -cg, +ant, -cor, 0distr, +lab, -hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>]

In [34]:
# Lookup for 'ɧ' and 'ʂ'; the output lists one Segment per symbol.
ft.word_fts("ɧʂ")

[<Segment [-syl, -son, +cons, +cont, +delrel, -lat, -nas, -strid, -voi, -sg, -cg, -ant, +cor, +distr, -lab, +hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>,
 <Segment [-syl, -son, +cons, +cont, -delrel, -lat, -nas, +strid, -voi, -sg, -cg, -ant, +cor, -distr, -lab, -hi, -lo, -back, -round, -velaric, 0tense, -long, 0hitone, 0hireg]>]