# Word Segmentation on Brent with a Transition Probability Model

Herman Kamper, 2021

## Preliminaries

In [33]:
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
from wordseg.algos import tp
import numpy as np
import sys

sys.path.append("..")

from utils import eval_segmentation

## Utility functions

In [34]:
def get_segmented_sentence(ids, boundaries, id_to_symbol, join_char=""):
    """
    Render a sequence of symbol IDs as a space-separated string of words.

    Parameters
    ----------
    ids : sequence
        Symbol IDs, indexed in parallel with `boundaries`.
    boundaries : iterable of bool
        A true value at position `i` closes the current word after symbol `i`.
    id_to_symbol : mapping or sequence
        Indexed with each ID to obtain the symbol string.
    join_char : str
        Separator placed between the symbols inside a single word.

    Returns
    -------
    str
        The words separated by single spaces, with surrounding whitespace
        stripped. Symbols following the last true boundary form a final word
        instead of being dropped.
    """
    words = []
    cur_word = []
    for i_symbol, boundary in enumerate(boundaries):
        cur_word.append(id_to_symbol[ids[i_symbol]])
        if boundary:
            words.append(join_char.join(cur_word))
            cur_word = []
    if cur_word:
        # No closing boundary was given: keep the trailing symbols as the
        # last word rather than silently losing them
        words.append(join_char.join(cur_word))
    return " ".join(words).strip()

def sentence_to_boundaries(sentence):
    """
    Convert a space-segmented sentence into a per-symbol boundary vector.

    Every non-space character contributes one entry; the entry is `True` when
    that character is the last one of a word (i.e. it is followed by a space
    or it ends the sentence) and `False` otherwise.

    Parameters
    ----------
    sentence : str
        Symbols with single-space characters marking the word boundaries.

    Returns
    -------
    numpy.ndarray of bool
        One entry per non-space character. An empty sentence, or one made up
        only of spaces, gives an empty array.
    """
    boundaries = []
    for char in sentence:
        if char != " ":
            boundaries.append(False)
        elif boundaries:
            # A space closes the word ending at the previous symbol; spaces
            # before the first symbol have nothing to close
            boundaries[-1] = True
    if boundaries:
        # The final symbol always ends a word
        boundaries[-1] = True
    # Explicit dtype so that an empty result is still a boolean array
    return np.array(boundaries, dtype=bool)

## Data

In [35]:
# Read the reference corpus: one utterance per line, with the surrounding
# whitespace (including the newline) removed
fn = Path("../data")/"br-phono.txt"
print("Reading:", fn)
with open(fn) as f:
    sentences_ref = [line.strip() for line in f]
print("No. sentences:", len(sentences_ref))

# All of the sentences are used for training; keep a separate copy of the list
train_sentences_ref = list(sentences_ref)

print("\nExample training sentence reference:")
print(train_sentences_ref[0])

Reading: ../data/br-phono.txt
No. sentences: 9790

Example training sentence reference:
yu want tu si D6 bUk


## Segmentation 

In [36]:
# Remove the reference word boundaries and put a single space between every
# pair of adjacent symbols
prepared_text = [
    " ".join(sentence.replace(" ", "")).strip()
    for sentence in train_sentences_ref
    ]
print("Example training sentence:")
print(prepared_text[0])

Example training sentence:
y u w a n t t u s i D 6 b U k


In [37]:
# Segment the space-separated symbol sequences with the wordseg transition
# probability algorithm; the segmenter's output is collected into a list so
# it can be indexed and iterated more than once.
# NOTE(review): "ftp" / "relative" are presumably the forward-TP dependency
# measure and the relative threshold of wordseg's `tp` -- confirm in its docs
threshold = "relative"
dependency = "ftp"
segmented_sentences = [
    *tp.segment(prepared_text, threshold=threshold, dependency=dependency)
    ]

## Evaluation

In [38]:
# zip() stops at the shorter input, so a length mismatch would silently score
# only a subset of the sentences; fail loudly instead
assert len(train_sentences_ref) == len(segmented_sentences), (
    "Reference and predicted sentence counts differ: {} vs. {}".format(
        len(train_sentences_ref), len(segmented_sentences)
        )
    )

# Convert every reference/predicted sentence pair to boundary vectors
reference_boundaries = []
predicted_boundaries = []
for ref, pred in tqdm(
        zip(train_sentences_ref, segmented_sentences),
        total=len(train_sentences_ref)  # zip() has no len(); needed for a real progress bar
        ):
    reference_boundaries.append(sentence_to_boundaries(ref))
    predicted_boundaries.append(sentence_to_boundaries(pred))

9790it [00:00, 198927.59it/s]


In [39]:
# Print both sets of scores in one shared layout: a heading, then precision,
# recall, F-score and OS as percentages, then a horizontal rule
rule = "-"*(79 - 4)
print(rule)
for heading, scorer in [
        ("Word boundaries:", eval_segmentation.score_boundaries),
        ("Word token boundaries:", eval_segmentation.score_word_token_boundaries),
        ]:
    p, r, f = scorer(reference_boundaries, predicted_boundaries)
    print(heading)
    print("Precision: {:.4f}%".format(p*100))
    print("Recall: {:.4f}%".format(r*100))
    print("F-score: {:.4f}%".format(f*100))
    print("OS: {:.4f}%".format(eval_segmentation.get_os(p, r)*100))
    print(rule)

---------------------------------------------------------------------------
Word boundaries:
Precision: 58.8965%
Recall: 70.9433%
F-score: 64.3611%
OS: 20.4541%
---------------------------------------------------------------------------
Word token boundaries:
Precision: 43.8684%
Recall: 50.2111%
F-score: 46.8259%
OS: 14.4585%
---------------------------------------------------------------------------
