# Word Segmentation on Brent

Copyright (c) 2021-2022 Herman Kamper, MIT License

Train a duration-penalized dynamic programming autoencoding recurrent neural network (DPDP AE-RNN) and use it to perform word segmentation on the Brent corpus.

## Preliminaries

In [1]:
from datetime import datetime
from pathlib import Path
from scipy.stats import gamma
from sklearn import cluster
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import sys
import torch
import torch.nn as nn

sys.path.append("..")

from dpdp_aernn import datasets, models, viterbi
from utils import eval_segmentation

## Utility functions

In [2]:
def get_segmented_sentence(ids, boundaries, id_to_symbol, join_char=""):
    """
    Render a sequence of symbol IDs as text with spaces at word boundaries.

    `ids` and `boundaries` are read in parallel: a truthy `boundaries[i]`
    closes the current word after `ids[i]`. Symbols inside a word are joined
    with `join_char` and words are separated by a single space. Symbols that
    follow the last marked boundary are kept as a final word rather than
    being dropped.
    """
    words = []
    cur_word = []
    for i_symbol, boundary in enumerate(boundaries):
        cur_word.append(id_to_symbol[ids[i_symbol]])
        if boundary:
            words.append(join_char.join(cur_word))
            cur_word = []
    if cur_word:
        # The last position was not marked as a boundary: keep the remainder
        words.append(join_char.join(cur_word))
    return " ".join(words).strip()

In [3]:
# Duration penalty functions

# Histogram
# Relative weight of each duration; index = duration
histogram = np.array([
    0., 0.051637, 0.36365634, 0.35984765, 0.1537391,
    0.04632681, 0.01662638, 0.00644547, 0.00131839, 0.00040284,
    0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001,
    0.0001, 0.0001
    ])
# Normalise so that the bins sum to one
histogram = histogram/np.sum(histogram)
def neg_log_hist(dur):
    """
    Return the negative log of the histogram probability of duration `dur`.

    Durations that fall outside the histogram, and bins with zero
    probability, cost `np.inf`. These cases are returned directly instead of
    evaluating `np.log(0)`, which would emit a RuntimeWarning.
    """
    if not 0 <= dur < len(histogram):
        # Outside the support (a negative index would otherwise wrap around)
        return np.inf
    prob = histogram[dur]
    if prob <= 0:
        return np.inf
    return -np.log(prob)

def neg_chorowski(dur):
    """
    Linear duration penalty: zero for a duration of one, and one lower for
    every additional unit of duration.
    """
    extra_duration = dur - 1
    return -extra_duration

# Cached Gamma
shape, loc, scale = (7, 0, 0.4)
# Gamma density at the integer durations 0, ..., 49, evaluated once up front
gamma_cache = np.zeros(50)
for dur in range(50):
    gamma_cache[dur] = gamma.pdf(dur, shape, loc, scale)
def neg_log_gamma(dur):
    """
    Return the negative log of the Gamma density at duration `dur`.

    Durations below 50 are read from `gamma_cache`; anything else is
    evaluated directly with `gamma.pdf`.
    """
    if not dur < 50:
        # Not covered by the cache
        return -np.log(gamma.pdf(dur, shape, loc, scale))
    return -np.log(gamma_cache[dur])

## Data

In [4]:
# Load data
fn = Path("../data")/"br-phono.txt"
print("Reading:", fn)
with open(fn) as f:
    # One sentence per line, with surrounding whitespace stripped
    sentences_ref = [line.strip() for line in f]
print("No. sentences:", len(sentences_ref))

# Training and test both use all the sentences; validation uses the first 1000
train_sentences_ref = list(sentences_ref)
val_sentences_ref = sentences_ref[:1000]
# test_sentences_ref = sentences_ref[8000:]
test_sentences_ref = list(sentences_ref)

print("\nExample training sentence reference:")
print(train_sentences_ref[0])

Reading: ../data/br-phono.txt
No. sentences: 9790

Example training sentence reference:
yu want tu si D6 bUk


In [5]:
# Vocabulary
PAD_SYMBOL = "<pad>"
SOS_SYMBOL = "<s>"    # start of sentence
EOS_SYMBOL = "</s>"   # end of sentence

# Every distinct character that occurs in the reference sentences
symbols = {char for sentence in sentences_ref for char in sentence}

# The special symbols get the first IDs, followed by the sorted characters
SYMBOLS = [PAD_SYMBOL, SOS_SYMBOL, EOS_SYMBOL] + sorted(symbols)
symbol_to_id = {symbol: index for index, symbol in enumerate(SYMBOLS)}
id_to_symbol = dict(enumerate(SYMBOLS))

def text_to_id(text, add_sos_eos=False):
    """
    Map every symbol in `text` to its integer ID.

    With `add_sos_eos` set, the IDs are wrapped in the sentence start and
    sentence end symbol IDs.
    """
    symbol_ids = [symbol_to_id[t] for t in text]
    if not add_sos_eos:
        return symbol_ids
    return [symbol_to_id[SOS_SYMBOL], *symbol_ids, symbol_to_id[EOS_SYMBOL]]
# Example: encode the first training sentence and map the IDs back to symbols
example_ids = text_to_id(train_sentences_ref[0])
print(example_ids)
print([id_to_symbol[symbol_id] for symbol_id in example_ids])

[51, 48, 3, 50, 30, 42, 47, 3, 47, 48, 3, 46, 38, 3, 15, 11, 3, 31, 27, 39]
['y', 'u', ' ', 'w', 'a', 'n', 't', ' ', 't', 'u', ' ', 's', 'i', ' ', 'D', '6', ' ', 'b', 'U', 'k']


In [6]:
# Current train and validation
cur_val_sentences = val_sentences_ref[:100]
# The training sentences have their spaces removed; validation keeps them
cur_train_sentences = [
    sentence.replace(" ", "") for sentence in train_sentences_ref
    ]

## Model

In [7]:
# AE-RNN model
n_symbols = len(SYMBOLS)
symbol_embedding_dim = 25
hidden_dim = 200
embedding_dim = 25
teacher_forcing_ratio = 0.5  # 1.0
n_encoder_layers = 3  # 2  # 1  # 10
n_decoder_layers = 1  # 2  # 1
batch_size = 32  # 32
learning_rate = 0.001
input_dropout = 0.0
dropout = 0.0
n_epochs_max = 5

# Seed the random number generators before the model is built, so that any
# random parameter initialisation is repeatable from a fresh kernel
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

encoder = models.Encoder(
    n_symbols=n_symbols,
    symbol_embedding_dim=symbol_embedding_dim,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    n_layers=n_encoder_layers,
    dropout=dropout,
    input_dropout=input_dropout,
    )
decoder = models.Decoder1(
    n_symbols=n_symbols,
    symbol_embedding_dim=symbol_embedding_dim,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    n_layers=n_decoder_layers,
    sos_id=symbol_to_id[SOS_SYMBOL],
    teacher_forcing_ratio=teacher_forcing_ratio,
    dropout=dropout
    )
# Alternative decoder:
# decoder = models.Decoder2(
#     n_symbols=n_symbols,
#     hidden_dim=hidden_dim,
#     embedding_dim=embedding_dim,
#     n_layers=n_decoder_layers,
#     )
model = models.EncoderDecoder(encoder, decoder)

## Pre-training

In [8]:
# Pre-train the AE-RNN: the model receives each batch as both input and
# target, and the loss is the negative log likelihood of reproducing it.

# Training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Random seed
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Training data
train_dataset = datasets.WordDataset(cur_train_sentences, text_to_id)
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True,
    collate_fn=datasets.pad_collate
    )

# Validation data
# NOTE(review): shuffle=True means the batches from this loader come in a
# different order on every pass, including when later cells print examples
# from it -- confirm that this is intended
val_dataset = datasets.WordDataset(cur_val_sentences, text_to_id)
val_loader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=True,
    collate_fn=datasets.pad_collate
    )

# Loss
# Summed (not averaged) over targets; targets equal to the padding ID are
# ignored
criterion = nn.NLLLoss(
    reduction="sum", ignore_index=symbol_to_id[PAD_SYMBOL]
    )
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)

for i_epoch in range(n_epochs_max):

    # Training
    model.train()
    train_losses = []
    for i_batch, (data, data_lengths) in enumerate(tqdm(train_loader)):
        optimiser.zero_grad()
        data = data.to(device)
        # The same batch is passed as the encoder input and the decoder target
        encoder_embedding, decoder_output = model(
            data, data_lengths, data, data_lengths
            )

        # Flatten to (positions, classes) and (positions,) for the criterion
        loss = criterion(
            decoder_output.contiguous().view(-1, decoder_output.size(-1)),
            data.contiguous().view(-1)
            )
        # Divide the summed loss by the number of entries in `data_lengths`
        # (presumably one per sequence in the batch -- confirm)
        loss /= len(data_lengths)
        loss.backward()
        optimiser.step()
        train_losses.append(loss.item())

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for i_batch, (data, data_lengths) in enumerate(val_loader):
            data = data.to(device)
            encoder_embedding, decoder_output = model(
                data, data_lengths, data, data_lengths
                )

            loss = criterion(
                decoder_output.contiguous().view(-1,
                decoder_output.size(-1)), data.contiguous().view(-1)
                )
            loss /= len(data_lengths)
            val_losses.append(loss.item())

    # Report the mean of the per-batch losses for this epoch
    print(
        "Epoch {}, train loss: {:.3f}, val loss: {:.3f}".format(
        i_epoch,
        np.mean(train_losses),
        np.mean(val_losses))
        )
    sys.stdout.flush()

100%|███████████████████████████████████████████████████████████████████████████████| 306/306 [00:05<00:00, 57.26it/s]

Epoch 0, train loss: 28.153, val loss: 3.164



100%|███████████████████████████████████████████████████████████████████████████████| 306/306 [00:05<00:00, 57.12it/s]

Epoch 1, train loss: 21.699, val loss: 1.839



100%|███████████████████████████████████████████████████████████████████████████████| 306/306 [00:05<00:00, 57.28it/s]


Epoch 2, train loss: 20.097, val loss: 1.466


100%|███████████████████████████████████████████████████████████████████████████████| 306/306 [00:05<00:00, 56.98it/s]

Epoch 3, train loss: 18.818, val loss: 1.195



100%|███████████████████████████████████████████████████████████████████████████████| 306/306 [00:05<00:00, 57.41it/s]

Epoch 4, train loss: 17.908, val loss: 1.000





In [9]:
# Examples without segmentation

def symbols_before_stop(symbol_ids):
    """
    Convert IDs to symbols, stopping at the first EOS or padding ID.
    """
    eos_id = symbol_to_id[EOS_SYMBOL]
    pad_id = symbol_to_id[PAD_SYMBOL]
    collected = []
    for symbol_id in symbol_ids:
        if symbol_id == eos_id or symbol_id == pad_id:
            break
        collected.append(id_to_symbol[symbol_id])
    return collected

# Apply to validation data
print("Examples:")
model.eval()
with torch.no_grad():
    for i_batch, (data, data_lengths) in enumerate(val_loader):
        data = data.to(device)
        encoder_embedding, decoder_output = model(
            data, data_lengths, data, data_lengths
            )

        # Greedily decode from the encoder embedding
        y, log_probs = model.decoder.greedy_decode(
            encoder_embedding,
            max_length=20,
            )
        x = data.cpu().numpy()

        # Show at most the first 11 items of the batch
        for i_input in range(min(y.shape[0], 11)):
            input_text = "".join(symbols_before_stop(x[i_input]))
            output_text = "".join(symbols_before_stop(y[i_input]))

            print("Input: ", input_text)
            print("Output:", output_text)
            print()

        # Only the first batch is used
        break

Examples:
Input:  6
Output: 6brASizInD6d%D6dOgiz

Input:  It
Output: Its6dr&g~d&dizgoINtu

Input:  h*
Output: hQmEloztusizIts6d%It

Input:  D&t
Output: D&ts6dOgizn9studidiz

Input:  yu
Output: yuk&nduItD6d%D6dOgiz

Input:  It
Output: Its6dr&g~d&dizgoINtu

Input:  tek
Output: tekItizInD6d%zizIts6

Input:  hIm
Output: hIz6nADRbUkItItItIts

Input:  D6
Output: D6b7z6blaksIts6d%Its

Input:  pUl
Output: pUlItmQtItItItItIts6

Input:  yu
Output: yuk&nduItD6d%D6dOgiz



## Segmentation

In [10]:
# Embed segments: score every candidate segment interval with the AE-RNN

# Random seed
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Data
sentences = val_sentences_ref
# sentences = test_sentences_ref
# sentences = train_sentences_ref
interval_dataset = datasets.SentenceIntervalDataset(sentences, text_to_id)
# shuffle=False: the segmentation cell walks through `rnn_losses` and
# `lengths` in the order of `interval_dataset.intervals`
segment_loader = DataLoader(
    interval_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=datasets.pad_collate,
    drop_last=False
    )

# Apply model to data
# NOTE(review): this overrides the decoder's teacher forcing ratio and it is
# not restored afterwards -- confirm that this is intended if the pre-training
# cell is re-run later in the same session
model.decoder.teacher_forcing_ratio = 1.0
model.eval()
rnn_losses = []  # one summed reconstruction loss per item, as Python floats
lengths = []     # the matching entry of `data_lengths` per item, as ints
with torch.no_grad():
    for i_batch, (data, data_lengths) in enumerate(tqdm(segment_loader)):
        data = data.to(device)
        encoder_embedding, decoder_output = model(
            data, data_lengths, data, data_lengths
            )

        for i_item in range(data.shape[0]):
            item_loss = criterion(
                decoder_output[i_item].contiguous().view(-1,
                decoder_output[i_item].size(-1)),
                data[i_item].contiguous().view(-1)
                )
            # Store plain numbers rather than 0-dim tensors (which may live on
            # the GPU), so that the NumPy arithmetic downstream does not rely
            # on implicit tensor-to-NumPy conversion
            rnn_losses.append(item_loss.item())
            lengths.append(int(data_lengths[i_item]))

100%|████████████████████████████████████████████████████████████████████████████| 1903/1903 [00:07<00:00, 258.51it/s]


In [11]:
# Segment: combine the per-interval AE-RNN losses with a duration penalty and
# find the lowest-cost segmentation of every sentence with Viterbi

dur_weight = 1.5

# Terms that are the same for every segment are computed once, up front.
# Log of the summed, weighted Gamma cache
gamma_log_norm = np.log(np.sum(gamma_cache**dur_weight))
# Sequence boundary
alpha = 0.1 # 0.0001  # 0.9
K = 1
eos_cost = -np.log(alpha)              # segment ends at the end of the sentence
non_eos_cost = -np.log((1 - alpha)/K)  # every other segment

i_item = 0  # running index into `rnn_losses` and `lengths`
predicted_boundaries = []
reference_boundaries = []
losses = []
cur_segmented_sentences = []
print("Segmenting:")
for i_sentence, intervals in enumerate(tqdm(interval_dataset.intervals)):

    # Costs for segment intervals (entries for skipped intervals stay at inf)
    costs = np.full(len(intervals), np.inf)
    i_eos = intervals[-1][-1]
    for i_seg, interval in enumerate(intervals):
        if interval is None:
            continue
        i_start, i_end = interval
        dur = i_end - i_start
        # The stored lengths must line up with the intervals visited here
        assert dur == lengths[i_item]
        eos = (i_end == i_eos)  # end-of-sequence

        # # Chorowski
        # costs[i_seg] = (
        #     rnn_losses[i_item]
        #     + dur_weight*neg_chorowski(dur)
        #     )

        # Gamma
        costs[i_seg] = (
            rnn_losses[i_item]
            + dur_weight*neg_log_gamma(dur)
            + gamma_log_norm
            )

        # # Histogram
        # costs[i_seg] = (
        #     rnn_losses[i_item]
        #     + dur_weight*(neg_log_hist(dur))
        #     + np.log(np.sum(histogram**dur_weight))
        #     )

        # Sequence boundary
        costs[i_seg] += eos_cost if eos else non_eos_cost

        i_item += 1

    # Viterbi segmentation
    n_frames = len(interval_dataset.sentences[i_sentence])
    summed_cost, boundaries = viterbi.custom_viterbi(costs, n_frames)
    losses.append(summed_cost)

    reference_sentence = sentences[i_sentence]
    segmented_sentence = get_segmented_sentence(
            interval_dataset.sentences[i_sentence],
            boundaries, id_to_symbol
            )
    cur_segmented_sentences.append(segmented_sentence)
    # Print examples of the first few sentences
    if i_sentence < 10:
        print(reference_sentence)
        print(segmented_sentence)
        # print()

    predicted_boundaries.append(boundaries)
    reference_boundaries.append(
        datasets.sentence_to_boundaries(reference_sentence)
        )

print("NLL: {:.4f}\n".format(np.sum(losses)))

Segmenting:


  3%|██▋                                                                           | 34/1000 [00:00<00:02, 337.61it/s]

yu want tu si D6 bUk
yu want tu si D6 bUk
lUk D*z 6 b7 wIT hIz h&t
lUk D*z 6b7 wIT hIz h&t
&nd 6 dOgi
&nd6 dOgi
yu want tu lUk &t DIs
yu want tu lUk&t DIs
lUk &t DIs
lUk&t DIs
h&v 6 drINk
h&v 6d rIN k
oke nQ
oke nQ
WAts DIs
WAts DIs
WAts D&t
WAts D&t
WAt Iz It
WAt Iz It


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 272.70it/s]

NLL: 10198.7362






## Evaluation

In [12]:
def print_scores(title, p, r, f):
    """
    Print `title`, then precision, recall, F-score and OS as percentages,
    followed by a horizontal rule.
    """
    print(title)
    print("Precision: {:.4f}%".format(p*100))
    print("Recall: {:.4f}%".format(r*100))
    print("F-score: {:.4f}%".format(f*100))
    print("OS: {:.4f}%".format(eval_segmentation.get_os(p, r)*100))
    print("-"*(79 - 4))

# Boundary scores
p, r, f = eval_segmentation.score_boundaries(
    reference_boundaries, predicted_boundaries
    )
print("-"*(79 - 4))
print_scores("Word boundaries:", p, r, f)

# Word token scores
p, r, f = eval_segmentation.score_word_token_boundaries(
    reference_boundaries, predicted_boundaries
    )
print_scores("Word token boundaries:", p, r, f)

---------------------------------------------------------------------------
Word boundaries:
Precision: 81.1819%
Recall: 86.7832%
F-score: 83.8891%
OS: 6.8996%
---------------------------------------------------------------------------
Word token boundaries:
Precision: 71.7956%
Recall: 75.2166%
F-score: 73.4663%
OS: 4.7649%
---------------------------------------------------------------------------


## Quantization

In [13]:
# Sentences used to fit the K-means model below: the segmentation predicted
# above (the commented-out alternative uses the current training sentences)
# prequant_segmented_sentences = cur_train_sentences
prequant_segmented_sentences = cur_segmented_sentences

In [14]:
# Find the K-means centroids: encode the selected sentences with the AE-RNN
# and cluster the resulting embeddings

# Data
# NOTE: `sentences`, `train_dataset` and `train_loader` are reassigned here,
# replacing the values set in earlier cells
sentences = prequant_segmented_sentences
train_dataset = datasets.WordDataset(sentences, text_to_id)
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True,
    collate_fn=datasets.pad_collate
    )

# Apply model to data
model.eval()
encoder_embeddings = []
with torch.no_grad():
    for i_batch, (data, data_lengths) in enumerate(tqdm(train_loader)):
        data = data.to(device)
        encoder_embedding, decoder_output = model(
            data, data_lengths, data, data_lengths
            )
        # Only the encoder embedding is kept; the decoder output is unused
        encoder_embeddings.append(encoder_embedding.cpu().numpy())

        
# Cluster
# Stack the per-batch embeddings into one matrix with one row per item
X = np.vstack(encoder_embeddings)
print("X shape:", X.shape)
print(datetime.now())
print("Clustering")
K = 128  # 1024  # 2048
# NOTE(review): no `random_state` is passed, so the result presumably depends
# on the global NumPy random state at this point -- confirm
vq_model = cluster.KMeans(n_clusters=K, max_iter=10)
vq_model.fit(X)
print("Inertia: {:.4f}".format(vq_model.inertia_))
centroids = vq_model.cluster_centers_
print(datetime.now())

100%|██████████████████████████████████████████████████████████████████████████████| 106/106 [00:00<00:00, 540.40it/s]


X shape: (3386, 25)
2022-02-24 10:48:22.605330
Clustering
Inertia: 23002.5840
2022-02-24 10:48:23.197803


In [15]:
# Examples with quantized embeddings: each encoder embedding is replaced by
# the centroid of its K-means cluster before decoding

# Apply to validation data
model.eval()
with torch.no_grad():
    for i_batch, (data, data_lengths) in enumerate(val_loader):
#     for i_batch, (data, data_lengths) in enumerate(train_loader):
        data = data.to(device)
        encoder_embedding, decoder_output = model(
            data, data_lengths, data, data_lengths
            )

        # Quantize: look up the centroid of the cluster each embedding is
        # assigned to
        encoder_embedding = encoder_embedding.cpu().numpy()
        clusters = vq_model.predict(encoder_embedding)
        embedding_reconstructed = torch.from_numpy(
            centroids[clusters, :].reshape(encoder_embedding.shape)
            ).to(device)

        # Greedily decode from the quantized embedding
        y, log_probs = model.decoder.greedy_decode(
            embedding_reconstructed,
            max_length=20,
            )
        x = data.cpu().numpy()

        eos_id = symbol_to_id[EOS_SYMBOL]
        pad_id = symbol_to_id[PAD_SYMBOL]

        # Show at most the first 11 items of the batch
        for i_input in range(min(y.shape[0], 11)):
            # Input first, then output; each only up to the EOS/pad symbol
            texts = []
            for id_row in (x[i_input], y[i_input]):
                row_symbols = []
                for symbol_id in id_row:
                    if symbol_id == eos_id or symbol_id == pad_id:
                        break
                    row_symbols.append(id_to_symbol[symbol_id])
                texts.append("".join(row_symbols))

            print("Input: ", texts[0])
            print("Output:", texts[1])
            print()

        # Only the first batch is used
        break

Input:  h9
Output: h9do&tizl9ktusekItiz

Input:  lUk
Output: lUksekItizIts6mQRzIn

Input:  f%
Output: f%milizIts6d%Its6vD6

Input:  h9
Output: h9do&tizl9ktusekItiz

Input:  D&t
Output: D&ts6dOgizn9studidiz

Input:  h9
Output: h9do&tizl9ktusekItiz

Input:  D&ts
Output: D&tsD6dOgizn9studidi

Input:  kAm
Output: kAmh(mimituIttupUtIt

Input:  bAni
Output: bAtRzl9kD6dOgiznD6d%

Input:  It
Output: Its6dr&g~d&dizgoINtu

Input:  huz
Output: huzInD6d%D6d%zizIts6



In [16]:
# Embed segments: score every candidate segment interval with the AE-RNN
# decoder, conditioned on the quantized (K-means centroid) embedding

# Random seed
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Data
sentences = train_sentences_ref[:1000]  # to-do: all the sentences
# sentences = train_sentences_ref
interval_dataset = datasets.SentenceIntervalDataset(sentences, text_to_id)
# shuffle=False: the segmentation cell walks through `rnn_losses` and
# `lengths` in the order of `interval_dataset.intervals`
segment_loader = DataLoader(
    interval_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=datasets.pad_collate,
    drop_last=False
    )

# Apply model to data
model.decoder.teacher_forcing_ratio = 1.0  # to-do: adjust this
model.eval()
rnn_losses = []  # one summed reconstruction loss per item, as Python floats
lengths = []     # the matching entry of `data_lengths` per item, as ints
with torch.no_grad():
    for i_batch, (data, data_lengths) in enumerate(tqdm(segment_loader)):
        data = data.to(device)

        # Only the encoder embedding of this pass is used; the decoder output
        # is recomputed below from the quantized embedding
        encoder_embedding, decoder_output = model(
            data, data_lengths, data, data_lengths
            )

        # Quantize: replace each embedding with its cluster centroid
        encoder_embedding = encoder_embedding.cpu().numpy()
        clusters = vq_model.predict(encoder_embedding)
        embedding_reconstructed = centroids[clusters, :].reshape(
            encoder_embedding.shape
            )
        embedding_reconstructed = torch.from_numpy(
            embedding_reconstructed
            ).to(device)

        decoder_rnn, decoder_output = model.decoder(
            embedding_reconstructed, data, data_lengths
            )

        for i_item in range(data.shape[0]):
            item_loss = criterion(
                decoder_output[i_item].contiguous().view(-1,
                decoder_output[i_item].size(-1)),
                data[i_item].contiguous().view(-1)
                )
            # Store plain numbers rather than 0-dim tensors (which may live on
            # the GPU), so that the NumPy arithmetic downstream does not rely
            # on implicit tensor-to-NumPy conversion
            rnn_losses.append(item_loss.item())
            lengths.append(int(data_lengths[i_item]))

100%|████████████████████████████████████████████████████████████████████████████| 1903/1903 [00:12<00:00, 146.92it/s]


Options:
- To evaluate this segmentation: go back up to the cell where segmentation is done (the one after the cell where segments are embedded) and continue from there.
- To retrain the K-means model based on this segmentation: go back to the start of the quantization section.
- To retrain the AE-RNN: run the cell below, then go back to the AE-RNN pre-training cell.

### Re-initialize data and repeat

Repeat from AE-RNN pretraining cell.

In [17]:
# Create pseudo-sentences and go back to the top cell in this section:
# the predicted segmentation becomes the new set of training sentences
cur_train_sentences = cur_segmented_sentences