In [1]:
import pickle
# Special vocabulary tokens; the functions in later cells read these as globals.
UNK_TOKEN = "<UNK>"  # fallback key for characters/labels missing from a mapping
PAD_TOKEN = "<PAD>"  # key whose id is used to right-pad sequences
# Presumably begin/end-of-sequence markers (going by the names). Both are ordinary
# single Hangul characters, so they share the key space with corpus characters.
BOS_TOKEN = "시"
EOS_TOKEN = "끝"
SPLIT_TOKEN = "▁"  # defined here but not otherwise used by the visible cells
def create_digest_cedict(mono_file, poly_file, output_file):
    """Build a ``{character: [pronunciation, ...]}`` dictionary and pickle it.

    Both inputs are UTF-8 text files with one tab-separated entry per line:
    ``mono_file`` holds ``<char> TAB <pron>`` and ``poly_file`` holds
    ``<char> TAB <pron1>,<pron2>,...``. Blank lines are ignored. A character
    that appears in both files keeps the entry from ``poly_file`` because that
    file is processed last.

    Args:
        mono_file: Path of the single-pronunciation file.
        poly_file: Path of the multi-pronunciation file.
        output_file: Path the pickled dictionary is written to.

    Raises:
        ValueError: If a non-blank line does not contain exactly one tab.
    """
    cedict = {}

    # Process monophonic characters
    with open(mono_file, "r", encoding="utf-8") as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # A trailing or stray blank line would otherwise break the unpack below.
                continue
            char, pron = entry.split("\t")
            cedict[char] = [pron]

    # Process polyphonic characters (overrides any monophonic entry)
    with open(poly_file, "r", encoding="utf-8") as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue
            char, prons = entry.split("\t")
            cedict[char] = prons.split(",")

    # Save to pickle file
    with open(output_file, "wb") as f:
        pickle.dump(cedict, f)

# Build digest_cedict.pkl from the monophonic and polyphonic character lists.
create_digest_cedict(
    mono_file="MONOPHONIC_CHARS.txt",
    poly_file="POLYPHONIC_CHARS.txt",
    output_file="digest_cedict.pkl",
)

In [2]:
import pickle


def create_char2idx(sent_files, output_file):
    """Build a character -> integer id mapping from sentence files and pickle it.

    Every character of every (stripped) line receives the next free id in
    first-seen order. The special tokens ``UNK_TOKEN``, ``PAD_TOKEN``,
    ``BOS_TOKEN`` and ``EOS_TOKEN`` are appended afterwards.

    ``BOS_TOKEN`` and ``EOS_TOKEN`` are single ordinary characters, so they may
    already be present from the corpus. A token that is already mapped keeps
    its existing id instead of being re-assigned; re-assigning would leave a
    hole in the id range, and ``len(char2idx)`` (used to size the embedding
    table) would then be smaller than the largest id.

    NOTE(review): ``PAD_TOKEN`` does not receive id 0 here; confirm that the
    code consuming this mapping does not assume pad id 0.

    Args:
        sent_files: Iterable of UTF-8 text file paths to scan.
        output_file: Path the pickled mapping is written to.
    """
    char2idx = {}

    for sent_file in sent_files:
        with open(sent_file, "r", encoding="utf-8") as f:
            for line in f:
                for char in line.strip():
                    if char not in char2idx:
                        # len() is always the next free id, keeping ids contiguous.
                        char2idx[char] = len(char2idx)

    # Add special tokens without clobbering ids already handed out above.
    for token in (UNK_TOKEN, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN):
        if token not in char2idx:
            char2idx[token] = len(char2idx)

    # Save to pickle file
    with open(output_file, "wb") as f:
        pickle.dump(char2idx, f)


# Build char2idx.pkl from the train/dev/test sentence files.
create_char2idx(
    sent_files=["train.sent", "dev.sent", "test.sent"],
    output_file="char2idx.pkl",
)

In [3]:
def create_class2idx(lb_files, output_file):
    """Map every whitespace-separated label found in ``lb_files`` to an id and pickle it.

    Labels are numbered from 0 in first-seen order; ``UNK_TOKEN`` and
    ``PAD_TOKEN`` are then stored with the next two ids.

    Args:
        lb_files: Iterable of UTF-8 label file paths.
        output_file: Path the pickled mapping is written to.
    """
    class2idx = {}

    for lb_path in lb_files:
        with open(lb_path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                for label in raw_line.strip().split():
                    # The default is evaluated before insertion, so it is the next free id.
                    class2idx.setdefault(label, len(class2idx))

    # Special tokens take the two ids following the last label id.
    next_id = len(class2idx)
    class2idx.update({UNK_TOKEN: next_id, PAD_TOKEN: next_id + 1})

    with open(output_file, "wb") as sink:
        pickle.dump(class2idx, sink)


# Build class2idx.pkl from the train/dev/test label files.
create_class2idx(
    lb_files=["train.lb", "dev.lb", "test.lb"],
    output_file="class2idx.pkl",
)

In [4]:
import numpy as np
import pickle


def initialize_np_ckpt(
    char2idx,
    class2idx,
    embedding_dim=64,
    lstm_hidden_dim=32,
    output_file="np_ckpt.pkl",
    seed=None,
):
    """Create a randomly initialised NumPy state dict and pickle it.

    The dictionary contains an embedding table, one layer of forward and
    reverse LSTM parameters, and a two-layer classifier (``logit_layer.0`` and
    ``logit_layer.2``). Weights are standard-normal samples and biases are
    zeros; every array is ``float32``.

    Args:
        char2idx: Character vocabulary; only its length is used (embedding rows).
        class2idx: Label vocabulary; only its length is used (output classes).
        embedding_dim: Width of each embedding vector.
        lstm_hidden_dim: Hidden size of each LSTM direction.
        output_file: Path the pickled state dict is written to.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so the checkpoint is
            reproducible; when ``None`` the global NumPy generator is used,
            which matches the previous behaviour.

    Returns:
        None. The state dict is only written to ``output_file``.
    """
    # Both the np.random module and RandomState expose randn(*shape).
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _randn(*shape):
        return rng.randn(*shape).astype(np.float32)

    def _zeros(size):
        return np.zeros(size, dtype=np.float32)

    gate_rows = 4 * lstm_hidden_dim  # the LSTM arrays stack four blocks of hidden size
    num_classes = len(class2idx)

    state_dict = {"embedding.weight": _randn(len(char2idx), embedding_dim)}

    # Forward direction first, then the "_reverse" direction, same layout for both.
    for suffix in ("", "_reverse"):
        state_dict[f"lstm.weight_ih_l0{suffix}"] = _randn(gate_rows, embedding_dim)
        state_dict[f"lstm.weight_hh_l0{suffix}"] = _randn(gate_rows, lstm_hidden_dim)
        state_dict[f"lstm.bias_ih_l0{suffix}"] = _zeros(gate_rows)
        state_dict[f"lstm.bias_hh_l0{suffix}"] = _zeros(gate_rows)

    # Classifier: input is the concatenation of both directions (2 * hidden).
    state_dict["logit_layer.0.weight"] = _randn(lstm_hidden_dim, 2 * lstm_hidden_dim)
    state_dict["logit_layer.0.bias"] = _zeros(lstm_hidden_dim)
    state_dict["logit_layer.2.weight"] = _randn(num_classes, lstm_hidden_dim)
    state_dict["logit_layer.2.bias"] = _zeros(num_classes)

    # Save to pickle file
    with open(output_file, "wb") as f:
        pickle.dump(state_dict, f)


# Load char2idx and class2idx (context managers close the files right away
# instead of leaving the handles to the garbage collector)
with open("char2idx.pkl", "rb") as f:
    char2idx = pickle.load(f)
with open("class2idx.pkl", "rb") as f:
    class2idx = pickle.load(f)

# Initialize np_ckpt.pkl
initialize_np_ckpt(char2idx, class2idx)

In [5]:
# Reload every artifact written by the earlier cells; each file is closed as
# soon as it has been read.
with open("digest_cedict.pkl", "rb") as f:
    digest_cedict = pickle.load(f)
with open("char2idx.pkl", "rb") as f:
    char2idx = pickle.load(f)
with open("class2idx.pkl", "rb") as f:
    class2idx = pickle.load(f)

# Print statistics
print("Length of digest_cedict:", len(digest_cedict))
print("Length of char2idx:", len(char2idx))
print("Length of class2idx:", len(class2idx))
with open("np_ckpt.pkl", "rb") as f:
    state_dict = pickle.load(f)

# Report the shape of each checkpoint array, in a fixed order.
for key in (
    "embedding.weight",
    "lstm.weight_ih_l0",
    "lstm.weight_hh_l0",
    "lstm.bias_ih_l0",
    "lstm.bias_hh_l0",
    "lstm.weight_ih_l0_reverse",
    "lstm.weight_hh_l0_reverse",
    "lstm.bias_ih_l0_reverse",
    "lstm.bias_hh_l0_reverse",
    "logit_layer.0.weight",
    "logit_layer.0.bias",
    "logit_layer.2.weight",
    "logit_layer.2.bias",
):
    print(f"Dimensions of {key}:", state_dict[key].shape)

Length of digest_cedict: 27006
Length of char2idx: 6358
Length of class2idx: 1593
Dimensions of embedding.weight: (6358, 64)
Dimensions of lstm.weight_ih_l0: (128, 64)
Dimensions of lstm.weight_hh_l0: (128, 32)
Dimensions of lstm.bias_ih_l0: (128,)
Dimensions of lstm.bias_hh_l0: (128,)
Dimensions of lstm.weight_ih_l0_reverse: (128, 64)
Dimensions of lstm.weight_hh_l0_reverse: (128, 32)
Dimensions of lstm.bias_ih_l0_reverse: (128,)
Dimensions of lstm.bias_hh_l0_reverse: (128,)
Dimensions of logit_layer.0.weight: (32, 64)
Dimensions of logit_layer.0.bias: (32,)
Dimensions of logit_layer.2.weight: (1593, 32)
Dimensions of logit_layer.2.bias: (1593,)


In [None]:
import numpy as np
import pickle
import os
from tqdm import tqdm
from g2pM2 import G2pM


# Load the dataset
def load_data(sent_file, lb_file):
    """Read parallel sentence and label files.

    Args:
        sent_file: UTF-8 text file; each line becomes one stripped sentence string.
        lb_file: UTF-8 text file; each line becomes a list of whitespace-separated labels.

    Returns:
        A ``(sentences, labels)`` tuple of two lists, one item per input line.
    """
    with open(sent_file, "r", encoding="utf-8") as sent_handle:
        sentences = list(map(str.strip, sent_handle))

    labels = []
    with open(lb_file, "r", encoding="utf-8") as lb_handle:
        for raw_line in lb_handle:
            labels.append(raw_line.strip().split())

    return sentences, labels


# Convert characters and labels to indices and pad sequences
def prepare_data(sentences, labels, char2idx, class2idx):
    """Turn sentences and label sequences into padded id arrays.

    Characters and labels missing from their mapping fall back to the
    ``UNK_TOKEN`` id. Rows are right-padded with the ``PAD_TOKEN`` id.

    Inputs are padded to the longest sentence. Targets are padded to the
    longest sentence or the longest label sequence, whichever is larger, so
    every row has the same width even when a label sequence is longer than all
    sentences (previously that produced ragged rows). When no label sequence
    exceeds the longest sentence the result is unchanged.

    Args:
        sentences: List of sentence strings.
        labels: List of label lists, parallel to ``sentences``.
        char2idx: Character -> id mapping containing ``UNK_TOKEN`` and ``PAD_TOKEN``.
        class2idx: Label -> id mapping containing ``UNK_TOKEN`` and ``PAD_TOKEN``.

    Returns:
        ``(input_ids, target_ids, target_indices)``: two NumPy arrays and a
        list holding, per example, the positions within its label list whose
        label is a key of ``class2idx``. Empty input yields empty arrays and
        an empty list.
    """
    unk_char, pad_char = char2idx[UNK_TOKEN], char2idx[PAD_TOKEN]
    unk_class, pad_class = class2idx[UNK_TOKEN], class2idx[PAD_TOKEN]

    input_ids = []
    target_ids = []
    target_indices = []
    for sent, label in zip(sentences, labels):
        input_ids.append([char2idx.get(char, unk_char) for char in sent])
        target_ids.append([class2idx.get(pron, unk_class) for pron in label])
        # Only positions with a known label are scored later on.
        target_indices.append([i for i, pron in enumerate(label) if pron in class2idx])

    # Pad sequences; default=0 keeps an empty dataset from crashing max().
    input_width = max((len(seq) for seq in input_ids), default=0)
    target_width = max([input_width] + [len(seq) for seq in target_ids])
    input_ids = [seq + [pad_char] * (input_width - len(seq)) for seq in input_ids]
    target_ids = [seq + [pad_class] * (target_width - len(seq)) for seq in target_ids]

    return np.array(input_ids), np.array(target_ids), target_indices


# Generate batches of data
def get_batches(data, batch_size):
    """Yield consecutive mini-batches from a ``(inputs, targets, target_indices)`` triple.

    Each yielded item is ``(inputs_batch, targets_batch, indices_batch)`` where
    the first two are wrapped in ``np.array`` and the third is a plain slice of
    ``target_indices``. The final batch may hold fewer than ``batch_size`` rows.
    """
    inputs, targets, target_indices = data
    for lo in range(0, len(inputs), batch_size):
        window = slice(lo, lo + batch_size)
        yield (
            np.array(inputs[window]),
            np.array(targets[window]),
            target_indices[window],
        )


def compute_loss(model, inputs, targets, target_indices, pad_id=0):
    """Return the mean negative log-likelihood over the selected target positions.

    The batch is run through the model's forward and backward LSTM cells one
    time step at a time, the two hidden-state sequences are concatenated, and
    the hidden vectors at the requested positions are classified with
    ``model.fc_layer``. The loss is ``-log(softmax(logits)[target] + 1e-9)``
    averaged over all selected positions.

    Args:
        model: Object providing ``reverse_sequence(seq, lengths)``,
            ``get_embedding(ids)``, ``fw_lstm_cell(x, states)``,
            ``bw_lstm_cell(x, states)`` (both returning a sequence whose first
            item is the hidden state) and ``fc_layer(hidden)``.
        inputs: Integer id array, one padded row per example.
        targets: Integer class-id array, one padded row per example.
        target_indices: Per-example list of positions to score.
        pad_id: Id used for padding in ``inputs``. Sequence lengths are the
            number of entries different from it. The default ``0`` reproduces
            the previous ``np.sign``-based count for non-negative ids.
            NOTE(review): ``create_char2idx`` in this notebook gives
            ``PAD_TOKEN`` a non-zero id and gives id 0 to a real character, so
            callers should pass ``char2idx[PAD_TOKEN]`` here.

    Returns:
        The mean loss as a float, or ``0.0`` when no position is selected
        (previously this case crashed or produced NaN).
    """
    inputs = np.asarray(inputs)
    targets = np.asarray(targets)
    batch_size = len(inputs)

    # (row, position) of every scored target, in row-major order.
    pairs = [(row, pos) for row in range(batch_size) for pos in target_indices[row]]
    if not pairs:
        # Nothing to score; skip the forward pass and avoid dividing by zero.
        return 0.0
    rows = np.array([row for row, _ in pairs])
    cols = np.array([pos for _, pos in pairs])
    total_targets = len(pairs)

    lengths = np.sum(inputs != pad_id, axis=1)
    max_length = max(lengths)

    # Recompute the hidden states up to the point where logits are obtained
    rev_seq = model.reverse_sequence(inputs, lengths)
    fw_emb = model.get_embedding(inputs)  # [b, t, d]
    bw_emb = model.get_embedding(rev_seq)

    fw_states, bw_states = None, None
    fw_hs = []
    bw_hs = []
    for step in range(max_length):
        fw_states = model.fw_lstm_cell(fw_emb[:, step, :], fw_states)
        bw_states = model.bw_lstm_cell(bw_emb[:, step, :], bw_states)
        fw_hs.append(fw_states[0])
        bw_hs.append(bw_states[0])
    fw_hiddens = np.stack(fw_hs, axis=1)
    # Undo the reversal so backward states line up with forward positions.
    bw_hiddens = model.reverse_sequence(np.stack(bw_hs, axis=1), lengths)

    outputs = np.concatenate([fw_hiddens, bw_hiddens], axis=2)  # [b, t, d]

    # One fancy-indexing gather handles any batch size, including 1.
    target_hidden = outputs[rows, cols]  # [total_targets, d]

    # Compute logits using the fc_layer
    logits = model.fc_layer(target_hidden)  # [total_targets, num_classes]

    # Softmax, shifted by the row maximum for numerical stability
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    softmax_probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    # Gather the probabilities for the target classes
    target_classes = targets[rows, cols]  # [total_targets]
    target_probs = softmax_probs[np.arange(total_targets), target_classes]

    # Negative log likelihood; the small constant guards against log(0)
    loss = -np.log(target_probs + 1e-9)
    return np.sum(loss) / total_targets


# Update the model weights using Adam optimizer
def update_weights(model, grads, learning_rate, beta1, beta2, epsilon, t, m, v):
    """Apply one Adam step, in place, to the attributes of ``model`` named in ``grads``.

    Args:
        model: Object whose ``__dict__`` holds the parameter arrays.
        grads: Mapping of attribute name -> gradient array.
        learning_rate, beta1, beta2, epsilon: Adam hyper-parameters.
        t: 1-based step counter used for bias correction.
        m, v: First- and second-moment accumulators keyed like ``grads``;
            both are updated in place.
    """
    params = model.__dict__
    for name in grads:
        g = grads[name]

        # Exponential moving averages of the gradient and its square.
        m[name] = beta1 * m[name] + (1 - beta1) * g
        v[name] = beta2 * v[name] + (1 - beta2) * (g**2)

        # Bias-corrected estimates.
        first = m[name] / (1 - beta1**t)
        second = v[name] / (1 - beta2**t)

        denom = np.sqrt(second) + epsilon
        params[name] -= learning_rate * first / denom


# Save the trained model
def save_model(model, output_file):
    """Pickle the model's parameter attributes under their checkpoint key names.

    Args:
        model: Object exposing the attributes listed in the table below.
        output_file: Path the pickled state dict is written to.
    """
    # (checkpoint key, model attribute) pairs, in the order they are stored.
    key_to_attr = (
        ("embedding.weight", "embeddings"),
        ("lstm.weight_ih_l0", "weight_ih"),
        ("lstm.weight_hh_l0", "weight_hh"),
        ("lstm.bias_ih_l0", "bias_ih"),
        ("lstm.bias_hh_l0", "bias_hh"),
        ("lstm.weight_ih_l0_reverse", "weight_ih_reverse"),
        ("lstm.weight_hh_l0_reverse", "weight_hh_reverse"),
        ("lstm.bias_ih_l0_reverse", "bias_ih_reverse"),
        ("lstm.bias_hh_l0_reverse", "bias_hh_reverse"),
        ("logit_layer.0.weight", "hidden_weight_l0"),
        ("logit_layer.0.bias", "hidden_bias_l0"),
        ("logit_layer.2.weight", "hidden_weight_l1"),
        ("logit_layer.2.bias", "hidden_bias_l1"),
    )
    state_dict = {key: getattr(model, attr) for key, attr in key_to_attr}

    with open(output_file, "wb") as sink:
        pickle.dump(state_dict, sink)


# Main script
if __name__ == "__main__":
    # Special tokens read as globals by prepare_data; set here so this cell
    # does not depend on an earlier cell having been run.
    UNK_TOKEN = "<UNK>"
    PAD_TOKEN = "<PAD>"
    BOS_TOKEN = "시"
    EOS_TOKEN = "끝"
    SPLIT_TOKEN = "▁"

    # Load the model
    model = G2pM()

    # Load the training and development data
    train_sentences, train_labels = load_data("train.sent", "train.lb")
    dev_sentences, dev_labels = load_data("dev.sent", "dev.lb")

    # Load the char2idx and class2idx mappings (files are closed right after reading)
    with open("char2idx.pkl", "rb") as f:
        char2idx = pickle.load(f)
    with open("class2idx.pkl", "rb") as f:
        class2idx = pickle.load(f)

    # Prepare the data
    train_data = prepare_data(train_sentences, train_labels, char2idx, class2idx)
    dev_data = prepare_data(dev_sentences, dev_labels, char2idx, class2idx)

    # Training parameters
    epochs = 5
    batch_size = 32
    learning_rate = 0.001
    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-8

    # Initialize Adam optimizer parameters: one zero moment buffer per ndarray
    # attribute of the model
    t = 0
    m = {
        param: np.zeros_like(value)
        for param, value in model.__dict__.items()
        if isinstance(value, np.ndarray)
    }
    v = {
        param: np.zeros_like(value)
        for param, value in model.__dict__.items()
        if isinstance(value, np.ndarray)
    }

    # Training loop
    for epoch in range(epochs):
        # train_loss accumulates the SUM of per-target losses so that dividing
        # by total_targets yields the true per-target mean.
        train_loss = 0.0
        total_targets = 0
        with tqdm(total=len(train_data[0]), desc=f"Epoch {epoch+1}/{epochs}") as pbar:
            for inputs, targets, target_indices in get_batches(train_data, batch_size):
                t += 1
                target_count = sum(len(indices) for indices in target_indices)
                if target_count > 0:
                    # compute_loss returns the mean over this batch's targets;
                    # weight it by the target count before accumulating.
                    # Summing batch means and dividing by the target total
                    # (the previous behaviour) under-reported the loss.
                    loss = compute_loss(model, inputs, targets, target_indices)
                    train_loss += loss * target_count
                    total_targets += target_count

                # Compute gradients (this is a placeholder, you need to implement
                # backpropagation to get actual gradients). With all-zero
                # gradients the update below leaves the weights unchanged.
                grads = {
                    param: np.zeros_like(value)
                    for param, value in model.__dict__.items()
                    if isinstance(value, np.ndarray)
                }

                # Update weights
                update_weights(
                    model, grads, learning_rate, beta1, beta2, epsilon, t, m, v
                )

                # Update progress bar
                pbar.update(len(inputs))
                pbar.set_postfix(
                    {
                        "Train Loss": (
                            train_loss / total_targets if total_targets > 0 else 0.0
                        )
                    }
                )

        # Validation loop (same per-target weighting as above)
        dev_loss = 0.0
        total_dev_targets = 0
        for inputs, targets, target_indices in get_batches(dev_data, batch_size):
            target_count = sum(len(indices) for indices in target_indices)
            if target_count > 0:
                loss = compute_loss(model, inputs, targets, target_indices)
                dev_loss += loss * target_count
                total_dev_targets += target_count

        avg_train_loss = (
            train_loss / total_targets if total_targets > 0 else float("inf")
        )
        avg_dev_loss = (
            dev_loss / total_dev_targets if total_dev_targets > 0 else float("inf")
        )

        print(
            f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss}, Dev Loss: {avg_dev_loss}"
        )

    # Save the trained model
    save_model(model, "trained_np_ckpt.pkl")

In [6]:
# Test sentence (Chinese text mixed with digits and full-width punctuation);
# the ToJyutping cell further down reads this variable.
sentence = "然而，他红了20年以后，他竟退出了大家的视线。"

In [7]:
import pickle
import numpy as np
from g2pM2 import G2pM


# Load the trained model weights
def load_trained_model(model, ckpt_file):
    """Load a pickled state dict from ``ckpt_file`` and hand it to ``model.load_variable``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the pickle has been read, rather than being left for garbage collection.

    Args:
        model: Object exposing ``load_variable(state_dict)``.
        ckpt_file: Path of the pickled checkpoint.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted checkpoints.
    with open(ckpt_file, "rb") as f:
        state_dict = pickle.load(f)
    model.load_variable(state_dict)


# Initialize the model
model = G2pM()

# Load the trained weights (trained_np_ckpt.pkl is the file written by
# save_model at the end of the training cell)
load_trained_model(model, "trained_np_ckpt.pkl")

# Test sentence
sentence = "然而，他红了20年以后，他竟退出了大家的视线。"

# Predict pronunciations; per the recorded output the call returns a list of
# strings in which some input text (e.g. digits, punctuation) appears unchanged
predicted_pronunciations = model(sentence, tone=True, char_split=False)

# Print the result
print(predicted_pronunciations)

['jin4', 'nang4', '，', 'to1', '红', 'liu5', '20', 'nin4', 'ji5', 'hau6', '，', 'to1', 'ging2', 'teoi3', 'ceot1', 'liu5', 'taai3', 'ze1', 'dik1', '视线。']


In [8]:
import ToJyutping

# Romanize the same `sentence` with ToJyutping; presumably a reference to
# compare with the model output above (TODO confirm intent).
ToJyutping.get_jyutping_text(sentence)

'jin4 ji4, taa1 hung4 liu5 […] nin4 ji5 hau6, taa1 ging2 teoi3 ceot1 liu5 daai6 gaa1 dik1 si6 sin3.'

In [10]:
# Parse wordslist.csv into {word: [pronunciation, ...]}.
file_path = "wordslist.csv"

with open(file_path, "r", encoding="utf-8") as file:
    lines = list(file)

cantonese_dict = {}

# The first two lines are skipped. Rows are split on every comma: the first
# field is the word and the remaining fields are its pronunciations; rows
# without a comma are ignored.
for line in lines[2:]:
    parts = line.strip().split(",")
    if len(parts) < 2:
        continue
    word, *pronunciations = parts
    cantonese_dict[word] = pronunciations

In [14]:
import pickle
import numpy as np
from g2pM2 import G2pM


# Load the trained model weights
def load_trained_model(model, ckpt_file):
    """Load a pickled state dict from ``ckpt_file`` and hand it to ``model.load_variable``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the pickle has been read, rather than being left for garbage collection.

    Args:
        model: Object exposing ``load_variable(state_dict)``.
        ckpt_file: Path of the pickled checkpoint.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted checkpoints.
    with open(ckpt_file, "rb") as f:
        state_dict = pickle.load(f)
    model.load_variable(state_dict)


# Initialize the model
model = G2pM()

# Load the trained weights (trained_np_ckpt.pkl is the file written by
# save_model at the end of the training cell)
load_trained_model(model, "trained_np_ckpt.pkl")

# Test sentence
sentence = "然而，他红了20年以后，他竟退出了大家的视线。"

# Predict pronunciations
predicted_pronunciations = model(sentence, tone=True, char_split=False)

# Print the result as one string; the entries are joined with no separator,
# so adjacent syllables run together in the output
print(''.join(predicted_pronunciations))

jin4nang4，to1红liu520nin4ji5hau6，to1ging2teoi3ceot1liu5taai3ze1dik1视线。


In [24]:
import pickle
import numpy as np
from g2pM2 import G2pM
import pandas as pd
from tqdm import tqdm


# Load the trained model weights
def load_trained_model(model, ckpt_file):
    """Load a pickled state dict from ``ckpt_file`` and hand it to ``model.load_variable``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the pickle has been read, rather than being left for garbage collection.

    Args:
        model: Object exposing ``load_variable(state_dict)``.
        ckpt_file: Path of the pickled checkpoint.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted checkpoints.
    with open(ckpt_file, "rb") as f:
        state_dict = pickle.load(f)
    model.load_variable(state_dict)


# Build the model and restore the weights saved by the training cell.
model = G2pM()
load_trained_model(model, "trained_np_ckpt.pkl")

# Parse wordslist.csv into {word: [pronunciation, ...]}.
file_path = "wordslist.csv"

with open(file_path, "r", encoding="utf-8") as file:
    lines = list(file)

cantonese_dict = {}

# The first two lines are skipped. Rows are split on every comma: the first
# field is the word and the remaining fields are its pronunciations; rows
# without a comma are ignored.
for line in lines[2:]:
    parts = line.strip().split(",")
    if len(parts) < 2:
        continue
    word, *pronunciations = parts
    cantonese_dict[word] = pronunciations


# Function to compare predictions with actual values
def compare_predictions(word, actual_phonemes):
    """Run the global ``model`` on ``word`` and pair the result with the reference.

    Args:
        word: Text passed to ``model(word, tone=True, char_split=False)``.
        actual_phonemes: Reference pronunciation string, returned unchanged.

    Returns:
        ``(word, actual_phonemes, prediction_str)`` where ``prediction_str`` is
        the model's output items joined with single spaces, skipping ``None`` items.
    """
    predicted_items = model(word, tone=True, char_split=False)
    kept_items = filter(lambda item: item is not None, predicted_items)
    return word, actual_phonemes, " ".join(kept_items)


# Compare predictions for every dictionary entry, with a progress bar
results = []
incorrect_predictions = []
for word, phonemes in tqdm(
    list(cantonese_dict.items()), desc="Processing words"
):
    # Only the first listed pronunciation is used as the reference.
    result = compare_predictions(word, phonemes[0])
    results.append(result)
    if result[1] != result[2]:
        incorrect_predictions.append(result)

# Calculate success rate. Every result is either correct or recorded in
# incorrect_predictions, so the success count follows without a second pass.
total_count = len(results)
success_count = total_count - len(incorrect_predictions)
# Guard against an empty dictionary, which would otherwise divide by zero.
success_rate = success_count / total_count * 100 if total_count else 0.0

# Print the results
for result in results:
    word, actual_phonemes, prediction = result
    print(f"Word: {word} -> Actual: {actual_phonemes} -> Prediction: {prediction}")

# Print success rate
print(f"Success Count: {success_count}")
print(f"Success Rate: {success_rate:.2f}%")

# Store incorrect predictions for further analysis
incorrect_df = pd.DataFrame(
    incorrect_predictions, columns=["Word", "Actual", "Prediction"]
)
incorrect_df.to_csv("incorrect_predictions.csv", index=False)

# Display some statistics on incorrect predictions
print(f"Total Incorrect Predictions: {len(incorrect_predictions)}")
print(incorrect_df.head())

Processing words: 100%|██████████| 61148/61148 [00:00<00:00, 144919.41it/s]


Word: 0 -> Actual: ling4 -> Prediction: 0
Word: 0尊 -> Actual: ling4 zyun1 -> Prediction: 0 zyun1
Word: 0知 -> Actual: ling4 zi1 -> Prediction: 0 zi3
Word: 133 -> Actual: jat1 saam1 saam1 -> Prediction: 133
Word: 19 -> Actual: sap1 gau1 -> Prediction: 19
Word: 1999 -> Actual: jat1 gau2 gau2 gau2 -> Prediction: 1999
Word: 21314 -> Actual: ji6 jat1 saam1 jat1 sei3 -> Prediction: 21314
Word: 26 -> Actual: ji6 luk1 -> Prediction: 26
Word: 2A3O -> Actual: ji6 ei1 saam1 ou1 -> Prediction: 2A3O
Word: 3A電池 -> Actual: saam1 ei1 din6 ci4 -> Prediction: 3A din6 to4
Word: 3D打印 -> Actual: saam1 di1 daa2 jan3 -> Prediction: 3D ding2 ngan3
Word: 3P -> Actual: saam1 pi1 -> Prediction: 3P
Word: 420 -> Actual: sei3 ji6 ling4 -> Prediction: 420
Word: 426 -> Actual: sei3 ji6 luk6 -> Prediction: 426
Word: 438 -> Actual: sei3 saam1 baat3 -> Prediction: 438
Word: 4424 -> Actual: sei3 sei3 ji6 sei3 -> Prediction: 4424
Word: 46 -> Actual: sei3 luk1 -> Prediction: 46
Word: 489 -> Actual: sei3 baat3 gau2 -> Predic