In [None]:
import pandas as pd

dataset = pd.read_csv('/content/drive/MyDrive/w266 Final project/phoneme_text_training_data.csv')

df_train = dataset[dataset['split'] != 'val']
df_val   = dataset[dataset['split'] == 'val']

In [None]:
df_train.head()

Unnamed: 0,split,session,block_num,trial_num,phoneme_sequence,sentence
0,train,t15.2023.08.11,2,0,B R IH NG | IH T | K L OW S ER |,bring it closer.
1,train,t15.2023.08.11,2,1,M AY | F AE M AH L IY | IH Z | K L OW S ...,my family is closer.
2,train,t15.2023.08.11,2,2,W AH T | D UW | DH EY | L AY K |,what do they like?
3,train,t15.2023.08.11,2,3,HH AW | IH Z | DH AE T | G UH D |,how is that good?
4,train,t15.2023.08.11,2,4,N IY D | HH EH L P | HH IY R |,need help here?


In [None]:
df_val.head()

Unnamed: 0,split,session,block_num,trial_num,phoneme_sequence,sentence
8072,val,t15.2023.08.13,8,0,Y UW | K AE N | S IY | DH AH | K OW D ...,you can see the code at this point as well.
8073,val,t15.2023.08.13,8,1,HH AW | D AH Z | IH T | K IY P | DH AH...,how does it keep the cost down?
8074,val,t15.2023.08.13,8,2,N AA T | T UW | K AA N T R AH V ER SH AH L...,not too controversial.
8075,val,t15.2023.08.13,8,3,DH AH | JH UH R IY | AH N D | AH | JH ...,the jury and a judge work together on it.
8076,val,t15.2023.08.13,8,4,W ER | K W AY T | V OW K AH L | AH B AW ...,were quite vocal about it.


In [None]:
LOGIT_TO_PHONEME = [
'BLANK',
'AA', 'AE', 'AH', 'AO', 'AW',
'AY', 'B', 'CH', 'D', 'DH',
'EH', 'ER', 'EY', 'F', 'G',
'HH', 'IH', 'IY', 'JH', 'K',
'L', 'M', 'N', 'NG', 'OW',
'OY', 'P', 'R', 'S', 'SH',
'T', 'TH', 'UH', 'UW', 'V',
'W', 'Y', 'Z', 'ZH',
' | ',
]

In [None]:
import torch
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [None]:
import re

def parse_cmu_dict_simple(cmu_file_path):
    word2phonemes = {}
    stress_re = re.compile(r'\d$')  # matches a digit at the end

    with open(cmu_file_path, 'r', encoding='cp1252') as f:
      lines = f.readlines()

    # find the last comment line starting with ;;;
    last_comment_idx = 0
    for i, line in enumerate(lines):
        if line.startswith(";;;"):
            last_comment_idx = i

    # process lines after last comment
    for line in lines[last_comment_idx + 1:]:
        line = line.strip()
        if not line:
            continue

        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            continue
        word, phoneme_seq = parts
        phonemes = [stress_re.sub('', p) for p in phoneme_seq.split()]
        word2phonemes[word] = phonemes

    return word2phonemes

cmu_dict_path = "/content/drive/MyDrive/w266 Final project/cmudict-0.7b"
lexicon = parse_cmu_dict_simple(cmu_dict_path)

In [None]:
def phonemes_to_words(phonemes,lexicon):
  words = []
  current = []
  for p_token in phonemes:
    if p_token == "|":
      if current:
        word = lexicon.get(tuple(current),"<UNK>")
        words.append(word)
        current = []

    else:
      current.append(p_token)

  if current:
    words.append(lexicon.get(tuple(current),"<UNK>"))

  return words

In [None]:
phoneme_list = LOGIT_TO_PHONEME[1:] + ["<PAD>", "<UNK>"]

In [None]:
start_end_tokens = ["<START>", "<END>"]
vocab = start_end_tokens + phoneme_list

In [None]:
# Baseline
class PhonemeToText(nn.Module):
  def __init__(self,
               xphonebert_model,
               phoneme_embed_dim,
               tgt_vocab_size,
               decoder_dim=512,
               nhead=8,
               num_decoder_layers=4,
               ffnn_dim = 512,
               max_tgt_len=128,
               pad_token_id=0,
               start_token_id=1,
               stop_token_id=2,
               beam_size=10):
    super().__init__()

    self.encoder = xphonebert_model
    self.decoder_d_model = decoder_dim
    self.tgt_vocab_size = tgt_vocab_size
    self.max_tgt_len = max_tgt_len
    self.pad_token_id = pad_token_id
    self.start_token_id = start_token_id
    self.stop_token_id = stop_token_id
    self.beam_size = beam_size

    self.enc_proj = nn.Linear(phoneme_embed_dim, decoder_dim)

    # Decoder embeddings
    self.tgt_embed = nn.Embedding(tgt_vocab_size, decoder_dim, padding_idx=pad_token_id)
    self.pos_embed = nn.Embedding(max_tgt_len, decoder_dim)

    # Transformer decoder
    decoder_layer = nn.TransformerDecoderLayer(
        d_model=decoder_dim,
        nhead=nhead,
        dim_feedforward=ffnn_dim,
        batch_first=True
    )
    self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

    self.out_proj = nn.Linear(decoder_dim, tgt_vocab_size)

  def forward(self, phoneme_ids, tgt_ids=None):

    device = phoneme_ids.device
    B = phoneme_ids.size(0)

    # Encoder
    with torch.no_grad():
        enc_outputs = self.encoder(phoneme_ids)[0]
    enc_outputs = self.enc_proj(enc_outputs)

    if tgt_ids is None:
        return self.generate(phoneme_ids)

    # Decoder (Adapted from Vaswani et al. (2017))

    # Positional embeddings
    T_tgt = tgt_ids.size(1)
    tgt_pos = torch.arange(T_tgt, device=device).unsqueeze(0).expand(B, -1)
    tgt_emb = self.tgt_embed(tgt_ids) + self.pos_embed(tgt_pos)

    # Causal mask
    tgt_mask = nn.Transformer.generate_square_subsequent_mask(T_tgt).to(device)

    output = self.decoder(tgt_emb, enc_outputs, tgt_mask=tgt_mask)
    logits = self.out_proj(output)
    return logits

  def generate(self, phoneme_ids, max_len=None):
        # Standard beam search decoding algorithm implemented
        device = phoneme_ids.device
        max_len = max_len or self.max_tgt_len

        # Encode phonemes
        with torch.no_grad():
            enc_outputs = self.encoder(phoneme_ids)[0]
        enc_outputs = self.enc_proj(enc_outputs)

        beams = [(0.0, torch.tensor([[self.start_token_id]], device=device))]
        finished = []

        for idx in range(max_len):
            new_beams = []
            for score, seq in beams:
                tgt_pos = torch.arange(seq.size(1), device=device).unsqueeze(0)
                tgt_emb = self.tgt_embed(seq) + self.pos_embed(tgt_pos)
                tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq.size(1)).to(device)
                logits = self.decoder(tgt_emb, enc_outputs, tgt_mask=tgt_mask)
                logp = F.log_softmax(self.out_proj(logits[:, -1, :]), dim=-1)

                topk_scores, topk_ids = torch.topk(logp[0], self.beam_size)
                for tok_score, tok_id in zip(topk_scores.tolist(), topk_ids.tolist()):
                    new_seq = torch.cat([seq, torch.tensor([[tok_id]], device=device)], dim=1)
                    new_score = score + tok_score
                    if tok_id == self.stop_token_id:
                        finished.append((new_score, new_seq))
                    else:
                        new_beams.append((new_score, new_seq))

            beams = sorted(new_beams, key=lambda x: x[0], reverse=True)[:self.beam_size]
            if not beams:
                break

        if finished:
            best = max(finished, key=lambda x: x[0])
        else:
            best = max(beams, key=lambda x: x[0])
        return best[1].squeeze(0).tolist()

In [None]:
def train_one_epoch(model, dataloader, optimizer, CE_loss, device):
    model.train()
    total_loss = 0.0

    for batch in dataloader:
        phonemes = batch['phoneme_ids'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(phonemes, targets)

        B, T, V = logits.shape
        loss = CE_loss(logits.view(B*T, V), targets.view(B*T))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def validate(model, dataloader, CE_loss, device, phoneme_tokenizer=None, lexicon=None):
    model.eval()
    total_loss = 0.0
    all_preds, all_targets = [], []

    with torch.no_grad():
        for batch in dataloader:
            phonemes = batch['phoneme_ids'].to(device)
            targets = batch['labels'].to(device)

            logits = model(phonemes, targets)
            B, T, V = logits.shape
            loss = CE_loss(logits.view(B*T, V), targets.view(B*T))
            total_loss += loss.item()

            #Decoding logits to words
            preds = logits.argmax(dim=-1)
            if lexicon:
                for p_seq, t_seq in zip(preds, targets):
                    pred_words = text_tokenizer.decode(
                        [i.item() for i in p_seq if i.item() != text_tokenizer.pad_token_id],
                        skip_special_tokens=True
                    ).split()

                    tgt_words = text_tokenizer.decode(
                        [i.item() for i in t_seq if i.item() != text_tokenizer.pad_token_id],
                        skip_special_tokens=True
                    ).split()

                    all_preds.append(pred_words)
                    all_targets.append(tgt_words)

    return total_loss / len(dataloader), all_preds, all_targets


In [None]:
#pip install -U transformers

In [None]:
from transformers import AutoModel, AutoTokenizer

# Load pretrained XPhoneBERT
xphonebert = AutoModel.from_pretrained("vinai/xphonebert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/xphonebert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/350M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/xphonebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/350M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
from transformers import T5Tokenizer

text_tokenizer = T5Tokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
class PhonemeToTextDataset(torch.utils.data.Dataset):
    def __init__(self, data_df, phoneme_tokenizer, text_tokenizer):
        self.df = data_df.reset_index(drop=True)
        self.phoneme_tokenizer = phoneme_tokenizer
        self.text_tokenizer = text_tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        phonemes = row["phoneme_sequence"]
        sentence = row["sentence"]

        # Encode phonemes
        phoneme_ids = self.phoneme_tokenizer.encode(phonemes)
        phoneme_ids = torch.tensor(phoneme_ids, dtype=torch.long)
        phoneme_length = torch.tensor(len(phoneme_ids), dtype=torch.long)

        # Encode sentence into word IDs
        text_ids = self.text_tokenizer.encode(
            sentence,
            add_special_tokens=True
        )
        text_ids = torch.tensor(text_ids, dtype=torch.long)

        return {
            "phoneme_ids": phoneme_ids,         # encoder input
            "phoneme_lengths": phoneme_length,
            "text_ids": text_ids,               # decoder target
            "sentence": sentence
        }

In [None]:
def collate_fn(batch, phoneme_pad_id, text_pad_id):
    # unpack
    phoneme_seqs = [b["phoneme_ids"] for b in batch]
    text_seqs = [b["text_ids"] for b in batch]

    # lengths
    phoneme_lens = torch.tensor([len(x) for x in phoneme_seqs])

    # padding
    phoneme_padded = torch.nn.utils.rnn.pad_sequence(
        phoneme_seqs,
        batch_first=True,
        padding_value=phoneme_pad_id
    )

    text_padded = torch.nn.utils.rnn.pad_sequence(
        text_seqs,
        batch_first=True,
        padding_value=text_pad_id
    )

    return {
        "phoneme_ids": phoneme_padded,
        "phoneme_lengths": phoneme_lens,
        "decoder_input_ids": text_padded,
        "labels": text_padded
    }

In [None]:

train_dataset = PhonemeToTextDataset(df_train, tokenizer,text_tokenizer)
val_dataset = PhonemeToTextDataset(df_val, tokenizer,text_tokenizer)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=lambda b: collate_fn(b, tokenizer.pad_token_id, text_tokenizer.pad_token_id),
    num_workers=4,
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=lambda b: collate_fn(b, tokenizer.pad_token_id,text_tokenizer.pad_token_id),
    num_workers=4,
)


In [None]:
# Baseline
device = 'cuda' if torch.cuda.is_available() else 'cpu'
CE_loss = nn.CrossEntropyLoss(ignore_index=text_tokenizer.pad_token_id)
word_vocab_size = len(list(lexicon.keys()))
model = PhonemeToText(xphonebert,768,word_vocab_size,pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
num_epochs = 10

In [None]:
for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_dataloader, optimizer, CE_loss, device)
    val_loss, val_preds, val_targets = validate(model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1: Train Loss=3.8033, Val Loss=1.9075
Epoch 2: Train Loss=0.9867, Val Loss=1.1409
Epoch 3: Train Loss=0.5465, Val Loss=0.8671
Epoch 4: Train Loss=0.3521, Val Loss=0.7275
Epoch 5: Train Loss=0.2313, Val Loss=0.6429
Epoch 6: Train Loss=0.1543, Val Loss=0.5937
Epoch 7: Train Loss=0.1008, Val Loss=0.5540
Epoch 8: Train Loss=0.0636, Val Loss=0.5295
Epoch 9: Train Loss=0.0376, Val Loss=0.5131
Epoch 10: Train Loss=0.0216, Val Loss=0.5047


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smooth_fn = SmoothingFunction().method1

def compute_avg_bleu(preds, targets):
    scores = []
    for pred, ref in zip(preds, targets):
        score = sentence_bleu([ref], pred, smoothing_function=smooth_fn)
        scores.append(score)
    return sum(scores) / len(scores)

avg_bleu = compute_avg_bleu(val_preds, val_targets)
print(f"Average BLEU score: {avg_bleu:.4f}")


Average BLEU score: 0.8376


In [None]:
!pip install jiwer

In [None]:
import jiwer

def compute_avg_wer(preds, targets):

    total_wer = 0.0
    for pred, ref in zip(preds, targets):
        pred_str = " ".join(pred)
        ref_str = " ".join(ref)
        total_wer += jiwer.wer(ref_str, pred_str)
    return total_wer / len(preds)

avg_wer = compute_avg_wer(val_preds, val_targets)
print(f"Average WER: {avg_wer:.4f}")


Average WER: 0.0887


In [None]:
# Experiment 1
# Reducing beam size during decoding
exp_1_model = PhonemeToText(xphonebert,768,word_vocab_size,beam_size=5,pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_1_model.parameters(), lr=2e-4)
num_epochs = 10
for epoch in range(num_epochs):
    exp_1_train_loss = train_one_epoch(exp_1_model, train_dataloader, optimizer, CE_loss, device)
    exp_1_val_loss, exp_1_val_preds, exp_1_val_targets = validate(exp_1_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_1_train_loss:.4f}, Val Loss={exp_1_val_loss:.4f}")

Epoch 1: Train Loss=3.7857, Val Loss=1.8909
Epoch 2: Train Loss=0.9801, Val Loss=1.1336
Epoch 3: Train Loss=0.5452, Val Loss=0.8681
Epoch 4: Train Loss=0.3488, Val Loss=0.7296
Epoch 5: Train Loss=0.2309, Val Loss=0.6478
Epoch 6: Train Loss=0.1548, Val Loss=0.5918
Epoch 7: Train Loss=0.1022, Val Loss=0.5557
Epoch 8: Train Loss=0.0643, Val Loss=0.5283
Epoch 9: Train Loss=0.0382, Val Loss=0.5153
Epoch 10: Train Loss=0.0220, Val Loss=0.5057


In [None]:
exp_1_avg_bleu = compute_avg_bleu(exp_1_val_preds, exp_1_val_targets)
exp_1_avg_wer = compute_avg_wer(exp_1_val_preds, exp_1_val_targets)
print(f"Average BLEU score: {exp_1_avg_bleu:.4f}")
print(f"Average WER: {exp_1_avg_wer:.4f}")

Average BLEU score: 0.8085
Average WER: 0.1510


In [None]:
# Experiment 2
# Increasing beam size during decoding
exp_2_model = PhonemeToText(xphonebert,768,word_vocab_size,beam_size=15,pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_2_model.parameters(), lr=2e-4)
num_epochs = 10
for epoch in range(num_epochs):
    exp_2_train_loss = train_one_epoch(exp_2_model, train_dataloader, optimizer, CE_loss, device)
    exp_2_val_loss, exp_2_val_preds, exp_2_val_targets = validate(exp_2_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_2_train_loss:.4f}, Val Loss={exp_2_val_loss:.4f}")


Epoch 1: Train Loss=3.8215, Val Loss=1.8931
Epoch 2: Train Loss=0.9844, Val Loss=1.1367
Epoch 3: Train Loss=0.5466, Val Loss=0.8711
Epoch 4: Train Loss=0.3552, Val Loss=0.7267
Epoch 5: Train Loss=0.2330, Val Loss=0.6492
Epoch 6: Train Loss=0.1567, Val Loss=0.5935
Epoch 7: Train Loss=0.1026, Val Loss=0.5568
Epoch 8: Train Loss=0.0650, Val Loss=0.5289
Epoch 9: Train Loss=0.0387, Val Loss=0.5115
Epoch 10: Train Loss=0.0221, Val Loss=0.5063


In [None]:
exp_2_avg_bleu = compute_avg_bleu(exp_2_val_preds, exp_2_val_targets)
exp_2_avg_wer = compute_avg_wer(exp_2_val_preds, exp_2_val_targets)
print(f"Average BLEU score: {exp_2_avg_bleu:.4f}")
print(f"Average WER: {exp_2_avg_wer:.4f}")


Average BLEU score: 0.8205
Average WER: 0.1358


In [None]:
# Experiment 3
# Reducing beam size during decoding and increasing number of decoder layers
exp_3_model = PhonemeToText(xphonebert,768,word_vocab_size,num_decoder_layers=8,beam_size=5,pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_3_model.parameters(), lr=2e-4)
num_epochs = 10
for epoch in range(num_epochs):
    exp_3_train_loss = train_one_epoch(exp_3_model, train_dataloader, optimizer, CE_loss, device)
    exp_3_val_loss, exp_3_val_preds, exp_3_val_targets = validate(exp_3_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_3_train_loss:.4f}, Val Loss={exp_3_val_loss:.4f}")


Epoch 1: Train Loss=4.0958, Val Loss=2.0347
Epoch 2: Train Loss=1.0819, Val Loss=1.1976
Epoch 3: Train Loss=0.6025, Val Loss=0.9004
Epoch 4: Train Loss=0.3946, Val Loss=0.7523
Epoch 5: Train Loss=0.2661, Val Loss=0.6617
Epoch 6: Train Loss=0.1795, Val Loss=0.6064
Epoch 7: Train Loss=0.1204, Val Loss=0.5653
Epoch 8: Train Loss=0.0783, Val Loss=0.5352
Epoch 9: Train Loss=0.0485, Val Loss=0.5171
Epoch 10: Train Loss=0.0279, Val Loss=0.5070


In [None]:
exp_3_avg_bleu = compute_avg_bleu(exp_3_val_preds, exp_3_val_targets)
exp_3_avg_wer = compute_avg_wer(exp_3_val_preds, exp_3_val_targets)
print(f"Average BLEU score: {exp_3_avg_bleu:.4f}")
print(f"Average WER: {exp_3_avg_wer:.4f}")


Average BLEU score: 0.5503
Average WER: 0.8759


In [None]:
# Experiment 4
# Doubling the FFNN decoder dimension
exp_4_model = PhonemeToText(xphonebert,768,word_vocab_size,num_decoder_layers=4,ffnn_dim=1024, decoder_dim=1024, pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_4_model.parameters(), lr=2e-4)
num_epochs = 10
for epoch in range(num_epochs):
    exp_4_train_loss = train_one_epoch(exp_4_model, train_dataloader, optimizer, CE_loss, device)
    exp_4_val_loss, exp_4_val_preds, exp_4_val_targets = validate(exp_4_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_4_train_loss:.4f}, Val Loss={exp_4_val_loss:.4f}")


Epoch 1: Train Loss=2.3355, Val Loss=1.0924
Epoch 2: Train Loss=0.4580, Val Loss=0.7045
Epoch 3: Train Loss=0.2025, Val Loss=0.5736
Epoch 4: Train Loss=0.0905, Val Loss=0.5101
Epoch 5: Train Loss=0.0311, Val Loss=0.4837
Epoch 6: Train Loss=0.0091, Val Loss=0.4803
Epoch 7: Train Loss=0.0050, Val Loss=0.4785
Epoch 8: Train Loss=0.0035, Val Loss=0.4797
Epoch 9: Train Loss=0.0027, Val Loss=0.4788
Epoch 10: Train Loss=0.0021, Val Loss=0.4783


In [None]:
exp_4_avg_bleu = compute_avg_bleu(exp_4_val_preds, exp_4_val_targets)
exp_4_avg_wer = compute_avg_wer(exp_4_val_preds, exp_4_val_targets)
print(f"Average BLEU score: {exp_4_avg_bleu:.4f}")
print(f"Average WER: {exp_4_avg_wer:.4f}")


Average BLEU score: 0.8345
Average WER: 0.1011


In [None]:
# Experiment 5
# Increasing beam size during decoding and doubling the number of decoding layers
exp_5_model = PhonemeToText(xphonebert,768,word_vocab_size,num_decoder_layers=8,beam_size=10,pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_5_model.parameters(), lr=2e-4)
num_epochs = 10
for epoch in range(num_epochs):
    exp_5_train_loss = train_one_epoch(exp_5_model, train_dataloader, optimizer, CE_loss, device)
    exp_5_val_loss, exp_5_val_preds, exp_5_val_targets = validate(exp_5_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_5_train_loss:.4f}, Val Loss={exp_5_val_loss:.4f}")


Epoch 1: Train Loss=4.1022, Val Loss=2.0230
Epoch 2: Train Loss=1.0790, Val Loss=1.1936
Epoch 3: Train Loss=0.5947, Val Loss=0.9020
Epoch 4: Train Loss=0.3844, Val Loss=0.7545
Epoch 5: Train Loss=0.2587, Val Loss=0.6644
Epoch 6: Train Loss=0.1751, Val Loss=0.6073
Epoch 7: Train Loss=0.1173, Val Loss=0.5638
Epoch 8: Train Loss=0.0765, Val Loss=0.5378
Epoch 9: Train Loss=0.0470, Val Loss=0.5195
Epoch 10: Train Loss=0.0267, Val Loss=0.5111


In [None]:
exp_5_avg_bleu = compute_avg_bleu(exp_5_val_preds, exp_5_val_targets)
exp_5_avg_wer = compute_avg_wer(exp_5_val_preds, exp_5_val_targets)
print(f"Average BLEU score: {exp_5_avg_bleu:.4f}")
print(f"Average WER: {exp_5_avg_wer:.4f}")


Average BLEU score: 0.8366
Average WER: 0.0999


In [None]:
# Experiment 6
# Reducing number of attention heads
exp_6_model = PhonemeToText(xphonebert,768,word_vocab_size,num_decoder_layers=4,nhead=4, pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_6_model.parameters(), lr=2e-4)
num_epochs = 10
for epoch in range(num_epochs):
    exp_6_train_loss = train_one_epoch(exp_6_model, train_dataloader, optimizer, CE_loss, device)
    exp_6_val_loss, exp_6_val_preds, exp_6_val_targets = validate(exp_6_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_6_train_loss:.4f}, Val Loss={exp_6_val_loss:.4f}")


Epoch 1: Train Loss=3.7569, Val Loss=1.8672
Epoch 2: Train Loss=0.9720, Val Loss=1.1315
Epoch 3: Train Loss=0.5446, Val Loss=0.8644
Epoch 4: Train Loss=0.3543, Val Loss=0.7246
Epoch 5: Train Loss=0.2370, Val Loss=0.6411
Epoch 6: Train Loss=0.1562, Val Loss=0.5889
Epoch 7: Train Loss=0.1040, Val Loss=0.5478
Epoch 8: Train Loss=0.0664, Val Loss=0.5243
Epoch 9: Train Loss=0.0401, Val Loss=0.5079
Epoch 10: Train Loss=0.0232, Val Loss=0.4999


In [None]:
exp_6_avg_bleu = compute_avg_bleu(exp_6_val_preds, exp_6_val_targets)
exp_6_avg_wer = compute_avg_wer(exp_6_val_preds, exp_6_val_targets)
print(f"Average BLEU score: {exp_6_avg_bleu:.4f}")
print(f"Average WER: {exp_6_avg_wer:.4f}")


Average BLEU score: 0.8311
Average WER: 0.1123


In [None]:
# Experiment 7
# Halving the learning rate during training
exp_7_model = PhonemeToText(xphonebert,768,word_vocab_size,num_decoder_layers=4,pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_7_model.parameters(), lr=1e-4)
num_epochs = 10
for epoch in range(num_epochs):
    exp_7_train_loss = train_one_epoch(exp_7_model, train_dataloader, optimizer, CE_loss, device)
    exp_7_val_loss, exp_7_val_preds, exp_7_val_targets = validate(exp_7_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_7_train_loss:.4f}, Val Loss={exp_7_val_loss:.4f}")


Epoch 1: Train Loss=5.3418, Val Loss=2.9114
Epoch 2: Train Loss=1.7803, Val Loss=1.7831
Epoch 3: Train Loss=1.0477, Val Loss=1.3309
Epoch 4: Train Loss=0.7320, Val Loss=1.0826
Epoch 5: Train Loss=0.5454, Val Loss=0.9365
Epoch 6: Train Loss=0.4283, Val Loss=0.8343
Epoch 7: Train Loss=0.3390, Val Loss=0.7609
Epoch 8: Train Loss=0.2702, Val Loss=0.7051
Epoch 9: Train Loss=0.2166, Val Loss=0.6631
Epoch 10: Train Loss=0.1758, Val Loss=0.6276


In [None]:
exp_7_avg_bleu = compute_avg_bleu(exp_7_val_preds, exp_7_val_targets)
exp_7_avg_wer = compute_avg_wer(exp_7_val_preds, exp_7_val_targets)
print(f"Average BLEU score: {exp_7_avg_bleu:.4f}")
print(f"Average WER: {exp_7_avg_wer:.4f}")

Average BLEU score: 0.8107
Average WER: 0.1112


In [None]:
# Experiment 8
# Halving the number of epochs during training
exp_8_model = PhonemeToText(xphonebert,768,word_vocab_size,num_decoder_layers=4,pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_8_model.parameters(), lr=2e-4)
num_epochs = 5
for epoch in range(num_epochs):
    exp_8_train_loss = train_one_epoch(exp_8_model, train_dataloader, optimizer, CE_loss, device)
    exp_8_val_loss, exp_8_val_preds, exp_8_val_targets = validate(exp_8_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_8_train_loss:.4f}, Val Loss={exp_8_val_loss:.4f}")


Epoch 1: Train Loss=3.8359, Val Loss=1.9016
Epoch 2: Train Loss=0.9860, Val Loss=1.1417
Epoch 3: Train Loss=0.5494, Val Loss=0.8684
Epoch 4: Train Loss=0.3482, Val Loss=0.7307
Epoch 5: Train Loss=0.2309, Val Loss=0.6470


In [None]:
exp_8_avg_bleu = compute_avg_bleu(exp_8_val_preds, exp_8_val_targets)
exp_8_avg_wer = compute_avg_wer(exp_8_val_preds, exp_8_val_targets)
print(f"Average BLEU score: {exp_8_avg_bleu:.4f}")
print(f"Average WER: {exp_8_avg_wer:.4f}")

Average BLEU score: 0.8053
Average WER: 0.1059


In [None]:
# Experiment 9
# Increased number of epochs during training
exp_9_model = PhonemeToText(xphonebert,768,word_vocab_size,num_decoder_layers=4,pad_token_id=tokenizer.pad_token_id).to(device)
optimizer = torch.optim.Adam(exp_9_model.parameters(), lr=2e-4)
num_epochs = 15
for epoch in range(num_epochs):
    exp_9_train_loss = train_one_epoch(exp_9_model, train_dataloader, optimizer, CE_loss, device)
    exp_9_val_loss, exp_9_val_preds, exp_9_val_targets = validate(exp_9_model, val_dataloader, CE_loss, device, tokenizer, lexicon)

    print(f"Epoch {epoch+1}: Train Loss={exp_9_train_loss:.4f}, Val Loss={exp_9_val_loss:.4f}")


Epoch 1: Train Loss=3.7818, Val Loss=1.8901
Epoch 2: Train Loss=0.9844, Val Loss=1.1402
Epoch 3: Train Loss=0.5479, Val Loss=0.8729
Epoch 4: Train Loss=0.3511, Val Loss=0.7320
Epoch 5: Train Loss=0.2328, Val Loss=0.6455
Epoch 6: Train Loss=0.1569, Val Loss=0.5871
Epoch 7: Train Loss=0.1030, Val Loss=0.5514
Epoch 8: Train Loss=0.0651, Val Loss=0.5234
Epoch 9: Train Loss=0.0387, Val Loss=0.5100
Epoch 10: Train Loss=0.0221, Val Loss=0.4997
Epoch 11: Train Loss=0.0137, Val Loss=0.4966
Epoch 12: Train Loss=0.0094, Val Loss=0.4954
Epoch 13: Train Loss=0.0071, Val Loss=0.4927
Epoch 14: Train Loss=0.0056, Val Loss=0.4919
Epoch 15: Train Loss=0.0045, Val Loss=0.4920


In [None]:
exp_9_avg_bleu = compute_avg_bleu(exp_9_val_preds, exp_9_val_targets)
exp_9_avg_wer = compute_avg_wer(exp_9_val_preds, exp_9_val_targets)
print(f"Average BLEU score: {exp_9_avg_bleu:.4f}")
print(f"Average WER: {exp_9_avg_wer:.4f}")

Average BLEU score: 0.7343
Average WER: 0.2843
