Defining Functions for Data Loading

In [1]:
"""
Use this file as a starting point to understand how to load in the data.
"""

import click
import pickle
import json
import numpy as np
from torch.utils.data import Dataset, DataLoader

TRAIN_PATH = './data/train.jsonl'
TEST_PATH = './data/test.jsonl'
VECS_PATH = './data/buckeye.vecs'
BATCH_SIZE = 64
SHUFFLE_DATA = True



def read_jsonl_file(path: str):
    data = []
    with open(path, 'r') as fid:
        for line in fid:
            data.append(json.loads(line))
    return data


def compute_log_duration(record):
    return np.log(
        np.sum(record['segment_duration_ms']))


def extract_phones(record):
    return record['observed_pron'].split(" ")


def load_vecs(path):
    with open(path, 'rb') as fid:
        return pickle.load(fid)


def get_embedding(record, vecs):
    return vecs[record['word']]


def read_record(record, vecs):
    phones = extract_phones(record)
    log_duration = compute_log_duration(record)
    embedding = get_embedding(record, vecs)
    return phones, embedding, log_duration


class BuckeyeDataset(Dataset):
    def __init__(self, jsonl_path, embeddings_path):
        self.records = read_jsonl_file(jsonl_path)
        self.vecs = load_vecs(embeddings_path)

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        phones, embedding, log_duration = read_record(record, self.vecs)
        return phones, embedding, log_duration

Linear model

In [32]:
import torch
from train_test_models import BuckeyeDataset, DataLoader

INPUT_SIZE = 1          # number of segments
OUTPUT_SIZE = 1         # total word duration
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0
MOMENTUM = 0.001
MAX_LOSS = 1.
N_EPOCHS = 20000
BATCH_SIZE = 64
SHUFFLE = True
EVERY = 2000
VECS_PATH = './data/buckeye.vecs'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

training_data = BuckeyeDataset("./data/train.jsonl", VECS_PATH)
test_data = BuckeyeDataset("./data/test.jsonl", VECS_PATH)
train_dataloader = DataLoader(
    training_data, batch_size=64, shuffle=SHUFFLE, collate_fn=lambda x: x
)

class LinearModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)
    
    def forward(self, x):
        out = self.linear(x)
        return out
        
####################################################
### MODEL WITH JUST LENGTH IN NUMBER OF SEGMENTS ###
model = LinearModel(INPUT_SIZE, OUTPUT_SIZE).to(device)

for i in range(N_EPOCHS):
    criterion = torch.nn.MSELoss() 
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
    # make sure we "zero out" the loss at each time step
    optimizer.zero_grad()
    batch_data = next(iter(train_dataloader))
    xs, ys = [], []
    for segments, embedding, duration in batch_data:
        xs.append(len(segments))
        ys.append(duration)
    xs = torch.Tensor(xs).reshape(-1, 1).to(device)
    ys = torch.Tensor(ys).reshape(-1, 1).to(device)
    loss = criterion(model(xs), ys)
    if i % EVERY==0:
        print(f"Loss: {loss}")
    # do backprop over that loss
    loss.backward()
    b, m = model.parameters()
    #if i % EVERY==0:
    #    print(f"Intercept: {b.detach()[0]}, Slope: {m.detach()[0]}")
    # move on to the next time step
    optimizer.step()

## TEST ##
test_criterion = torch.nn.MSELoss()
test_xs, test_ys = [], []
for segments, embedding, duration in test_data:
    test_xs.append(len(segments))
    test_ys.append(duration)
test_xs = torch.Tensor(test_xs).reshape(-1, 1).to(device)
test_ys = torch.Tensor(test_ys).reshape(-1, 1).to(device)
test_loss = test_criterion(model(test_xs), test_ys).detach()
print(f"Final loss on the test data is: {loss}") # tensor(0.6382)

#####################################################################
### MODEL WITH LENGTH IN NUMBER OF SEGMENTS PLUS WORD EMBEDDINGS ****
EMBEDDING_SIZE = 50
model = LinearModel(INPUT_SIZE + EMBEDDING_SIZE, OUTPUT_SIZE)

for i in range(N_EPOCHS):
    criterion = torch.nn.MSELoss() 
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
    # make sure we "zero out" the loss at each time step
    optimizer.zero_grad()
    batch_data = next(iter(train_dataloader))
    xs, ys = [], []
    for segments, embedding, duration in batch_data:
        n_segments = len(segments)
        # combine two input spaces
        big_x = torch.concat([torch.Tensor([n_segments]), torch.Tensor(embedding)])
        xs.append(big_x)
        ys.append(duration)
    xs = torch.stack(xs).to(device)
    ys = torch.Tensor(ys).reshape(-1, 1).to(device)
    loss = criterion(model(xs), ys)
    if i % EVERY==0:
        print(f"Loss: {loss}")
    # do backprop over that loss
    loss.backward()
    b, m = model.parameters()
    #if i % EVERY==0:
        #print(f"Intercept: {b.detach()[0]}, Slope: {m.detach()[0]}")
    # move on to the next time step
    optimizer.step()

## TEST ##
test_criterion = torch.nn.MSELoss()
test_xs, test_ys = [], []
for segments, embedding, duration in batch_data:
    n_segments = len(segments)
    # combine two input spaces
    big_x = torch.concat([torch.Tensor([n_segments]), torch.Tensor(embedding)])
    test_xs.append(big_x)
    test_ys.append(duration)
test_xs = torch.stack(test_xs).to(device)
test_ys = torch.Tensor(test_ys).reshape(-1, 1).to(device)
test_loss = test_criterion(model(test_xs), test_ys).detach()
print(f"Final loss on the test data is: {loss}") # 0.6157650947570801

Loss: 72.12396240234375
Loss: 2.266043186187744
Loss: 0.5273388624191284
Loss: 0.684627115726471
Loss: 0.8294935822486877
Loss: 0.7337585687637329
Loss: 0.43014270067214966
Loss: 0.4064076840877533
Loss: 0.6353008151054382
Loss: 0.5439980030059814
Final loss on the test data is: 0.7615457773208618
Loss: 57.227073669433594
Loss: 4.095346450805664
Loss: 1.4785360097885132
Loss: 0.6148879528045654
Loss: 0.3057832717895508
Loss: 0.7791650891304016
Loss: 0.49807560443878174
Loss: 0.6263979077339172
Loss: 0.23934507369995117
Loss: 0.3244963586330414
Final loss on the test data is: 0.28784459829330444


CBOW Model

In [34]:
import numpy as np
import torch
from collections import Counter
from train_test_models import BuckeyeDataset, DataLoader
import pickle
import dill
from word2vec import Vocab
from torch.nn.utils.rnn import pad_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

OUTPUT_SIZE = 1         # total word duration
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0
MOMENTUM = 0.001
MAX_LOSS = 1.
MAX_NORM = 1.
N_EPOCHS = 4
BATCH_SIZE = 64
SHUFFLE = True
DEVICE = 'cpu'
EMBEDDING_SIZE = 8
MAX_NORM = 1
EVERY = 1000
#EMBEDDING_FILE = './data/phone_weights.pt'
EMBEDDING_FILE = './lab_2_data/word2_Vec_Lab_model.pt'
#PHONE_VOCAB_FILE = './data/phones.vocab'
PHONE_VOCAB_FILE = './lab_2_data/word2_Vec_Lab_model.vocab'
TRAIN_PATH = './data/train.jsonl'
TEST_PATH = './data/test.jsonl'
VECS_PATH = "./data/buckeye.vecs"

# Fetching the data
training_data = BuckeyeDataset(TRAIN_PATH, VECS_PATH)
test_data = BuckeyeDataset(TEST_PATH, VECS_PATH)
train_dataloader = DataLoader(
    training_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=lambda x: x
)
test_dataloader = DataLoader(
    test_data, batch_size=BATCH_SIZE, collate_fn= lambda x:x
)
#phone_embeds = torch.load(EMBEDDING_FILE)


#phone_embeds, phone_vocab = train_model()
#embedding_dim = phone_embeds.embedding.weight.size()[-1]


class LinearModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)
    
    def forward(self, x):
        out = self.linear(x)
        return out

class Vocab():
    def __init__(self, segments: list):
        self._compute_frequency_table(segments)
        print(self.frequency_table)
        self._build_ix_to_vocab_dicts()

    def _compute_frequency_table(self, segments):
        self.frequency_table = Counter(segments)
        self.vocab_size = len(self.frequency_table)
    
    def _build_ix_to_vocab_dicts(self):
        self.ix_to_vocab = {
            i: phone for i, phone in enumerate(self.frequency_table)
            if self.frequency_table[phone] > 0
        }
        self.vocab_to_ix = {
            self.ix_to_vocab[w]: w for w in self.ix_to_vocab.keys()
        }

    def tokenize(self, list_of_segments):
        return torch.tensor(
            [self.vocab_to_ix[w] for w in list_of_segments], dtype=torch.long,
            device=DEVICE).view(-1, 1)
    
    def detokenize(self, tensor):
        return torch.tensor(
            [self.ix_to_vocab[ix] for ix in tensor], dtype=torch.long,
            device=DEVICE).view(-1, 1)


class Word2Vec(torch.nn.Module):
    def __init__(self, input_size: int, embedding_size: int, output_size: int=None, max_norm=None):
        super(Word2Vec, self).__init__()
        self.embedding = torch.nn.Embedding(
            input_size,
            embedding_size,
            max_norm=MAX_NORM
            )
        if output_size is None:
            self.linear = torch.nn.Linear(embedding_size, input_size)
        else:
            self.linear = torch.nn.Linear(embedding_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x


class MultilayerDurationModel(torch.nn.Module):
    def __init__(self, input_size: int, output_size: int=None, max_norm=None):
        super(MultilayerDurationModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, 64) # flat
        self.hidden = torch.nn.Linear(64, output_size)
        # TODO: Add an intermediate layer

    def forward(self, x):
        x = torch.nn.functional.relu(self.linear(x))
        x = self.hidden(x)
        # TODO: Add the processing of the intermediate layer
        return x


def tokenize(vocab, list_of_segments):
    return torch.tensor(
        [vocab.vocab_to_ix[w] for w in list_of_segments if w in vocab.frequency_table], dtype=torch.long,
        device=DEVICE).view(-1, 1)

# Fetching the vocab and word2Vec model
phone_vocab = dill.load(open(PHONE_VOCAB_FILE, 'rb'))
phone_embeds = Word2Vec(
        input_size=phone_vocab.vocab_size,
        embedding_size=EMBEDDING_SIZE,
        max_norm=MAX_NORM
        )
phone_embeds.load_state_dict(torch.load(EMBEDDING_FILE))
embedding_dim = phone_embeds.embedding.weight.size()[-1]


####################################################

print("Training The Model ")
max_training_length = max([len(s[0]) for s in training_data]) # TODO: Figure out this number from the training data

#####################################################################
### LINEAR MODEL EMBEDDINGS CONCATENATED FOR EACH SEGMENT        ****
#####################################################################

modelL = LinearModel(max_training_length * embedding_dim, OUTPUT_SIZE).to(DEVICE)

#model = LinearModel(embedding_dim, OUTPUT_SIZE)  # TODO: Figure out the right shape
embedding_matrix = phone_embeds.embedding.weight.detach()
input_dims = max_training_length * embedding_dim

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(modelL.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

for i in range(N_EPOCHS):
    print("EPOCH : " + str(i))
    running_loss = 0.0
    for idx, batch_data in enumerate(train_dataloader, 0):
        xs, ys = [], []

        # make sure we "zero out" the loss at each time step
        optimizer.zero_grad()
        batch_segments = []
        for segments, embedding, duration in batch_data:
            # TODO: concatenate the phone embeddings associated with all segments
            # TIP: reference your pt & vocab files
            segments_embed = []
            if len(segments) > 0:
                #concatenating all the segment embedding into a single tensor
                for segment in segments:
                    segment_id = tokenize(phone_vocab, segment)
                    segment_embedding = embedding_matrix[segment_id].flatten().tolist()
                    segments_embed.append(segment_embedding)
            #embeds = torch.Tensor(np.array(np.concatenate(segments_embed).flat)).flatten()
            embeds = list(np.concatenate(segments_embed).flat)
            batch_segments.append(embeds)

            #Adding padding to make it of the size of largest sequence tensor
            #padding_len = int((input_dims - (embeds.shape[0]))/2)
            #padding = torch.nn.ConstantPad1d(padding_len, 0.)

            #ConstPad1d() takes care of tensors that are bigger than max size.
            #padded_embeds = padding(embeds)
            ys.append(duration)

        # Adding padding to make it of the size of largest sequence tensor
        padded_embeds = torch.Tensor(pad_sequences(batch_segments, value=0., padding='post', truncating='post', maxlen=input_dims, dtype='float32'))
        #xs = torch.stack(xs)
        ys = torch.Tensor(ys).reshape(-1, 1).to(DEVICE)
        loss = criterion(modelL(padded_embeds), ys)
        running_loss += loss.item()
        if idx % EVERY==0:
            print(f"Loss: {loss}")

        # do backprop over that loss
        loss.backward()
        #b, m = model.parameters()
        #if i % EVERY==0:
        #    print(f"Intercept: {b.detach()[0]}, Slope: {m.detach()[0]}")
        # move on to the next time step
        optimizer.step()
    print(loss.item())

print("\n DONE Training ")

## TEST ##
print("\n \n Evaluating the Linear model on Test Data")
modelL.eval()

for idx, batch_data in enumerate(test_dataloader, 0):
    xs, ys = [], []
    optimizer.zero_grad()
    batch_segments = []
    for segments, embedding, duration in batch_data:
        # TODO: concatenate the phone embeddings associated with all segments
        # TIP: reference your pt & vocab files
        segments_embed = []
        if len(segments) > 0:
            # concatenating all the segment embedding into a single tensor
            for segment in segments:
                segment_id = tokenize(phone_vocab, segment)
                segment_embedding = embedding_matrix[segment_id].flatten().tolist()
                segments_embed.append(segment_embedding)

        embeds = list(np.concatenate(segments_embed).flat)
        batch_segments.append(embeds)

        # Adding padding to make it of the size of largest sequence tensor
        #padding_len = int((input_dims - (embeds.shape[0])) / 2)
        #padding = torch.nn.ConstantPad1d(padding_len, 0.)
        ys.append(duration)

    padded_embeds = torch.Tensor(pad_sequences(batch_segments, value=0., padding='post', truncating='post', maxlen=input_dims, dtype='float32'))
    ys = torch.Tensor(ys).reshape(-1, 1).to(DEVICE)
    loss = criterion(modelL(padded_embeds), ys)
    if idx % EVERY == 0:
        print(f"Loss: {loss}")

print(f"Final loss on the test data  for linear model is: {loss}")
torch.save(modelL.state_dict(), "./data/cbow_linear_model.pt")

#####################################################################
### MULTILAYER MODEL EMBEDDINGS CONCATENATED FOR EACH SEGMENT    ****
#####################################################################

modelM = MultilayerDurationModel(max_training_length * embedding_dim, OUTPUT_SIZE).to(DEVICE)
#model = LinearModel(embedding_dim, OUTPUT_SIZE)  # TODO: Figure out the right shape
embedding_matrix = phone_embeds.embedding.weight.detach()
input_dims = max_training_length * embedding_dim

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(modelM.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

for i in range(N_EPOCHS):
    print("EPOCH : " + str(i))
    running_loss = 0.0
    for idx, batch_data in enumerate(train_dataloader, 0):
        xs, ys = [], []

        # make sure we "zero out" the loss at each time step
        optimizer.zero_grad()
        batch_segments = []
        for segments, embedding, duration in batch_data:
            # TODO: concatenate the phone embeddings associated with all segments
            # TIP: reference your pt & vocab files
            segments_embed = []
            if len(segments) > 0:
                #concatenating all the segment embedding into a single tensor
                for segment in segments:
                    segment_id = tokenize(phone_vocab, segment)
                    segment_embedding = embedding_matrix[segment_id].flatten().tolist()
                    segments_embed.append(segment_embedding)
            #embeds = torch.Tensor(np.array(np.concatenate(segments_embed).flat)).flatten()
            embeds = list(np.concatenate(segments_embed).flat)
            batch_segments.append(embeds)

            #Adding padding to make it of the size of largest sequence tensor
            #padding_len = int((input_dims - (embeds.shape[0]))/2)
            #padding = torch.nn.ConstantPad1d(padding_len, 0.)

            #ConstPad1d() takes care of tensors that are bigger than max size.
            #padded_embeds = padding(embeds)
            ys.append(duration)

        # Adding padding to make it of the size of largest sequence tensor
        padded_embeds = torch.Tensor(pad_sequences(batch_segments, value=0., padding='post', truncating='post', maxlen=input_dims, dtype='float32'))
        #xs = torch.stack(xs)
        ys = torch.Tensor(ys).reshape(-1, 1).to(DEVICE)
        loss = criterion(modelM(padded_embeds), ys)
        running_loss += loss.item()
        if idx % EVERY==0:
            print(f"Loss: {loss}")

        # do backprop over that loss
        loss.backward()
        #b, m = model.parameters()
        #if i % EVERY==0:
        #    print(f"Intercept: {b.detach()[0]}, Slope: {m.detach()[0]}")
        # move on to the next time step
        optimizer.step()
    print(loss.item())

print("\n DONE Training ")

## TEST ##
print("\n \n Evaluating the Multi layer model on Test Data")
modelM.eval()

for idx, batch_data in enumerate(test_dataloader, 0):
    xs, ys = [], []
    optimizer.zero_grad()
    batch_segments = []
    for segments, embedding, duration in batch_data:
        # TODO: concatenate the phone embeddings associated with all segments
        # TIP: reference your pt & vocab files
        segments_embed = []
        if len(segments) > 0:
            # concatenating all the segment embedding into a single tensor
            for segment in segments:
                segment_id = tokenize(phone_vocab, segment)
                segment_embedding = embedding_matrix[segment_id].flatten().tolist()
                segments_embed.append(segment_embedding)

        embeds = list(np.concatenate(segments_embed).flat)
        batch_segments.append(embeds)

        # Adding padding to make it of the size of largest sequence tensor
        #padding_len = int((input_dims - (embeds.shape[0])) / 2)
        #padding = torch.nn.ConstantPad1d(padding_len, 0.)
        ys.append(duration)

    padded_embeds = torch.Tensor(pad_sequences(batch_segments, value=0., padding='post', truncating='post', maxlen=input_dims, dtype='float32'))
    ys = torch.Tensor(ys).reshape(-1, 1).to(DEVICE)
    loss = criterion(modelM(padded_embeds), ys)
    if idx % EVERY == 0:
        print(f"Loss: {loss}")
print(f"Final loss on the test data for multilayer model is: {loss}")

torch.save(modelM.state_dict(), "./data/cbow_multi_model.pt")

Training The Model 
EPOCH : 0
Loss: 62.66505813598633
Loss: 1.7216973304748535
Loss: 1.0308253765106201
Loss: 1.547096848487854
Loss: 1.2050435543060303
0.48198163509368896
EPOCH : 1
Loss: 0.5772705078125
Loss: 0.5946406126022339
Loss: 0.6033220887184143
Loss: 0.5743860006332397
Loss: 0.6662980318069458
0.5000657439231873
EPOCH : 2
Loss: 0.47570523619651794
Loss: 0.49491262435913086
Loss: 0.7599642276763916
Loss: 0.6301745772361755
Loss: 0.6011781692504883
0.7191373109817505
EPOCH : 3
Loss: 0.4255598187446594
Loss: 0.643860936164856
Loss: 0.405662477016449
Loss: 0.3001071512699127
Loss: 0.43268153071403503
0.5643084049224854

 DONE Training 

 
 Evaluating the Linear model on Test Data
Loss: 0.6381330490112305
Loss: 0.5770381689071655
Final loss on the test data  for linear model is: 0.18668614327907562
EPOCH : 0
Loss: 57.52842330932617
Loss: 0.5226792097091675
Loss: 0.4405328929424286
Loss: 0.33546605706214905
Loss: 0.6175293922424316
0.6169184446334839
EPOCH : 1
Loss: 0.3310853540897

Encoder-Decoder 

In [53]:
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import pickle, time, math, random, json
#from train_test_models import BuckeyeDataset, DataLoader
import dill
import torch.nn.functional as F

SOS_TOKEN = 0
EOS_TOKEN = 1
OUTPUT_SIZE = 1  # total word duration
MAX_LENGTH = 18
HIDDEN_SIZE = 256
DEVICE = 'cpu'
MAX_NORM = 1

LEARNING_RATE = 0.001
WEIGHT_DECAY = 0
MOMENTUM = 0.0001
MAX_LOSS = 1.
N_EPOCHS = 300
BATCH_SIZE = 64
SHUFFLE = True
EMBEDDING_SIZE = 8
EVERY = 30
GLOVE_EMBED_DIM=50

EMBEDDING_FILE = './lab_2_data/word2_Vec_Lab_model.pt'
PHONE_VOCAB_FILE = './lab_2_data/word2_Vec_Lab_model.vocab'
TRAIN_PATH = './data/train.jsonl'
TEST_PATH = './data/test.jsonl'
VECS_PATH = "./data/buckeye.vecs"
SHUFFLE_DATA = True

def tokenize(vocab, list_of_segments):
    return torch.tensor(
        [vocab.vocab_to_ix[w] for w in list_of_segments if w in vocab.frequency_table] + [EOS_TOKEN], dtype=torch.long,
        device=DEVICE).view(-1, 1)


def process_segments_for_encoder(line_data):
    # Split every line into pairs and normalize
    split_str = line_data['observed_pron'].split(" ")
    duration = torch.log(torch.Tensor([sum(line_data['segment_duration_ms'])]))
    return (split_str, duration)


def tensorFromSentence(vocab, sentence):
    indices = tokenize(vocab, sentence)
    return indices


class Vocab():
    def __init__(self, segments: list):
        self._compute_frequency_table(segments)
        print(self.frequency_table)
        self._build_ix_to_vocab_dicts()

    def _compute_frequency_table(self, segments):
        self.frequency_table = Counter(segments)
        self.vocab_size = len(self.frequency_table)

    def _build_ix_to_vocab_dicts(self):
        self.ix_to_vocab = {
            i: phone for i, phone in enumerate(self.frequency_table)
            if self.frequency_table[phone] > 0
        }
        self.vocab_to_ix = {
            self.ix_to_vocab[w]: w for w in self.ix_to_vocab.keys()
        }

    def tokenize(self, list_of_segments):
        return torch.tensor(
            [self.vocab_to_ix[w] for w in list_of_segments], dtype=torch.long,
            device=DEVICE).view(-1, 1)

    def detokenize(self, tensor):
        return torch.tensor(
            [self.ix_to_vocab[ix] for ix in tensor], dtype=torch.long,
            device=DEVICE).view(-1, 1)


class Word2Vec(torch.nn.Module):
    def __init__(self, input_size: int, embedding_size: int, output_size: int = None, max_norm=None):
        super(Word2Vec, self).__init__()
        self.embedding = torch.nn.Embedding(
            input_size,
            embedding_size,
            max_norm=MAX_NORM
        )
        if output_size is None:
            self.linear = torch.nn.Linear(embedding_size, input_size)
        else:
            self.linear = torch.nn.Linear(embedding_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x


class EncoderRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderGloveRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, glove_size=None):
        super(EncoderGloveRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        if glove_size is not None:
            self.linear = torch.nn.Linear(glove_size, hidden_size)
            self.gru = torch.nn.GRU(hidden_size * 2, hidden_size)
        else:
            self.linear = torch.nn.Linear(0, hidden_size)
            self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden, glove_embedding=None):
        embedded = self.embedding(input).view(1, 1, -1)
        if glove_embedding is not None:
            linear = self.linear(glove_embedding).view(1, 1, -1)
            #print(embedded.shape, linear.shape)
            output = torch.cat((embedded, linear), axis=2)
        else:
            output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderRNN3(torch.nn.Module):
    def __init__(self, input_size, hidden_size, embeddings=None):
        super(EncoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        if embeddings is not None:
            input_size, hidden_size = embeddings.size()
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
            self.embedding.weight = torch.nn.Parameter(embeddings)
        else:
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH, glove_embedding=None):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max(input_length, max_length), encoder.hidden_size, device=DEVICE)

    loss = 0

    for ei in range(input_length):
        #encoder_output, encoder_hidden = encoder(
        #    input_tensor[ei], encoder_hidden, glove_embedding)
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_TOKEN]], device=DEVICE)

    decoder_hidden = encoder_hidden

    # use its own predictions as the next input
    for di in range(target_length):
        if type(decoder) is DecoderRNN:
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        elif type(decoder) is AttnDecoderRNN:
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
        loss += criterion(decoder_output, target_tensor[di].view(1))
        if decoder_input.item() == EOS_TOKEN:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return encoder, decoder, loss.item() / target_length


def trainIters(pairs, encoder, decoder, learning_rate=0.01):
    start = time.time()

    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = torch.nn.NLLLoss()

    for training_pair in pairs:
        #input_tensor, target_tensor, glove_embedding = training_pair
        input_tensor, target_tensor = training_pair

        #encoder, decoder, loss = train(
        #    input_tensor, target_tensor, encoder,
        #    decoder, encoder_optimizer, decoder_optimizer, criterion, glove_embedding=torch.Tensor(glove_embedding))
        encoder, decoder, loss = train(
            input_tensor, target_tensor, encoder,
            decoder, encoder_optimizer, decoder_optimizer, criterion,)


class LinearModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

def main():

    #fetching the data
    training_data = BuckeyeDataset(TRAIN_PATH, VECS_PATH)
    test_data = BuckeyeDataset(TEST_PATH, VECS_PATH)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE_DATA, collate_fn=lambda x: x
    )
    # loading the Vocab class
    phone_vocab = dill.load(open(PHONE_VOCAB_FILE, 'rb'))

    # loading learned phone embeddings from CBOW
    phone_embeds = Word2Vec(
        input_size=phone_vocab.vocab_size,
        embedding_size=EMBEDDING_SIZE,
        max_norm=MAX_NORM
    )
    phone_embeds.load_state_dict(torch.load(EMBEDDING_FILE))
    embedding_dim = phone_embeds.embedding.weight.size()[-1]


    #phone_embeds = torch.load(EMBEDDING_FILE)


    #phone_vocab = pickle.load(open(PHONE_VOCAB_FILE, 'rb'))
    #embedding_dim = phone_embeds.embedding.weight.size()[-1]

    #evaluating the max length
    MAX_LENGTH = max([len(s[0]) for s in training_data])


    # Encoder-decoder hidden states
    # Encoder-decoder hidden states + learned CBOW embeddings
    # Encoder-decoder hidden states + learned phone embeddings + GloVe embeddings
    # Encoder-decoder with attention hidden states
    # Encoder-decoder with attention hidden states + GloVe embeddings
    # Encoder-decoder with attention hidden states + learned CBOW embeddings + GloVe embeddings

    n_words_for_encoders = phone_vocab.vocab_size + len([SOS_TOKEN, EOS_TOKEN])
    encoder1 = EncoderRNN(n_words_for_encoders, HIDDEN_SIZE).to(DEVICE)
    encoder2 = EncoderGloveRNN(n_words_for_encoders, HIDDEN_SIZE, GLOVE_EMBED_DIM).to(DEVICE)
    encoder3 = EncoderRNN3(n_words_for_encoders, HIDDEN_SIZE,
                           embeddings=torch.nn.Parameter(phone_embeds.embedding.weight)).to(DEVICE)

    decoder1 = DecoderRNN(HIDDEN_SIZE, n_words_for_encoders)
    attn_decoder1 = AttnDecoderRNN(HIDDEN_SIZE, n_words_for_encoders, dropout_p=0.1).to(DEVICE)
    print("Training Encoder Decoder")
    for encoder in [encoder1]:#  , encoder3]: # , encoder2]:
        for decoder in [decoder1]: #, attn_decoder1]:
            for i in range(N_EPOCHS):
                batch_data = next(iter(train_dataloader))
                xs, ys, glove_embeddings = [], [], []
                for segments, glove_embedding, duration in batch_data:
                    if len(segments) > 0:
                        xs.append(tensorFromSentence(phone_vocab, segments))
                        ys.append(tensorFromSentence(phone_vocab, segments))
                        glove_embeddings.append(glove_embedding)
                #print(glove_embedding, embedding_dim)
                #pairs = list(zip(xs, ys, glove_embeddings))
                pairs = list(zip(xs, ys))
                trainIters(pairs, encoder, decoder)
                # evaluateRandomly(encoder, decoder, pairs, phone_vocab, n=10)
                encoder_type = str(type(encoder)).split(".")[-1][:-2]
                decoder_type = str(type(decoder)).split(".")[-1][:-2]
                #torch.save(encoder, f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")
                torch.save(encoder, f"./2022-05-11_{encoder_type}_{decoder_type}.pt")
                #torch.save(encoder.state_dict(), f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")

    # train duration prediction model
    model = LinearModel(HIDDEN_SIZE, OUTPUT_SIZE)
    embedding_matrix = phone_embeds.embedding.weight.detach()
    print("Training model")
    for ix, encoder in enumerate([encoder1]):# , encoder3]): #, encoder3]):
        encoder.eval()
        encoder_hidden = encoder.initHidden()
        for i in range(N_EPOCHS):
            loss = 0
            criterion = torch.nn.MSELoss()
            #optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
            # make sure we "zero out" the loss at each time step
            optimizer.zero_grad()
            batch_data = next(iter(train_dataloader))
            xs, ys = [], []
            batch_segments = []
            #for idx, batch_data in enumerate(train_dataloader, 0):
            for segments, embedding, duration in batch_data:
              input_length = len(segments)
              encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                              device=DEVICE, requires_grad=False)
              if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                  #encoder_output, encoder_hidden = encoder(
                  #              input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                  encoder_output, encoder_hidden = encoder(
                                input_tensor[ei], encoder_hidden)
                  encoder_outputs[ei] += encoder_output[0, 0]
                  last_hidden_state = encoder_outputs[-1].detach()
                  xs.append(last_hidden_state.flatten())
                  ys.append(duration)
            xs = torch.stack(xs)
            ys = torch.Tensor(ys).reshape(-1, 1)
            loss = criterion(model(xs), ys)

            if i % EVERY == 0:
                print(f"Loss: {loss}")

            loss.backward()
            b, m = model.parameters()
            optimizer.step()


        print(f"Final loss on the training data for encoder {ix} is: {loss}")  # tensor(0.478)
        ## TEST ##
        test_criterion = torch.nn.MSELoss()
        test_xs, test_ys = [], []
        for segments, embedding, duration in batch_data:
            input_length = len(segments)
            encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                          device=DEVICE, requires_grad=False)
            if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                    #encoder_output, encoder_hidden = encoder(
                    #    input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                    encoder_output, encoder_hidden = encoder(
                        input_tensor[ei], encoder_hidden)
                    encoder_outputs[ei] += encoder_output[0, 0]
                last_hidden_state = encoder_outputs[-1].detach()
            test_xs.append(last_hidden_state)
            test_ys.append(duration)
        test_xs = torch.stack(test_xs)
        test_ys = torch.Tensor(test_ys).reshape(-1, 1)
        test_loss = test_criterion(model(test_xs), test_ys).detach()
        
        print(f"Final loss on the test data for encoder {ix} is: {test_loss}")  # tensor(0.7235)

In [54]:
main()

Training Encoder Decoder
Training model
Loss: 78.26979064941406
Loss: 0.5052607655525208
Loss: 0.4616890847682953
Loss: 0.5968324542045593
Loss: 0.4828501343727112
Loss: 0.5633633732795715
Loss: 0.5704646706581116
Loss: 0.35789021849632263
Loss: 0.2999202013015747
Loss: 0.504475474357605
Final loss on the training data for encoder 0 is: 0.5097392797470093
Final loss on the test data for encoder 0 is: 0.6795893907546997


Encoder-Decoder with CBOW Embeddings

In [55]:
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import pickle, time, math, random, json
#from train_test_models import BuckeyeDataset, DataLoader
import dill
import torch.nn.functional as F

SOS_TOKEN = 0
EOS_TOKEN = 1
OUTPUT_SIZE = 1  # total word duration
MAX_LENGTH = 18
HIDDEN_SIZE = 8
DEVICE = 'cpu'
MAX_NORM = 1

LEARNING_RATE = 0.01
WEIGHT_DECAY = 0
MOMENTUM = 0.0001
MAX_LOSS = 1.
N_EPOCHS = 300
BATCH_SIZE = 64
SHUFFLE = True
EMBEDDING_SIZE = 8
EVERY = 30
GLOVE_EMBED_DIM=50

EMBEDDING_FILE = './lab_2_data/word2_Vec_Lab_model.pt'
PHONE_VOCAB_FILE = './lab_2_data/word2_Vec_Lab_model.vocab'
TRAIN_PATH = './data/train.jsonl'
TEST_PATH = './data/test.jsonl'
VECS_PATH = "./data/buckeye.vecs"
SHUFFLE_DATA = True

def tokenize(vocab, list_of_segments):
    return torch.tensor(
        [vocab.vocab_to_ix[w] for w in list_of_segments if w in vocab.frequency_table] + [EOS_TOKEN], dtype=torch.long,
        device=DEVICE).view(-1, 1)


def process_segments_for_encoder(line_data):
    # Split every line into pairs and normalize
    split_str = line_data['observed_pron'].split(" ")
    duration = torch.log(torch.Tensor([sum(line_data['segment_duration_ms'])]))
    return (split_str, duration)


def tensorFromSentence(vocab, sentence):
    indices = tokenize(vocab, sentence)
    return indices


class Vocab():
    def __init__(self, segments: list):
        self._compute_frequency_table(segments)
        print(self.frequency_table)
        self._build_ix_to_vocab_dicts()

    def _compute_frequency_table(self, segments):
        self.frequency_table = Counter(segments)
        self.vocab_size = len(self.frequency_table)

    def _build_ix_to_vocab_dicts(self):
        self.ix_to_vocab = {
            i: phone for i, phone in enumerate(self.frequency_table)
            if self.frequency_table[phone] > 0
        }
        self.vocab_to_ix = {
            self.ix_to_vocab[w]: w for w in self.ix_to_vocab.keys()
        }

    def tokenize(self, list_of_segments):
        return torch.tensor(
            [self.vocab_to_ix[w] for w in list_of_segments], dtype=torch.long,
            device=DEVICE).view(-1, 1)

    def detokenize(self, tensor):
        return torch.tensor(
            [self.ix_to_vocab[ix] for ix in tensor], dtype=torch.long,
            device=DEVICE).view(-1, 1)


class Word2Vec(torch.nn.Module):
    def __init__(self, input_size: int, embedding_size: int, output_size: int = None, max_norm=None):
        super(Word2Vec, self).__init__()
        self.embedding = torch.nn.Embedding(
            input_size,
            embedding_size,
            max_norm=MAX_NORM
        )
        if output_size is None:
            self.linear = torch.nn.Linear(embedding_size, input_size)
        else:
            self.linear = torch.nn.Linear(embedding_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x


class EncoderRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderGloveRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, glove_size=None):
        super(EncoderGloveRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        if glove_size is not None:
            self.linear = torch.nn.Linear(glove_size, hidden_size)
            self.gru = torch.nn.GRU(hidden_size * 2, hidden_size)
        else:
            self.linear = torch.nn.Linear(0, hidden_size)
            self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden, glove_embedding=None):
        embedded = self.embedding(input).view(1, 1, -1)
        if glove_embedding is not None:
            linear = self.linear(glove_embedding).view(1, 1, -1)
            #print(embedded.shape, linear.shape)
            output = torch.cat((embedded, linear), axis=2)
        else:
            output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderRNN3(torch.nn.Module):
    def __init__(self, input_size, hidden_size, embeddings=None):
        super(EncoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        if embeddings is not None:
            input_size, hidden_size = embeddings.size()
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
            self.embedding.weight = torch.nn.Parameter(embeddings)
        else:
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH, glove_embedding=None):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max(input_length, max_length), encoder.hidden_size, device=DEVICE)

    loss = 0

    for ei in range(input_length):
        #encoder_output, encoder_hidden = encoder(
        #    input_tensor[ei], encoder_hidden, glove_embedding)
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_TOKEN]], device=DEVICE)

    decoder_hidden = encoder_hidden

    # use its own predictions as the next input
    for di in range(target_length):
        if type(decoder) is DecoderRNN:
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        elif type(decoder) is AttnDecoderRNN:
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
        loss += criterion(decoder_output, target_tensor[di].view(1))
        if decoder_input.item() == EOS_TOKEN:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return encoder, decoder, loss.item() / target_length


def trainIters(pairs, encoder, decoder, learning_rate=0.01):
    start = time.time()

    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = torch.nn.NLLLoss()

    for training_pair in pairs:
        #input_tensor, target_tensor, glove_embedding = training_pair
        input_tensor, target_tensor = training_pair

        #encoder, decoder, loss = train(
        #    input_tensor, target_tensor, encoder,
        #    decoder, encoder_optimizer, decoder_optimizer, criterion, glove_embedding=torch.Tensor(glove_embedding))
        encoder, decoder, loss = train(
            input_tensor, target_tensor, encoder,
            decoder, encoder_optimizer, decoder_optimizer, criterion,)


class LinearModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

def main():

    #fetching the data
    training_data = BuckeyeDataset(TRAIN_PATH, VECS_PATH)
    test_data = BuckeyeDataset(TEST_PATH, VECS_PATH)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE_DATA, collate_fn=lambda x: x
    )
    # loading the Vocab class
    phone_vocab = dill.load(open(PHONE_VOCAB_FILE, 'rb'))

    # loading learned phone embeddings from CBOW
    phone_embeds = Word2Vec(
        input_size=phone_vocab.vocab_size,
        embedding_size=EMBEDDING_SIZE,
        max_norm=MAX_NORM
    )
    phone_embeds.load_state_dict(torch.load(EMBEDDING_FILE))
    embedding_dim = phone_embeds.embedding.weight.size()[-1]


    #phone_embeds = torch.load(EMBEDDING_FILE)


    #phone_vocab = pickle.load(open(PHONE_VOCAB_FILE, 'rb'))
    #embedding_dim = phone_embeds.embedding.weight.size()[-1]

    #evaluating the max length
    MAX_LENGTH = max([len(s[0]) for s in training_data])


    # Encoder-decoder hidden states
    # Encoder-decoder hidden states + learned CBOW embeddings
    # Encoder-decoder hidden states + learned phone embeddings + GloVe embeddings
    # Encoder-decoder with attention hidden states
    # Encoder-decoder with attention hidden states + GloVe embeddings
    # Encoder-decoder with attention hidden states + learned CBOW embeddings + GloVe embeddings

    n_words_for_encoders = phone_vocab.vocab_size + len([SOS_TOKEN, EOS_TOKEN])
    encoder1 = EncoderRNN(n_words_for_encoders, HIDDEN_SIZE).to(DEVICE)
    encoder2 = EncoderGloveRNN(n_words_for_encoders, HIDDEN_SIZE, GLOVE_EMBED_DIM).to(DEVICE)
    encoder3 = EncoderRNN3(n_words_for_encoders, HIDDEN_SIZE,
                           embeddings=torch.nn.Parameter(phone_embeds.embedding.weight)).to(DEVICE)

    decoder1 = DecoderRNN(HIDDEN_SIZE, n_words_for_encoders)
    attn_decoder1 = AttnDecoderRNN(HIDDEN_SIZE, n_words_for_encoders, dropout_p=0.1).to(DEVICE)
    print("Training Encoder Decoder")
    for encoder in [encoder1, encoder3]:#  , encoder3]: # , encoder2]:
        for decoder in [decoder1]: #, attn_decoder1]:
            for i in range(N_EPOCHS):
                batch_data = next(iter(train_dataloader))
                xs, ys, glove_embeddings = [], [], []
                for segments, glove_embedding, duration in batch_data:
                    if len(segments) > 0:
                        xs.append(tensorFromSentence(phone_vocab, segments))
                        ys.append(tensorFromSentence(phone_vocab, segments))
                        glove_embeddings.append(glove_embedding)
                #print(glove_embedding, embedding_dim)
                #pairs = list(zip(xs, ys, glove_embeddings))
                pairs = list(zip(xs, ys))
                trainIters(pairs, encoder, decoder)
                # evaluateRandomly(encoder, decoder, pairs, phone_vocab, n=10)
                encoder_type = str(type(encoder)).split(".")[-1][:-2]
                decoder_type = str(type(decoder)).split(".")[-1][:-2]
                #torch.save(encoder, f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")
                torch.save(encoder, f"./2022-05-11_{encoder_type}_{decoder_type}.pt")
                #torch.save(encoder.state_dict(), f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")

    # train duration prediction model
    model = LinearModel(HIDDEN_SIZE, OUTPUT_SIZE)
    embedding_matrix = phone_embeds.embedding.weight.detach()
    print("Training model")
    for ix, encoder in enumerate([encoder1, encoder3]):# , encoder3]): #, encoder3]):
        encoder.eval()
        encoder_hidden = encoder.initHidden()
        for i in range(N_EPOCHS):
            loss = 0
            criterion = torch.nn.MSELoss()
            #optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
            # make sure we "zero out" the loss at each time step
            optimizer.zero_grad()
            batch_data = next(iter(train_dataloader))
            xs, ys = [], []
            batch_segments = []
            #for idx, batch_data in enumerate(train_dataloader, 0):
            for segments, embedding, duration in batch_data:
              input_length = len(segments)
              encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                              device=DEVICE, requires_grad=False)
              if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                  #encoder_output, encoder_hidden = encoder(
                  #              input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                  encoder_output, encoder_hidden = encoder(
                                input_tensor[ei], encoder_hidden)
                  encoder_outputs[ei] += encoder_output[0, 0]
                  last_hidden_state = encoder_outputs[-1].detach()
                  xs.append(last_hidden_state.flatten())
                  ys.append(duration)
            xs = torch.stack(xs)
            ys = torch.Tensor(ys).reshape(-1, 1)
            loss = criterion(model(xs), ys)

            if i % EVERY == 0:
                print(f"Loss: {loss}")

            loss.backward()
            b, m = model.parameters()
            optimizer.step()


        print(f"Final loss on the training data for encoder {ix} is: {loss}")  # tensor(0.478)
        ## TEST ##
        test_criterion = torch.nn.MSELoss()
        test_xs, test_ys = [], []
        for segments, embedding, duration in batch_data:
            input_length = len(segments)
            encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                          device=DEVICE, requires_grad=False)
            if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                    #encoder_output, encoder_hidden = encoder(
                    #    input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                    encoder_output, encoder_hidden = encoder(
                        input_tensor[ei], encoder_hidden)
                    encoder_outputs[ei] += encoder_output[0, 0]
                last_hidden_state = encoder_outputs[-1].detach()
            test_xs.append(last_hidden_state)
            test_ys.append(duration)
        test_xs = torch.stack(test_xs)
        test_ys = torch.Tensor(test_ys).reshape(-1, 1)
        test_loss = test_criterion(model(test_xs), test_ys).detach()
        
        print(f"Final loss on the test data for encoder {ix} is: {test_loss}")  # tensor(0.7235)

In [56]:
main()

Training Encoder Decoder
Training model
Loss: 63.31273651123047
Loss: 2.3395936489105225
Loss: 0.48316314816474915
Loss: 0.4286659359931946
Loss: 1.3541005849838257
Loss: 0.8907515406608582
Loss: 0.9916452765464783
Loss: 0.8088127374649048
Loss: 0.8704742193222046
Loss: 0.7753809690475464
Final loss on the training data for encoder 0 is: 0.5860453248023987
Final loss on the test data for encoder 0 is: 0.8913733959197998
Loss: 1.1487457752227783
Loss: 1.1872951984405518
Loss: 0.627408504486084
Loss: 1.9141994714736938
Loss: 0.5556747913360596
Loss: 0.4828616976737976
Loss: 0.5209386348724365
Loss: 0.5176318287849426
Loss: 0.6656030416488647
Loss: 0.5227574110031128
Final loss on the training data for encoder 1 is: 1.285611867904663
Final loss on the test data for encoder 1 is: 1.4258601665496826


Encoder-Decoder with CBOW and GLOVE EMbeddings

In [57]:
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import pickle, time, math, random, json
from train_test_models import BuckeyeDataset, DataLoader
import dill
import torch.nn.functional as F

SOS_TOKEN = 0
EOS_TOKEN = 1
OUTPUT_SIZE = 1  # total word duration
MAX_LENGTH = 18
HIDDEN_SIZE = 8
DEVICE = 'cpu'
MAX_NORM = 1

LEARNING_RATE = 0.01
WEIGHT_DECAY = 0
MOMENTUM = 0.0001
MAX_LOSS = 1.
N_EPOCHS = 500
BATCH_SIZE = 64
SHUFFLE = True
EMBEDDING_SIZE = 8
EVERY = 50
GLOVE_EMBED_DIM=50

EMBEDDING_FILE = './lab_2_data/word2_Vec_Lab_model.pt'
PHONE_VOCAB_FILE = './lab_2_data/word2_Vec_Lab_model.vocab'
TRAIN_PATH = './data/train.jsonl'
TEST_PATH = './data/test.jsonl'
VECS_PATH = "./data/buckeye.vecs"
SHUFFLE_DATA = True

def tokenize(vocab, list_of_segments):
    return torch.tensor(
        [vocab.vocab_to_ix[w] for w in list_of_segments if w in vocab.frequency_table] + [EOS_TOKEN], dtype=torch.long,
        device=DEVICE).view(-1, 1)


def process_segments_for_encoder(line_data):
    # Split every line into pairs and normalize
    split_str = line_data['observed_pron'].split(" ")
    duration = torch.log(torch.Tensor([sum(line_data['segment_duration_ms'])]))
    return (split_str, duration)


def tensorFromSentence(vocab, sentence):
    indices = tokenize(vocab, sentence)
    return indices


class Vocab():
    def __init__(self, segments: list):
        self._compute_frequency_table(segments)
        print(self.frequency_table)
        self._build_ix_to_vocab_dicts()

    def _compute_frequency_table(self, segments):
        self.frequency_table = Counter(segments)
        self.vocab_size = len(self.frequency_table)

    def _build_ix_to_vocab_dicts(self):
        self.ix_to_vocab = {
            i: phone for i, phone in enumerate(self.frequency_table)
            if self.frequency_table[phone] > 0
        }
        self.vocab_to_ix = {
            self.ix_to_vocab[w]: w for w in self.ix_to_vocab.keys()
        }

    def tokenize(self, list_of_segments):
        return torch.tensor(
            [self.vocab_to_ix[w] for w in list_of_segments], dtype=torch.long,
            device=DEVICE).view(-1, 1)

    def detokenize(self, tensor):
        return torch.tensor(
            [self.ix_to_vocab[ix] for ix in tensor], dtype=torch.long,
            device=DEVICE).view(-1, 1)


class Word2Vec(torch.nn.Module):
    def __init__(self, input_size: int, embedding_size: int, output_size: int = None, max_norm=None):
        super(Word2Vec, self).__init__()
        self.embedding = torch.nn.Embedding(
            input_size,
            embedding_size,
            max_norm=MAX_NORM
        )
        if output_size is None:
            self.linear = torch.nn.Linear(embedding_size, input_size)
        else:
            self.linear = torch.nn.Linear(embedding_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x


class EncoderRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderGloveRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, glove_size=None):
        super(EncoderGloveRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        if glove_size is not None:
            self.linear = torch.nn.Linear(glove_size, hidden_size)
            self.gru = torch.nn.GRU(hidden_size * 2, hidden_size)
        else:
            self.linear = torch.nn.Linear(0, hidden_size)
            self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden, glove_embedding=None):
        embedded = self.embedding(input).view(1, 1, -1)
        if glove_embedding is not None:
            linear = self.linear(glove_embedding).view(1, 1, -1)
            #print(embedded.shape, linear.shape)
            output = torch.cat((embedded, linear), axis=2)
        else:
            output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderRNN3(torch.nn.Module):
    def __init__(self, input_size, hidden_size, embeddings=None):
        super(EncoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        if embeddings is not None:
            input_size, hidden_size = embeddings.size()
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
            self.embedding.weight = torch.nn.Parameter(embeddings)
        else:
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH, glove_embedding=None):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max(input_length, max_length), encoder.hidden_size, device=DEVICE)

    loss = 0

    for ei in range(input_length):
      if(type(encoder) is EncoderGloveRNN):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden, glove_embedding)
      else:
         encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)       
      encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_TOKEN]], device=DEVICE)

    decoder_hidden = encoder_hidden

    # use its own predictions as the next input
    for di in range(target_length):
        if type(decoder) is DecoderRNN:
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        elif type(decoder) is AttnDecoderRNN:
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
        loss += criterion(decoder_output, target_tensor[di].view(1))
        if decoder_input.item() == EOS_TOKEN:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return encoder, decoder, loss.item() / target_length


def trainIters(pairs, encoder, decoder, learning_rate=0.01):
    start = time.time()

    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = torch.nn.NLLLoss()

    for training_pair in pairs:
        input_tensor, target_tensor, glove_embedding = training_pair

        encoder, decoder, loss = train(
            input_tensor, target_tensor, encoder,
            decoder, encoder_optimizer, decoder_optimizer, criterion, glove_embedding=torch.Tensor(glove_embedding))

class LinearModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

def main():

    #fetching the data
    training_data = BuckeyeDataset(TRAIN_PATH, VECS_PATH)
    test_data = BuckeyeDataset(TEST_PATH, VECS_PATH)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE_DATA, collate_fn=lambda x: x
    )
    # loading the Vocab class
    phone_vocab = dill.load(open(PHONE_VOCAB_FILE, 'rb'))

    # loading learned phone embeddings from CBOW
    phone_embeds = Word2Vec(
        input_size=phone_vocab.vocab_size,
        embedding_size=EMBEDDING_SIZE,
        max_norm=MAX_NORM
    )
    phone_embeds.load_state_dict(torch.load(EMBEDDING_FILE))
    embedding_dim = phone_embeds.embedding.weight.size()[-1]


    #phone_embeds = torch.load(EMBEDDING_FILE)


    #phone_vocab = pickle.load(open(PHONE_VOCAB_FILE, 'rb'))
    #embedding_dim = phone_embeds.embedding.weight.size()[-1]

    #evaluating the max length
    MAX_LENGTH = max([len(s[0]) for s in training_data])


    # Encoder-decoder hidden states
    # Encoder-decoder hidden states + learned CBOW embeddings
    # Encoder-decoder hidden states + learned phone embeddings + GloVe embeddings
    # Encoder-decoder with attention hidden states
    # Encoder-decoder with attention hidden states + GloVe embeddings
    # Encoder-decoder with attention hidden states + learned CBOW embeddings + GloVe embeddings

    n_words_for_encoders = phone_vocab.vocab_size + len([SOS_TOKEN, EOS_TOKEN])
    encoder1 = EncoderRNN(n_words_for_encoders, HIDDEN_SIZE).to(DEVICE)
    encoder2 = EncoderGloveRNN(n_words_for_encoders, HIDDEN_SIZE, GLOVE_EMBED_DIM).to(DEVICE)
    encoder3 = EncoderRNN3(n_words_for_encoders, HIDDEN_SIZE,
                           embeddings=torch.nn.Parameter(phone_embeds.embedding.weight)).to(DEVICE)

    decoder1 = DecoderRNN(HIDDEN_SIZE, n_words_for_encoders)
    attn_decoder1 = AttnDecoderRNN(HIDDEN_SIZE, n_words_for_encoders, dropout_p=0.1).to(DEVICE)
    print("Training Encoder Decoder")
    for encoder in [encoder1, encoder2, encoder3]:#  , encoder3]: # , encoder2]:
        for decoder in [decoder1]: #, attn_decoder1]:
            for i in range(N_EPOCHS):
                batch_data = next(iter(train_dataloader))
                xs, ys, glove_embeddings = [], [], []
                for segments, glove_embedding, duration in batch_data:
                    if len(segments) > 0:
                        xs.append(tensorFromSentence(phone_vocab, segments))
                        ys.append(tensorFromSentence(phone_vocab, segments))
                        glove_embeddings.append(glove_embedding)
                #print(glove_embedding, embedding_dim)
                pairs = list(zip(xs, ys, glove_embeddings))
                trainIters(pairs, encoder, decoder)
                # evaluateRandomly(encoder, decoder, pairs, phone_vocab, n=10)
                encoder_type = str(type(encoder)).split(".")[-1][:-2]
                decoder_type = str(type(decoder)).split(".")[-1][:-2]
                #torch.save(encoder, f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")
                torch.save(encoder, f"./2022-05-11_{encoder_type}_{decoder_type}.pt")
                #torch.save(encoder.state_dict(), f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")

    # train duration prediction model
    model = LinearModel(HIDDEN_SIZE, OUTPUT_SIZE)
    embedding_matrix = phone_embeds.embedding.weight.detach()
    print("Training model")
    for ix, encoder in enumerate([encoder1, encoder2, encoder3]):# , encoder3]): #, encoder3]):
        encoder.eval()
        encoder_hidden = encoder.initHidden()
        for i in range(N_EPOCHS):
            loss = 0
            criterion = torch.nn.MSELoss()
            #optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
            # make sure we "zero out" the loss at each time step
            optimizer.zero_grad()
            batch_data = next(iter(train_dataloader))
            xs, ys = [], []
            batch_segments = []
            #for idx, batch_data in enumerate(train_dataloader, 0):
            for segments, embedding, duration in batch_data:
              input_length = len(segments)
              encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                              device=DEVICE, requires_grad=False)
              if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                  if(type(encoder) is EncoderGloveRNN):
                    encoder_output, encoder_hidden = encoder(
                      input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                  else:
                    encoder_output, encoder_hidden = encoder(
                      input_tensor[ei], encoder_hidden)
                  encoder_outputs[ei] += encoder_output[0, 0]
                  last_hidden_state = encoder_outputs[-1].detach()
                  xs.append(last_hidden_state.flatten())
                  ys.append(duration)
            xs = torch.stack(xs)
            ys = torch.Tensor(ys).reshape(-1, 1)
            loss = criterion(model(xs), ys)

            if i % EVERY == 0:
                print(f"Loss: {loss}")

            loss.backward()
            b, m = model.parameters()
            optimizer.step()


        print(f"Final loss on the training data for encoder {ix} is: {loss}")  # tensor(0.478)
        ## TEST ##
        test_criterion = torch.nn.MSELoss()
        test_xs, test_ys = [], []
        for segments, embedding, duration in batch_data:
            input_length = len(segments)
            encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                          device=DEVICE, requires_grad=False)
            if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                  if(type(encoder) is EncoderGloveRNN):
                    encoder_output, encoder_hidden = encoder(
                      input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                  else:
                    encoder_output, encoder_hidden = encoder(
                      input_tensor[ei], encoder_hidden)
                    
                    encoder_outputs[ei] += encoder_output[0, 0]
                last_hidden_state = encoder_outputs[-1].detach()
            test_xs.append(last_hidden_state)
            test_ys.append(duration)
        test_xs = torch.stack(test_xs)
        test_ys = torch.Tensor(test_ys).reshape(-1, 1)
        test_loss = test_criterion(model(test_xs), test_ys).detach()
        
        print(f"Final loss on the test data for encoder {ix} is: {test_loss}")  # tensor(0.7235)

In [58]:
main()

Training Encoder Decoder
Training model
Loss: 65.6651840209961
Loss: 0.804783821105957
Loss: 0.6187025308609009
Loss: 0.6793946623802185
Loss: 0.5158143639564514
Loss: 0.7093705534934998
Loss: 0.9525966048240662
Loss: 0.49109992384910583
Loss: 0.9505195021629333
Loss: 0.5223023295402527
Final loss on the training data for encoder 0 is: 0.7255538702011108
Final loss on the test data for encoder 0 is: 1.3859784603118896
Loss: 2.1690266132354736
Loss: 0.3879263401031494
Loss: 0.6513210535049438
Loss: 0.7673561573028564
Loss: 0.6840199828147888
Loss: 0.501665472984314
Loss: 0.4364526867866516
Loss: 0.5918902158737183
Loss: 0.4731883406639099
Loss: 0.5687894225120544
Final loss on the training data for encoder 1 is: 0.5722402930259705
Final loss on the test data for encoder 1 is: 40.107059478759766
Loss: 7.4033708572387695
Loss: 0.4525009095668793
Loss: 0.4716859459877014
Loss: 0.5113304853439331
Loss: 0.6955394744873047
Loss: 0.37185657024383545
Loss: 0.6209486126899719
Loss: 0.47453957796

Encoder-Decoder With Attention

In [59]:
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import pickle, time, math, random, json
#from train_test_models import BuckeyeDataset, DataLoader
import dill
import torch.nn.functional as F

SOS_TOKEN = 0
EOS_TOKEN = 1
OUTPUT_SIZE = 1  # total word duration
MAX_LENGTH = 18
HIDDEN_SIZE = 256
DEVICE = 'cpu'
MAX_NORM = 1

LEARNING_RATE = 0.001
WEIGHT_DECAY = 0
MOMENTUM = 0.0001
MAX_LOSS = 1.
N_EPOCHS = 300
BATCH_SIZE = 64
SHUFFLE = True
EMBEDDING_SIZE = 8
EVERY = 30
GLOVE_EMBED_DIM=50

EMBEDDING_FILE = './lab_2_data/word2_Vec_Lab_model.pt'
PHONE_VOCAB_FILE = './lab_2_data/word2_Vec_Lab_model.vocab'
TRAIN_PATH = './data/train.jsonl'
TEST_PATH = './data/test.jsonl'
VECS_PATH = "./data/buckeye.vecs"
SHUFFLE_DATA = True

def tokenize(vocab, list_of_segments):
    return torch.tensor(
        [vocab.vocab_to_ix[w] for w in list_of_segments if w in vocab.frequency_table] + [EOS_TOKEN], dtype=torch.long,
        device=DEVICE).view(-1, 1)


def process_segments_for_encoder(line_data):
    # Split every line into pairs and normalize
    split_str = line_data['observed_pron'].split(" ")
    duration = torch.log(torch.Tensor([sum(line_data['segment_duration_ms'])]))
    return (split_str, duration)


def tensorFromSentence(vocab, sentence):
    indices = tokenize(vocab, sentence)
    return indices


class Vocab():
    def __init__(self, segments: list):
        self._compute_frequency_table(segments)
        print(self.frequency_table)
        self._build_ix_to_vocab_dicts()

    def _compute_frequency_table(self, segments):
        self.frequency_table = Counter(segments)
        self.vocab_size = len(self.frequency_table)

    def _build_ix_to_vocab_dicts(self):
        self.ix_to_vocab = {
            i: phone for i, phone in enumerate(self.frequency_table)
            if self.frequency_table[phone] > 0
        }
        self.vocab_to_ix = {
            self.ix_to_vocab[w]: w for w in self.ix_to_vocab.keys()
        }

    def tokenize(self, list_of_segments):
        return torch.tensor(
            [self.vocab_to_ix[w] for w in list_of_segments], dtype=torch.long,
            device=DEVICE).view(-1, 1)

    def detokenize(self, tensor):
        return torch.tensor(
            [self.ix_to_vocab[ix] for ix in tensor], dtype=torch.long,
            device=DEVICE).view(-1, 1)


class Word2Vec(torch.nn.Module):
    def __init__(self, input_size: int, embedding_size: int, output_size: int = None, max_norm=None):
        super(Word2Vec, self).__init__()
        self.embedding = torch.nn.Embedding(
            input_size,
            embedding_size,
            max_norm=MAX_NORM
        )
        if output_size is None:
            self.linear = torch.nn.Linear(embedding_size, input_size)
        else:
            self.linear = torch.nn.Linear(embedding_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x


class EncoderRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderGloveRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, glove_size=None):
        super(EncoderGloveRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        if glove_size is not None:
            self.linear = torch.nn.Linear(glove_size, hidden_size)
            self.gru = torch.nn.GRU(hidden_size * 2, hidden_size)
        else:
            self.linear = torch.nn.Linear(0, hidden_size)
            self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden, glove_embedding=None):
        embedded = self.embedding(input).view(1, 1, -1)
        if glove_embedding is not None:
            linear = self.linear(glove_embedding).view(1, 1, -1)
            #print(embedded.shape, linear.shape)
            output = torch.cat((embedded, linear), axis=2)
        else:
            output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderRNN3(torch.nn.Module):
    def __init__(self, input_size, hidden_size, embeddings=None):
        super(EncoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        if embeddings is not None:
            input_size, hidden_size = embeddings.size()
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
            self.embedding.weight = torch.nn.Parameter(embeddings)
        else:
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH, glove_embedding=None):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max(input_length, max_length), encoder.hidden_size, device=DEVICE)

    loss = 0

    for ei in range(input_length):
        #encoder_output, encoder_hidden = encoder(
        #    input_tensor[ei], encoder_hidden, glove_embedding)
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_TOKEN]], device=DEVICE)

    decoder_hidden = encoder_hidden

    # use its own predictions as the next input
    for di in range(target_length):
        if type(decoder) is DecoderRNN:
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        elif type(decoder) is AttnDecoderRNN:
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
        loss += criterion(decoder_output, target_tensor[di].view(1))
        if decoder_input.item() == EOS_TOKEN:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return encoder, decoder, loss.item() / target_length


def trainIters(pairs, encoder, decoder, learning_rate=0.01):
    start = time.time()

    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = torch.nn.NLLLoss()

    for training_pair in pairs:
        #input_tensor, target_tensor, glove_embedding = training_pair
        input_tensor, target_tensor = training_pair

        #encoder, decoder, loss = train(
        #    input_tensor, target_tensor, encoder,
        #    decoder, encoder_optimizer, decoder_optimizer, criterion, glove_embedding=torch.Tensor(glove_embedding))
        encoder, decoder, loss = train(
            input_tensor, target_tensor, encoder,
            decoder, encoder_optimizer, decoder_optimizer, criterion,)


class LinearModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

def main():

    #fetching the data
    training_data = BuckeyeDataset(TRAIN_PATH, VECS_PATH)
    test_data = BuckeyeDataset(TEST_PATH, VECS_PATH)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE_DATA, collate_fn=lambda x: x
    )
    # loading the Vocab class
    phone_vocab = dill.load(open(PHONE_VOCAB_FILE, 'rb'))

    # loading learned phone embeddings from CBOW
    phone_embeds = Word2Vec(
        input_size=phone_vocab.vocab_size,
        embedding_size=EMBEDDING_SIZE,
        max_norm=MAX_NORM
    )
    phone_embeds.load_state_dict(torch.load(EMBEDDING_FILE))
    embedding_dim = phone_embeds.embedding.weight.size()[-1]


    #phone_embeds = torch.load(EMBEDDING_FILE)


    #phone_vocab = pickle.load(open(PHONE_VOCAB_FILE, 'rb'))
    #embedding_dim = phone_embeds.embedding.weight.size()[-1]

    #evaluating the max length
    MAX_LENGTH = max([len(s[0]) for s in training_data])


    # Encoder-decoder hidden states
    # Encoder-decoder hidden states + learned CBOW embeddings
    # Encoder-decoder hidden states + learned phone embeddings + GloVe embeddings
    # Encoder-decoder with attention hidden states
    # Encoder-decoder with attention hidden states + GloVe embeddings
    # Encoder-decoder with attention hidden states + learned CBOW embeddings + GloVe embeddings

    n_words_for_encoders = phone_vocab.vocab_size + len([SOS_TOKEN, EOS_TOKEN])
    encoder1 = EncoderRNN(n_words_for_encoders, HIDDEN_SIZE).to(DEVICE)
    encoder2 = EncoderGloveRNN(n_words_for_encoders, HIDDEN_SIZE, GLOVE_EMBED_DIM).to(DEVICE)
    encoder3 = EncoderRNN3(n_words_for_encoders, HIDDEN_SIZE,
                           embeddings=torch.nn.Parameter(phone_embeds.embedding.weight)).to(DEVICE)

    decoder1 = DecoderRNN(HIDDEN_SIZE, n_words_for_encoders)
    attn_decoder1 = AttnDecoderRNN(HIDDEN_SIZE, n_words_for_encoders, dropout_p=0.1).to(DEVICE)
    print("Training Encoder Decoder")
    for encoder in [encoder1]:#  , encoder3]: # , encoder2]:
        for decoder in [attn_decoder1]: #, attn_decoder1]:
            for i in range(N_EPOCHS):
                batch_data = next(iter(train_dataloader))
                xs, ys, glove_embeddings = [], [], []
                for segments, glove_embedding, duration in batch_data:
                    if len(segments) > 0:
                        xs.append(tensorFromSentence(phone_vocab, segments))
                        ys.append(tensorFromSentence(phone_vocab, segments))
                        glove_embeddings.append(glove_embedding)
                #print(glove_embedding, embedding_dim)
                #pairs = list(zip(xs, ys, glove_embeddings))
                pairs = list(zip(xs, ys))
                trainIters(pairs, encoder, decoder)
                # evaluateRandomly(encoder, decoder, pairs, phone_vocab, n=10)
                encoder_type = str(type(encoder)).split(".")[-1][:-2]
                decoder_type = str(type(decoder)).split(".")[-1][:-2]
                #torch.save(encoder, f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")
                torch.save(encoder, f"./2022-05-11_{encoder_type}_{decoder_type}.pt")
                #torch.save(encoder.state_dict(), f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")

    # train duration prediction model
    model = LinearModel(HIDDEN_SIZE, OUTPUT_SIZE)
    embedding_matrix = phone_embeds.embedding.weight.detach()
    print("Training model")
    for ix, encoder in enumerate([encoder1]):# , encoder3]): #, encoder3]):
        encoder.eval()
        encoder_hidden = encoder.initHidden()
        for i in range(N_EPOCHS):
            loss = 0
            criterion = torch.nn.MSELoss()
            #optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
            # make sure we "zero out" the loss at each time step
            optimizer.zero_grad()
            batch_data = next(iter(train_dataloader))
            xs, ys = [], []
            batch_segments = []
            #for idx, batch_data in enumerate(train_dataloader, 0):
            for segments, embedding, duration in batch_data:
              input_length = len(segments)
              encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                              device=DEVICE, requires_grad=False)
              if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                  #encoder_output, encoder_hidden = encoder(
                  #              input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                  encoder_output, encoder_hidden = encoder(
                                input_tensor[ei], encoder_hidden)
                  encoder_outputs[ei] += encoder_output[0, 0]
                  last_hidden_state = encoder_outputs[-1].detach()
                  xs.append(last_hidden_state.flatten())
                  ys.append(duration)
            xs = torch.stack(xs)
            ys = torch.Tensor(ys).reshape(-1, 1)
            loss = criterion(model(xs), ys)

            if i % EVERY == 0:
                print(f"Loss: {loss}")

            loss.backward()
            b, m = model.parameters()
            optimizer.step()


        print(f"Final loss on the training data for encoder {ix} is: {loss}")  # tensor(0.478)
        ## TEST ##
        test_criterion = torch.nn.MSELoss()
        test_xs, test_ys = [], []
        for segments, embedding, duration in batch_data:
            input_length = len(segments)
            encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                          device=DEVICE, requires_grad=False)
            if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                    #encoder_output, encoder_hidden = encoder(
                    #    input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                    encoder_output, encoder_hidden = encoder(
                        input_tensor[ei], encoder_hidden)
                    encoder_outputs[ei] += encoder_output[0, 0]
                last_hidden_state = encoder_outputs[-1].detach()
            test_xs.append(last_hidden_state)
            test_ys.append(duration)
        test_xs = torch.stack(test_xs)
        test_ys = torch.Tensor(test_ys).reshape(-1, 1)
        test_loss = test_criterion(model(test_xs), test_ys).detach()
        
        print(f"Final loss on the test data for encoder {ix} is: {test_loss}")  # tensor(0.7235)

In [60]:
main()


Training Encoder Decoder
Training model
Loss: 57.8001594543457
Loss: 0.9664241075515747
Loss: 0.6025242209434509
Loss: 0.6730889678001404
Loss: 0.619820237159729
Loss: 0.573050856590271
Loss: 0.43242523074150085
Loss: 0.5873600840568542
Loss: 0.5564441680908203
Loss: 0.5288306474685669
Final loss on the training data for encoder 0 is: 0.5834252834320068
Final loss on the test data for encoder 0 is: 0.8446372747421265


Encoder-Decoder With Attention and CBOW Embeddings






In [61]:
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import pickle, time, math, random, json
#from train_test_models import BuckeyeDataset, DataLoader
import dill
import torch.nn.functional as F

SOS_TOKEN = 0
EOS_TOKEN = 1
OUTPUT_SIZE = 1  # total word duration
MAX_LENGTH = 18
HIDDEN_SIZE = 8
DEVICE = 'cpu'
MAX_NORM = 1

LEARNING_RATE = 0.01
WEIGHT_DECAY = 0
MOMENTUM = 0.001
MAX_LOSS = 1.
N_EPOCHS = 300
BATCH_SIZE = 64
SHUFFLE = True
EMBEDDING_SIZE = 8
EVERY = 30
GLOVE_EMBED_DIM=50

EMBEDDING_FILE = './lab_2_data/word2_Vec_Lab_model.pt'
PHONE_VOCAB_FILE = './lab_2_data/word2_Vec_Lab_model.vocab'
TRAIN_PATH = './data/train.jsonl'
TEST_PATH = './data/test.jsonl'
VECS_PATH = "./data/buckeye.vecs"
SHUFFLE_DATA = True

def tokenize(vocab, list_of_segments):
    return torch.tensor(
        [vocab.vocab_to_ix[w] for w in list_of_segments if w in vocab.frequency_table] + [EOS_TOKEN], dtype=torch.long,
        device=DEVICE).view(-1, 1)


def process_segments_for_encoder(line_data):
    # Split every line into pairs and normalize
    split_str = line_data['observed_pron'].split(" ")
    duration = torch.log(torch.Tensor([sum(line_data['segment_duration_ms'])]))
    return (split_str, duration)


def tensorFromSentence(vocab, sentence):
    indices = tokenize(vocab, sentence)
    return indices


class Vocab():
    def __init__(self, segments: list):
        self._compute_frequency_table(segments)
        print(self.frequency_table)
        self._build_ix_to_vocab_dicts()

    def _compute_frequency_table(self, segments):
        self.frequency_table = Counter(segments)
        self.vocab_size = len(self.frequency_table)

    def _build_ix_to_vocab_dicts(self):
        self.ix_to_vocab = {
            i: phone for i, phone in enumerate(self.frequency_table)
            if self.frequency_table[phone] > 0
        }
        self.vocab_to_ix = {
            self.ix_to_vocab[w]: w for w in self.ix_to_vocab.keys()
        }

    def tokenize(self, list_of_segments):
        return torch.tensor(
            [self.vocab_to_ix[w] for w in list_of_segments], dtype=torch.long,
            device=DEVICE).view(-1, 1)

    def detokenize(self, tensor):
        return torch.tensor(
            [self.ix_to_vocab[ix] for ix in tensor], dtype=torch.long,
            device=DEVICE).view(-1, 1)


class Word2Vec(torch.nn.Module):
    def __init__(self, input_size: int, embedding_size: int, output_size: int = None, max_norm=None):
        super(Word2Vec, self).__init__()
        self.embedding = torch.nn.Embedding(
            input_size,
            embedding_size,
            max_norm=MAX_NORM
        )
        if output_size is None:
            self.linear = torch.nn.Linear(embedding_size, input_size)
        else:
            self.linear = torch.nn.Linear(embedding_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x


class EncoderRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderGloveRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, glove_size=None):
        super(EncoderGloveRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        if glove_size is not None:
            self.linear = torch.nn.Linear(glove_size, hidden_size)
            self.gru = torch.nn.GRU(hidden_size * 2, hidden_size)
        else:
            self.linear = torch.nn.Linear(0, hidden_size)
            self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden, glove_embedding=None):
        embedded = self.embedding(input).view(1, 1, -1)
        if glove_embedding is not None:
            linear = self.linear(glove_embedding).view(1, 1, -1)
            #print(embedded.shape, linear.shape)
            output = torch.cat((embedded, linear), axis=2)
        else:
            output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderRNN3(torch.nn.Module):
    def __init__(self, input_size, hidden_size, embeddings=None):
        super(EncoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        if embeddings is not None:
            input_size, hidden_size = embeddings.size()
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
            self.embedding.weight = torch.nn.Parameter(embeddings)
        else:
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH, glove_embedding=None):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max(input_length, max_length), encoder.hidden_size, device=DEVICE)

    loss = 0

    for ei in range(input_length):
        #encoder_output, encoder_hidden = encoder(
        #    input_tensor[ei], encoder_hidden, glove_embedding)
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_TOKEN]], device=DEVICE)

    decoder_hidden = encoder_hidden

    # use its own predictions as the next input
    for di in range(target_length):
        if type(decoder) is DecoderRNN:
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        elif type(decoder) is AttnDecoderRNN:
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
        loss += criterion(decoder_output, target_tensor[di].view(1))
        if decoder_input.item() == EOS_TOKEN:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return encoder, decoder, loss.item() / target_length


def trainIters(pairs, encoder, decoder, learning_rate=0.01):
    start = time.time()

    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = torch.nn.NLLLoss()

    for training_pair in pairs:
        #input_tensor, target_tensor, glove_embedding = training_pair
        input_tensor, target_tensor = training_pair

        #encoder, decoder, loss = train(
        #    input_tensor, target_tensor, encoder,
        #    decoder, encoder_optimizer, decoder_optimizer, criterion, glove_embedding=torch.Tensor(glove_embedding))
        encoder, decoder, loss = train(
            input_tensor, target_tensor, encoder,
            decoder, encoder_optimizer, decoder_optimizer, criterion,)


class LinearModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

def main():

    #fetching the data
    training_data = BuckeyeDataset(TRAIN_PATH, VECS_PATH)
    test_data = BuckeyeDataset(TEST_PATH, VECS_PATH)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE_DATA, collate_fn=lambda x: x
    )
    # loading the Vocab class
    phone_vocab = dill.load(open(PHONE_VOCAB_FILE, 'rb'))

    # loading learned phone embeddings from CBOW
    phone_embeds = Word2Vec(
        input_size=phone_vocab.vocab_size,
        embedding_size=EMBEDDING_SIZE,
        max_norm=MAX_NORM
    )
    phone_embeds.load_state_dict(torch.load(EMBEDDING_FILE))
    embedding_dim = phone_embeds.embedding.weight.size()[-1]


    #phone_embeds = torch.load(EMBEDDING_FILE)


    #phone_vocab = pickle.load(open(PHONE_VOCAB_FILE, 'rb'))
    #embedding_dim = phone_embeds.embedding.weight.size()[-1]

    #evaluating the max length
    MAX_LENGTH = max([len(s[0]) for s in training_data])


    # Encoder-decoder hidden states
    # Encoder-decoder hidden states + learned CBOW embeddings
    # Encoder-decoder hidden states + learned phone embeddings + GloVe embeddings
    # Encoder-decoder with attention hidden states
    # Encoder-decoder with attention hidden states + GloVe embeddings
    # Encoder-decoder with attention hidden states + learned CBOW embeddings + GloVe embeddings

    n_words_for_encoders = phone_vocab.vocab_size + len([SOS_TOKEN, EOS_TOKEN])
    encoder1 = EncoderRNN(n_words_for_encoders, HIDDEN_SIZE).to(DEVICE)
    encoder2 = EncoderGloveRNN(n_words_for_encoders, HIDDEN_SIZE, GLOVE_EMBED_DIM).to(DEVICE)
    encoder3 = EncoderRNN3(n_words_for_encoders, HIDDEN_SIZE,
                           embeddings=torch.nn.Parameter(phone_embeds.embedding.weight)).to(DEVICE)

    decoder1 = DecoderRNN(HIDDEN_SIZE, n_words_for_encoders)
    attn_decoder1 = AttnDecoderRNN(HIDDEN_SIZE, n_words_for_encoders, dropout_p=0.1).to(DEVICE)
    print("Training Encoder Decoder")
    for encoder in [encoder1, encoder3]:#  , encoder3]: # , encoder2]:
        for decoder in [attn_decoder1]: #, attn_decoder1]:
            for i in range(N_EPOCHS):
                batch_data = next(iter(train_dataloader))
                xs, ys, glove_embeddings = [], [], []
                for segments, glove_embedding, duration in batch_data:
                    if len(segments) > 0:
                        xs.append(tensorFromSentence(phone_vocab, segments))
                        ys.append(tensorFromSentence(phone_vocab, segments))
                        glove_embeddings.append(glove_embedding)
                #print(glove_embedding, embedding_dim)
                #pairs = list(zip(xs, ys, glove_embeddings))
                pairs = list(zip(xs, ys))
                trainIters(pairs, encoder, decoder)
                # evaluateRandomly(encoder, decoder, pairs, phone_vocab, n=10)
                encoder_type = str(type(encoder)).split(".")[-1][:-2]
                decoder_type = str(type(decoder)).split(".")[-1][:-2]
                #torch.save(encoder, f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")
                torch.save(encoder, f"./2022-05-11_{encoder_type}_{decoder_type}.pt")
                #torch.save(encoder.state_dict(), f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")

    # train duration prediction model
    model = LinearModel(HIDDEN_SIZE, OUTPUT_SIZE)
    embedding_matrix = phone_embeds.embedding.weight.detach()
    print("Training model")
    for ix, encoder in enumerate([encoder1, encoder3]):# , encoder3]): #, encoder3]):
        encoder.eval()
        encoder_hidden = encoder.initHidden()
        for i in range(N_EPOCHS):
            loss = 0
            criterion = torch.nn.MSELoss()
            #optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
            # make sure we "zero out" the loss at each time step
            optimizer.zero_grad()
            batch_data = next(iter(train_dataloader))
            xs, ys = [], []
            batch_segments = []
            #for idx, batch_data in enumerate(train_dataloader, 0):
            for segments, embedding, duration in batch_data:
              input_length = len(segments)
              encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                              device=DEVICE, requires_grad=False)
              if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                  #encoder_output, encoder_hidden = encoder(
                  #              input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                  encoder_output, encoder_hidden = encoder(
                                input_tensor[ei], encoder_hidden)
                  encoder_outputs[ei] += encoder_output[0, 0]
                  last_hidden_state = encoder_outputs[-1].detach()
                  xs.append(last_hidden_state.flatten())
                  ys.append(duration)
            xs = torch.stack(xs)
            ys = torch.Tensor(ys).reshape(-1, 1)
            loss = criterion(model(xs), ys)

            if i % EVERY == 0:
                print(f"Loss: {loss}")

            loss.backward()
            b, m = model.parameters()
            optimizer.step()


        print(f"Final loss on the training data for encoder {ix} is: {loss}")  # tensor(0.478)
        ## TEST ##
        test_criterion = torch.nn.MSELoss()
        test_xs, test_ys = [], []
        for segments, embedding, duration in batch_data:
            input_length = len(segments)
            encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                          device=DEVICE, requires_grad=False)
            if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                    #encoder_output, encoder_hidden = encoder(
                    #    input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                    encoder_output, encoder_hidden = encoder(
                        input_tensor[ei], encoder_hidden)
                    encoder_outputs[ei] += encoder_output[0, 0]
                last_hidden_state = encoder_outputs[-1].detach()
            test_xs.append(last_hidden_state)
            test_ys.append(duration)
        test_xs = torch.stack(test_xs)
        test_ys = torch.Tensor(test_ys).reshape(-1, 1)
        test_loss = test_criterion(model(test_xs), test_ys).detach()
        
        print(f"Final loss on the test data for encoder {ix} is: {test_loss}")  # tensor(0.7235)

In [None]:
main()

Training Encoder Decoder
Training model
Loss: 55.89055252075195
Loss: 0.8378207087516785
Loss: 0.6185505986213684
Loss: 0.691376268863678
Loss: 0.5926519632339478
Loss: 0.8279205560684204
Loss: 0.9110522270202637
Loss: 0.4455769658088684
Loss: 0.8437288403511047
Final loss on the training data for encoder 0 is: 0.4587155878543854
Final loss on the test data for encoder 0 is: 0.48783427476882935
Loss: 22.000808715820312
Loss: 1.621963381767273
Loss: 0.8436983823776245
Loss: 0.9428918957710266
Loss: 1.2026623487472534
Loss: 0.8381234407424927
Loss: 0.42408397793769836
Loss: 0.7704006433486938
Loss: 1.0019842386245728
Loss: 0.9775902032852173
Final loss on the training data for encoder 1 is: 0.6189470291137695
Final loss on the test data for encoder 1 is: 0.9605862498283386


Encoder-Decoder With Attention and CBOW, GLOVE Embeddings

In [51]:
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import pickle, time, math, random, json
#from train_test_models import BuckeyeDataset, DataLoader
import dill
import torch.nn.functional as F

SOS_TOKEN = 0
EOS_TOKEN = 1
OUTPUT_SIZE = 1  # total word duration
MAX_LENGTH = 18
HIDDEN_SIZE = 8
DEVICE = 'cpu'
MAX_NORM = 1

LEARNING_RATE = 0.01
WEIGHT_DECAY = 0
MOMENTUM = 0.0001
MAX_LOSS = 1.
N_EPOCHS = 500
BATCH_SIZE = 64
SHUFFLE = True
EMBEDDING_SIZE = 8
EVERY = 50
GLOVE_EMBED_DIM=50

EMBEDDING_FILE = './lab_2_data/word2_Vec_Lab_model.pt'
PHONE_VOCAB_FILE = './lab_2_data/word2_Vec_Lab_model.vocab'
TRAIN_PATH = './data/train.jsonl'
TEST_PATH = './data/test.jsonl'
VECS_PATH = "./data/buckeye.vecs"
SHUFFLE_DATA = True

def tokenize(vocab, list_of_segments):
    return torch.tensor(
        [vocab.vocab_to_ix[w] for w in list_of_segments if w in vocab.frequency_table] + [EOS_TOKEN], dtype=torch.long,
        device=DEVICE).view(-1, 1)


def process_segments_for_encoder(line_data):
    # Split every line into pairs and normalize
    split_str = line_data['observed_pron'].split(" ")
    duration = torch.log(torch.Tensor([sum(line_data['segment_duration_ms'])]))
    return (split_str, duration)


def tensorFromSentence(vocab, sentence):
    indices = tokenize(vocab, sentence)
    return indices


class Vocab():
    def __init__(self, segments: list):
        self._compute_frequency_table(segments)
        print(self.frequency_table)
        self._build_ix_to_vocab_dicts()

    def _compute_frequency_table(self, segments):
        self.frequency_table = Counter(segments)
        self.vocab_size = len(self.frequency_table)

    def _build_ix_to_vocab_dicts(self):
        self.ix_to_vocab = {
            i: phone for i, phone in enumerate(self.frequency_table)
            if self.frequency_table[phone] > 0
        }
        self.vocab_to_ix = {
            self.ix_to_vocab[w]: w for w in self.ix_to_vocab.keys()
        }

    def tokenize(self, list_of_segments):
        return torch.tensor(
            [self.vocab_to_ix[w] for w in list_of_segments], dtype=torch.long,
            device=DEVICE).view(-1, 1)

    def detokenize(self, tensor):
        return torch.tensor(
            [self.ix_to_vocab[ix] for ix in tensor], dtype=torch.long,
            device=DEVICE).view(-1, 1)


class Word2Vec(torch.nn.Module):
    def __init__(self, input_size: int, embedding_size: int, output_size: int = None, max_norm=None):
        super(Word2Vec, self).__init__()
        self.embedding = torch.nn.Embedding(
            input_size,
            embedding_size,
            max_norm=MAX_NORM
        )
        if output_size is None:
            self.linear = torch.nn.Linear(embedding_size, input_size)
        else:
            self.linear = torch.nn.Linear(embedding_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x


class EncoderRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderGloveRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, glove_size=None):
        super(EncoderGloveRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        if glove_size is not None:
            self.linear = torch.nn.Linear(glove_size, hidden_size)
            self.gru = torch.nn.GRU(hidden_size * 2, hidden_size)
        else:
            self.linear = torch.nn.Linear(0, hidden_size)
            self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden, glove_embedding=None):
        embedded = self.embedding(input).view(1, 1, -1)
        if glove_embedding is not None:
            linear = self.linear(glove_embedding).view(1, 1, -1)
            #print(embedded.shape, linear.shape)
            output = torch.cat((embedded, linear), axis=2)
        else:
            output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class EncoderRNN3(torch.nn.Module):
    def __init__(self, input_size, hidden_size, embeddings=None):
        super(EncoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        if embeddings is not None:
            input_size, hidden_size = embeddings.size()
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
            self.embedding.weight = torch.nn.Parameter(embeddings)
        else:
            self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class DecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH, glove_embedding=None):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max(input_length, max_length), encoder.hidden_size, device=DEVICE)

    loss = 0

    for ei in range(input_length):
      if(type(encoder) is EncoderGloveRNN):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden, glove_embedding)
      else:
         encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)       
      encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_TOKEN]], device=DEVICE)

    decoder_hidden = encoder_hidden

    # use its own predictions as the next input
    for di in range(target_length):
        if type(decoder) is DecoderRNN:
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        elif type(decoder) is AttnDecoderRNN:
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
        loss += criterion(decoder_output, target_tensor[di].view(1))
        if decoder_input.item() == EOS_TOKEN:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return encoder, decoder, loss.item() / target_length


def trainIters(pairs, encoder, decoder, learning_rate=0.01):
    start = time.time()

    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = torch.nn.NLLLoss()

    for training_pair in pairs:
        input_tensor, target_tensor, glove_embedding = training_pair
        #input_tensor, target_tensor = training_pair

        encoder, decoder, loss = train(
            input_tensor, target_tensor, encoder,
            decoder, encoder_optimizer, decoder_optimizer, criterion, glove_embedding=torch.Tensor(glove_embedding))
        #encoder, decoder, loss = train(
        #    input_tensor, target_tensor, encoder,
        #    decoder, encoder_optimizer, decoder_optimizer, criterion,)


class LinearModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

def main():

    #fetching the data
    training_data = BuckeyeDataset(TRAIN_PATH, VECS_PATH)
    test_data = BuckeyeDataset(TEST_PATH, VECS_PATH)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=SHUFFLE_DATA, collate_fn=lambda x: x
    )
    # loading the Vocab class
    phone_vocab = dill.load(open(PHONE_VOCAB_FILE, 'rb'))

    # loading learned phone embeddings from CBOW
    phone_embeds = Word2Vec(
        input_size=phone_vocab.vocab_size,
        embedding_size=EMBEDDING_SIZE,
        max_norm=MAX_NORM
    )
    phone_embeds.load_state_dict(torch.load(EMBEDDING_FILE))
    embedding_dim = phone_embeds.embedding.weight.size()[-1]


    #phone_embeds = torch.load(EMBEDDING_FILE)


    #phone_vocab = pickle.load(open(PHONE_VOCAB_FILE, 'rb'))
    #embedding_dim = phone_embeds.embedding.weight.size()[-1]

    #evaluating the max length
    MAX_LENGTH = max([len(s[0]) for s in training_data])


    # Encoder-decoder hidden states
    # Encoder-decoder hidden states + learned CBOW embeddings
    # Encoder-decoder hidden states + learned phone embeddings + GloVe embeddings
    # Encoder-decoder with attention hidden states
    # Encoder-decoder with attention hidden states + GloVe embeddings
    # Encoder-decoder with attention hidden states + learned CBOW embeddings + GloVe embeddings

    n_words_for_encoders = phone_vocab.vocab_size + len([SOS_TOKEN, EOS_TOKEN])
    encoder1 = EncoderRNN(n_words_for_encoders, HIDDEN_SIZE).to(DEVICE)
    encoder2 = EncoderGloveRNN(n_words_for_encoders, HIDDEN_SIZE, GLOVE_EMBED_DIM).to(DEVICE)
    encoder3 = EncoderRNN3(n_words_for_encoders, HIDDEN_SIZE,
                           embeddings=torch.nn.Parameter(phone_embeds.embedding.weight)).to(DEVICE)

    decoder1 = DecoderRNN(HIDDEN_SIZE, n_words_for_encoders)
    attn_decoder1 = AttnDecoderRNN(HIDDEN_SIZE, n_words_for_encoders, dropout_p=0.1).to(DEVICE)
    print("Training Encoder Decoder")
    for encoder in [encoder1, encoder2, encoder3]:#  , encoder3]: # , encoder2]:
        for decoder in [attn_decoder1]: #, attn_decoder1]:
            for i in range(N_EPOCHS):
                batch_data = next(iter(train_dataloader))
                xs, ys, glove_embeddings = [], [], []
                for segments, glove_embedding, duration in batch_data:
                    if len(segments) > 0:
                        xs.append(tensorFromSentence(phone_vocab, segments))
                        ys.append(tensorFromSentence(phone_vocab, segments))
                        glove_embeddings.append(glove_embedding)
                #print(glove_embedding, embedding_dim)
                pairs = list(zip(xs, ys, glove_embeddings))
                #pairs = list(zip(xs, ys))
                trainIters(pairs, encoder, decoder)
                # evaluateRandomly(encoder, decoder, pairs, phone_vocab, n=10)
                encoder_type = str(type(encoder)).split(".")[-1][:-2]
                decoder_type = str(type(decoder)).split(".")[-1][:-2]
                #torch.save(encoder, f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")
                torch.save(encoder, f"./2022-05-11_{encoder_type}_{decoder_type}.pt")
                #torch.save(encoder.state_dict(), f"./2022-05-11_{type(encoder)}_{type(decoder)}.pt")

    # train duration prediction model
    model = LinearModel(HIDDEN_SIZE, OUTPUT_SIZE)
    embedding_matrix = phone_embeds.embedding.weight.detach()
    print("Training model")
    for ix, encoder in enumerate([encoder1, encoder2, encoder3]):# , encoder3]): #, encoder3]):
        encoder.eval()
        encoder_hidden = encoder.initHidden()
        for i in range(N_EPOCHS):
            loss = 0
            criterion = torch.nn.MSELoss()
            #optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
            # make sure we "zero out" the loss at each time step
            optimizer.zero_grad()
            batch_data = next(iter(train_dataloader))
            xs, ys = [], []
            batch_segments = []
            #for idx, batch_data in enumerate(train_dataloader, 0):
            for segments, embedding, duration in batch_data:
              input_length = len(segments)
              encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                              device=DEVICE, requires_grad=False)
              if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                  if(type(encoder) is EncoderGloveRNN):
                    encoder_output, encoder_hidden = encoder(
                      input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                  else:
                    encoder_output, encoder_hidden = encoder(
                      input_tensor[ei], encoder_hidden)
                  #encoder_output, encoder_hidden = encoder(
                  #              input_tensor[ei], encoder_hidden)
                  encoder_outputs[ei] += encoder_output[0, 0]
                  last_hidden_state = encoder_outputs[-1].detach()
                  xs.append(last_hidden_state.flatten())
                  ys.append(duration)
            xs = torch.stack(xs)
            ys = torch.Tensor(ys).reshape(-1, 1)
            loss = criterion(model(xs), ys)

            if i % EVERY == 0:
                print(f"Loss: {loss}")

            loss.backward()
            b, m = model.parameters()
            optimizer.step()


        print(f"Final loss on the training data for encoder {ix} is: {loss}")  # tensor(0.478)
        ## TEST ##
        test_criterion = torch.nn.MSELoss()
        test_xs, test_ys = [], []
        for segments, embedding, duration in batch_data:
            input_length = len(segments)
            encoder_outputs = torch.zeros(input_length, HIDDEN_SIZE,
                                          device=DEVICE, requires_grad=False)
            if input_length > 0:
                input_tensor = tokenize(phone_vocab, segments)
                for ei in range(input_length):
                  if(type(encoder) is EncoderGloveRNN):
                    encoder_output, encoder_hidden = encoder(
                      input_tensor[ei], encoder_hidden, torch.Tensor(embedding))
                  else:
                    encoder_output, encoder_hidden = encoder(
                      input_tensor[ei], encoder_hidden)
                    #encoder_output, encoder_hidden = encoder(
                    #    input_tensor[ei], encoder_hidden)
                    encoder_outputs[ei] += encoder_output[0, 0]
                last_hidden_state = encoder_outputs[-1].detach()
            test_xs.append(last_hidden_state)
            test_ys.append(duration)
        test_xs = torch.stack(test_xs)
        test_ys = torch.Tensor(test_ys).reshape(-1, 1)
        test_loss = test_criterion(model(test_xs), test_ys).detach()
        
        print(f"Final loss on the test data for encoder {ix} is: {test_loss}")  # tensor(0.7235)

In [52]:
main()

Training Encoder Decoder
Training model
Loss: 67.20687866210938
Loss: 1.084847092628479
Loss: 1.0670005083084106
Loss: 0.5969911813735962
Loss: 0.6299929618835449
Loss: 0.6387811303138733
Loss: 0.6426233053207397
Loss: 0.5377352833747864
Loss: 1.0618247985839844
Loss: 0.5671363472938538
Final loss on the training data for encoder 0 is: 0.6335338950157166
Final loss on the test data for encoder 0 is: 1.0975581407546997
Loss: 2.2431862354278564
Loss: 1.1846762895584106
Loss: 0.9475269317626953
Loss: 0.670482873916626
Loss: 0.48679137229919434
Loss: 0.6842508316040039
Loss: 0.6521562337875366
Loss: 0.6253276467323303
Loss: 0.4932592809200287
Loss: 0.6500654220581055
Final loss on the training data for encoder 1 is: 0.6114843487739563
Final loss on the test data for encoder 1 is: 25.575294494628906
Loss: 1.0935804843902588
Loss: 0.6774217486381531
Loss: 0.5994555950164795
Loss: 0.4604150056838989
Loss: 0.3727049231529236
Loss: 0.5391032099723816
Loss: 0.39320316910743713
Loss: 0.3938292562