Final Project by Ike Pawsat, Luke Gearin, and Sam Forde

# Model 1: BERT Encoder with Roommate Dataset

Text-to-speech AI using BERT encoder with a CNN to recreate my roommates voice.

## Imports and Data Processing

In [1]:
pip install transformers torchaudio librosa matplotlib

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [2]:
#import statements
from transformers import BertTokenizer, BertModel, DistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
import string
import torchaudio

In [119]:
#google drive mounting and dataset
from google.colab import drive
drive.mount("/content/drive")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

dataset_path = "/content/drive/MyDrive/TTS_Dataset/Nick_Data/bee_movie_script.csv"  #update to our csv
wav_path = "/content/drive/MyDrive/TTS_Dataset/Nick_Data/recordings" #folder with wav files

#make dataframe
nick_data = pd.read_csv(dataset_path)
#skipped a line in recording so the first line becomes the column name, and I rename that column name to "lines"
nick_data.columns = ["lines"]
nick_data = nick_data[2:]
print(nick_data.head(3))

#removing punctuation now
nick_data["lines"] = nick_data["lines"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))
print(nick_data.head(3))

#cannot do gdown because .wav files, there is a 50 file limit

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
                                               lines
2  Yellow, black. Yellow, black. Yellow, black. Y...
3                             Ooh, black and yellow!
4                        Let's shake it up a little.
                                               lines
2  Yellow black Yellow black Yellow black Yellow ...
3                               Ooh black and yellow
4                          Lets shake it up a little


In [127]:
#wav path defined above in wav_path
def make_wav_filename(index):
    return os.path.join(wav_path, f"{index:04d}.wav")

#add new wav_path for each file for the training
nick_data["wav_path"] = nick_data.index.map(make_wav_filename)
nick_data = nick_data[nick_data["wav_path"].apply(os.path.exists)]

#checks
print(nick_data[["lines", "wav_path"]].head(3))
print(nick_data.shape) #check shape
print(nick_data.loc[3, "wav_path"]) #check that 0 is 0000.wav because pandas truncates it

                                               lines  \
2  Yellow black Yellow black Yellow black Yellow ...   
3                               Ooh black and yellow   
4                          Lets shake it up a little   

                                            wav_path  
2  /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...  
3  /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...  
4  /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...  
(1285, 2)
/content/drive/MyDrive/TTS_Dataset/Nick_Data/recordings/0003.wav


In [129]:
#typical 80 / 20 split
train_df, test_df = train_test_split(nick_data, test_size=0.2, random_state=42)

#sizes
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

#metadata format: wav_path|transcript
train_df["metadata"] = train_df.apply(lambda row: f"{row['wav_path']}|{row['lines']}", axis=1) #separated by |
test_df["metadata"] = test_df.apply(lambda row: f"{row['wav_path']}|{row['lines']}", axis=1)

#make csvs
train_df["metadata"].to_csv("train_metadata.csv", index=False, header=False)
test_df["metadata"].to_csv("test_metadata.csv", index=False, header=False)

print(train_df.head(3))
print(train_df.loc[1000, "metadata"]) #check if it combines lines with wav_path

Train size: 1028
Test size: 257
                                                  lines  \
970   And assuming youve done step correctly youre r...   
748                                         This is Ken   
1149                                                Yes   

                                               wav_path  \
970   /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...   
748   /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...   
1149  /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...   

                                               metadata  
970   /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...  
748   /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...  
1149  /content/drive/MyDrive/TTS_Dataset/Nick_Data/r...  
/content/drive/MyDrive/TTS_Dataset/Nick_Data/recordings/1000.wav|All the honey will finally belong to the bees


## Model Setup

In [130]:
#use pretrained bert_model
#bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased") #smaller so trains faster

#tokenize data
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") #smaller so trains faster

In [131]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class TTSDataset(Dataset):
    def __init__(self, metadata_path, max_len=512):
        self.samples = []
        with open(metadata_path, "r") as f:
            for line in f:
                wav_path, text = line.strip().split("|")
                self.samples.append((wav_path, text))
        self.max_len = max_len
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=40000,
            n_fft=1024,
            hop_length=256,
            n_mels=80
        )

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        wav_path, text = self.samples[idx]

        # Load audio
        waveform, sr = torchaudio.load(wav_path)
        if sr != 40000:
            resampler = torchaudio.transforms.Resample(sr, 40000)
            waveform = resampler(waveform)

        mel = self.mel_transform(waveform).squeeze(0).transpose(0, 1)

        input_ids = tokenizer.encode(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_len
        ).squeeze(0)

        return input_ids, mel


def collate_fn(batch):
    input_ids, mels = zip(*batch)

    input_ids = torch.stack(input_ids)

    max_mel_len = max(mel.shape[0] for mel in mels)
    mel_dim = mels[0].shape[1]
    padded_mels = torch.zeros(len(mels), max_mel_len, mel_dim)

    for i, mel in enumerate(mels):
        padded_mels[i, :mel.shape[0], :] = mel

    return input_ids, padded_mels


train_dataset = TTSDataset("train_metadata.csv")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, num_workers=4, pin_memory=True) #pin_memory speeds up GPU transfers


layers = [
    nn.Conv1d(in_channels=768, out_channels=512, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Conv1d(in_channels=512, out_channels=256, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Conv1d(in_channels=256, out_channels=80, kernel_size=1)
]

cnn = nn.Sequential(*layers).to(device)
optimizer = optim.Adam(cnn.parameters(), lr=1e-5)



## Model Training

In [136]:
import os
import torch
import torchaudio
import torchaudio.transforms as T

bert_model.eval()
bert_model.to(device)
loss_function = torch.nn.L1Loss()
num_epochs = 200 #honestly all you need since it is a small dataset

save_dir = "/content/drive/MyDrive/generated_wavs" #where to save .wavs
os.makedirs(save_dir, exist_ok=True)

# Griffin-Lim vocoder (basic mel -> waveform)
mel_to_audio = T.GriffinLim(n_fft=1024, n_iter=32)

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    epoch_loss = 0.0
    total_batches = len(train_loader)

    for batch_idx, batch in enumerate(train_loader):
        input_ids, mel_targets = batch  #input_ids: [B, max_len], mel_targets: [B, T, 80]
        input_ids = input_ids.to(device)
        mel_targets = mel_targets.to(device)

        with torch.no_grad():
            bert_outputs = bert_model(input_ids)
            embeddings = bert_outputs.last_hidden_state  #[B, max_len, 768]

        x = embeddings.permute(0, 2, 1)  #[B, 768, max_len]
        out = cnn(x).permute(0, 2, 1)  #[B, T, 80]

        min_len = min(out.shape[1], mel_targets.shape[1])
        out = out[:, :min_len, :]
        mel_targets = mel_targets[:, :min_len, :]

        loss = loss_function(out, mel_targets)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()
        if (batch_idx + 1) % 10 == 0 or (batch_idx + 1) == total_batches:
            print(f"  Batch {batch_idx + 1}/{total_batches} | Loss: {loss.item():.4f}")

    avg_loss = epoch_loss / total_batches
    print(f"Epoch {epoch+1} complete. Avg Loss: {avg_loss:.4f}")

    mel_inv = T.InverseMelScale(n_stft=513, n_mels=80, sample_rate=40000)
    mel_to_audio = T.GriffinLim(n_fft=1024, n_iter=32)

    with torch.no_grad():
        sample_mel = mel_targets[0].cpu().T  #[80, T]
        spec = mel_inv(sample_mel)  #should be [513, T]

        #reconstruct waveform
        waveform = mel_to_audio(spec)
        waveform = waveform / waveform.abs().max()

        save_path = os.path.join(save_dir, f"epoch{epoch+1}_target.wav")
        torchaudio.save(save_path, waveform.unsqueeze(0), sample_rate=40000)
        print(f"Saved target audio sample to {save_path}")

save_model = os.path.join(save_dir, "model.pth")
torch.save(cnn.state_dict(), save_model)
print(f"Saved model to {save_model}")

#I attempted to use spectral loss but since the dataset is so small it did not help


Epoch 1/200
  Batch 10/33 | Loss: 6.3880
  Batch 20/33 | Loss: 4.8947
  Batch 30/33 | Loss: 9.0659
  Batch 33/33 | Loss: 4.5068
Epoch 1 complete. Avg Loss: 7.4565
Saved target audio sample to /content/drive/MyDrive/generated_wavs/epoch1_target.wav

Epoch 2/200
  Batch 10/33 | Loss: 9.3630
  Batch 20/33 | Loss: 6.3160
  Batch 30/33 | Loss: 6.0855
  Batch 33/33 | Loss: 5.1039
Epoch 2 complete. Avg Loss: 7.4723
Saved target audio sample to /content/drive/MyDrive/generated_wavs/epoch2_target.wav

Epoch 3/200
  Batch 10/33 | Loss: 6.2270
  Batch 20/33 | Loss: 8.8744
  Batch 30/33 | Loss: 8.9875
  Batch 33/33 | Loss: 5.4388
Epoch 3 complete. Avg Loss: 7.4810
Saved target audio sample to /content/drive/MyDrive/generated_wavs/epoch3_target.wav

Epoch 4/200
  Batch 10/33 | Loss: 5.6293
  Batch 20/33 | Loss: 11.4215
  Batch 30/33 | Loss: 7.2444
  Batch 33/33 | Loss: 3.1441
Epoch 4 complete. Avg Loss: 7.4200
Saved target audio sample to /content/drive/MyDrive/generated_wavs/epoch4_target.wav

Ep

## Model 1 Text input

In [137]:
#text in put code
phrase = "yellow black yellow black"
phrase_tokenized = tokenizer.encode(phrase)
phrase_tokenized = torch.tensor(phrase_tokenized).unsqueeze(0)
phrase_tensor = phrase_tokenized.to(device)

#bert model
with torch.no_grad():
    bert_outputs = bert_model(phrase_tensor)
    embeddings = bert_outputs.last_hidden_state
    embedded_input = embeddings.permute(0, 2, 1)

x = embeddings.permute(0, 2, 1)

#load model from .pth file
model_path = "/content/drive/MyDrive/generated_wavs/model.pth"
cnn.load_state_dict(torch.load(model_path))
cnn.eval()
model = cnn.to(device)

output_save_dir = "/content/drive/MyDrive/specific_phrases"
os.makedirs(output_save_dir, exist_ok=True)

#reconstruct waveform
output = model(embedded_input).permute(0, 2, 1)
save_path = os.path.join(output_save_dir, f"{phrase}.wav")
waveform = mel_to_audio(spec)
waveform = waveform / waveform.abs().max()
torchaudio.save(save_path, waveform.unsqueeze(0), sample_rate=40000)

# Model 2: Variational Auto-Encoder


Based on https://arxiv.org/pdf/2107.03298

## Text Encoder
Convolution layers with dropout, batch normalization and ReLU activation. Follow this with positional encoding, and then self attention blocks.

## Model Evaluation

In [None]:
def sinosoidal_position_encoding(token_size, embedding_dim):
  pos = torch.arange(0, token_size).unsqueeze(1)
  emb = torch.zeros(token_size, embedding_dim)

  emb[:, 0::2] = torch.sin(pos/torch.pow(10000, 2*torch.arange(0, embedding_dim//2)/embedding_dim))
  emb[:, 1::2] = torch.cos(pos/torch.pow(10000, 2*torch.arange(0, embedding_dim//2)/embedding_dim))

  return emb



class ConvStack(torch.nn.Module):
  def __init__(self, D, K):
    super(ConvStack, self).__init__()
    self.conv = torch.nn.Conv1d(D, D, K)
    self.norm = torch.nn.BatchNorm1d(D)
    self.relu = torch.nn.ReLU()
    self.dropout = torch.nn.Dropout1d()
  def forward(self, X):
    convolution = self.conv(X)
    normalization = self.norm(convolution)
    relu = self.relu(normalization)
    output = self.dropout(relu)
    return output

class TextEncoder(torch.nn.Module):
  def __init__(self, embedding_size, conv_size, K):
    self.embedding = torch.nn.Embedding(embedding_size, conv_size)
    self.stack1 = ConvStack(conv_size, K)
    self.stack2 = ConvStack(conv_size, K)
    self.stack3 = ConvStack(conv_size, K)
    self.stack4 = ConvStack(conv_size, K)
    self.stack5 = ConvStack(conv_size, K)
    self.attention1 = torch.nn.MultiheadAttention(embedding_size, 4)
    self.attention2 = torch.nn.MultiheadAttention(embedding_size, 4)
    self.attention3 = torch.nn.MultiheadAttention(embedding_size, 4)
    self.attention4 = torch.nn.MultiheadAttention(embedding_size, 4)

  def forward(self, X):
    embed = self.embedding(X)
    conv1 = self.stack1(embed)
    conv2 = self.stack2(conv1)
    conv3 = self.stack3(conv1)
    conv4 = self.stack4(conv1)
    conv5 = self.stack5(conv1)
    conv_pos = conv5 + sinosoidal_position_encoding(X.size(0), 512)
    attn1 = self.attention1(conv_pos)
    attn2 = self.attention2(attn1)
    attn3 = self.attention3(attn2)
    attn4 = self.attention4(attn3)

    return attn4

