<a href="https://colab.research.google.com/github/kuba-cherryb/voice-anonymization-EN-PL/blob/main/voice_anonymization_EN_CMU_ARCTIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<center><font size="4"><b>Technologia Mowy 2024</b></font></center>
<br />
<center><font size="6"><b><u>Project 2: Voice anonymization.</u></b></font></center>
<center>Jakub Czernecki, Wojciech Sabała, Bartosz Wąsik, 01/2025</center>

# Environment setup

In [18]:
import os

# Fetch the project repository and make it the working directory so that
# relative paths used later (e.g. 'resynthesis_input.wav') resolve.
# NOTE(review): re-running this cell clones again inside the current directory
# and descends one level deeper each time (the path echoed by %cd in the saved
# output is already nested three levels) -- restart the runtime before re-running.
!git clone https://github.com/kuba-cherryb/voice-anonymization-EN-PL
%cd voice-anonymization-EN-PL
# Install the third-party packages imported below. Versions are not pinned.
!pip install datasets
!pip install speechbrain
!pip install git+https://github.com/openai/whisper.git

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset, Audio
import torch
import soundfile as sf
# NOTE: this import rebinds `Audio`; from here on `Audio` is the IPython
# playback widget, not `datasets.Audio` imported two lines above.
from IPython.display import Audio
from speechbrain.inference.speaker import EncoderClassifier
import torchaudio
import numpy as np
import warnings
import whisper
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

Cloning into 'voice-anonymization-EN-PL'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), 202.42 KiB | 5.95 MiB/s, done.
/content/voice-anonimization-EN-PL/voice-anonimization-EN-PL/voice-anonymization-EN-PL
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-j_ubq87a
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-j_ubq87a
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## Pretrained models

### Speech to Text - [Whisper](https://github.com/openai/whisper)

In [19]:
# Load the Whisper "base.en" checkpoint; it is used below as the speech-to-text
# model that transcribes the input recording.
transcriptor = whisper.load_model("base.en")

### X-vector extractor - [SpeechBrain Speaker Verification with xvector embeddings on Voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb)

In [20]:
# Load the pretrained SpeechBrain x-vector speaker encoder from the
# "speechbrain/spkrec-xvect-voxceleb" source; its files are placed under
# `savedir` (relative to the current working directory).
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb")

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/hyperparams.yaml' -> '/content/voice-anonimization-EN-PL/voice-anonimization-EN-PL/voice-anonymization-EN-PL/pretrained_models/spkrec-xvect-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-xvect-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggin

### Synthesizer - [SpeechT5](https://huggingface.co/blog/speecht5)

In [21]:
# SpeechT5 text-to-speech stack, used together in the synthesis cells:
#   processor - turns the transcript text into model inputs ("input_ids")
#   model     - generates speech conditioned on a speaker x-vector
#   vocoder   - passed to `model.generate_speech(..., vocoder=vocoder)`
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

## X-vector database - [CMU ARCTIC](https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors)

In [22]:
# Download the CMU ARCTIC x-vector collection and keep only its "xvector"
# column, stacked into a single tensor with one row per stored embedding.
xvector_database = torch.tensor(
    load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")["xvector"]
)

# Analysis

## Speech to text

In [7]:
# List the working directory to confirm that 'resynthesis_input.wav'
# (read by the following cells) is present.
%ls

[0m[01;34mpretrained_models[0m/  resynthesis_input.wav


In [8]:
# Transcribe the input recording with the Whisper model loaded above.
# The result is indexed by "text" here and again in every synthesis cell.
transcription = transcriptor.transcribe("resynthesis_input.wav")
print(transcription["text"])

 How does it feel? How does it feel? To be on your own with no directions home. A complete unknown like a rolling stone.


## X-vector extraction

In [9]:
# The rest of the notebook treats `signal` as 16 kHz audio (it is played back
# with rate=16000) and the spkrec-xvect-voxceleb model card specifies 16 kHz
# input, so resample instead of silently assuming the file already matches.
TARGET_SAMPLE_RATE = 16000

signal, fs = torchaudio.load('resynthesis_input.wav')
if fs != TARGET_SAMPLE_RATE:
  signal = torchaudio.functional.resample(signal, orig_freq=fs, new_freq=TARGET_SAMPLE_RATE)
  fs = TARGET_SAMPLE_RATE  # keep `fs` describing the (now resampled) `signal`

# Only the first channel is embedded. squeeze(0) drops the leading batch axis,
# leaving the (1, 512) embedding shape printed below.
extracted_xvector = classifier.encode_batch(signal[0]).squeeze(0)
print(extracted_xvector.shape)

torch.Size([1, 512])


# Anonymization

In [10]:
def anonymize(xvector, database=None, n_farthest=3):
  """Build three anonymized speaker embeddings for a source x-vector.

  Args:
    xvector: Source speaker embedding, shape (1, D) or (D,).
    database: Tensor of shape (N, D) holding candidate speaker embeddings.
      Defaults to the notebook-level `xvector_database`.
    n_farthest: How many of the most distant database speakers are averaged
      for the first result. Clamped to the database size.

  Returns:
    A tuple of three new tensors, each of shape (1, D):
      1. the mean of the `n_farthest` database speakers farthest from `xvector`,
      2. the single farthest database speaker,
      3. one database speaker picked uniformly at random (via `np.random`).

  Raises:
    ValueError: if the database is empty, `n_farthest` < 1, or `xvector`
      is not a single embedding.
  """
  if database is None:
    database = xvector_database
  if len(database) == 0:
    raise ValueError("x-vector database is empty")
  if n_farthest < 1:
    raise ValueError("n_farthest must be at least 1")

  query = xvector.reshape(1, -1) if xvector.dim() == 1 else xvector
  if query.dim() != 2 or query.shape[0] != 1:
    # A batch of queries would yield an (N, B) distance matrix whose flattened
    # indices no longer address database rows.
    raise ValueError("xvector must have shape (1, D) or (D,)")

  # Euclidean distance from the source speaker to every database speaker.
  dists = torch.cdist(database, query, p=2).squeeze(1)

  # Indices of the most distant speakers, farthest first.
  k = min(n_farthest, len(database))
  farthest_idx = torch.topk(dists, k).indices

  # Average of the least similar speakers -> anonymized pseudo-speaker.
  anon_xvector1 = database[farthest_idx].mean(dim=0, keepdim=True)

  # The single most distant speaker (cloned so callers never alias the database).
  anon_xvector2 = database[farthest_idx[0]].unsqueeze(0).clone()

  # One randomly chosen database speaker, i.e. an embedding at a random distance.
  rand_idx = int(np.random.randint(0, len(database), 1)[0])
  anon_xvector3 = database[rand_idx].unsqueeze(0).clone()

  return anon_xvector1, anon_xvector2, anon_xvector3

In [11]:
# Derive the three anonymized embeddings from the extracted x-vector, in the
# order returned by `anonymize`: averaged-farthest, single farthest, random.
anon_xvector1, anon_xvector2, anon_xvector3 = anonymize(xvector=extracted_xvector)

# Synthesis

## Original signal

In [12]:
# Play the recording at the sample rate that describes `signal` (`fs`, set when
# the file was loaded) rather than a hard-coded 16000, which would change the
# pitch and speed whenever the two differ.
Audio(signal, rate=fs)

## Model

### Original speech resynthesis

In [13]:
# Tokenize the transcript for SpeechT5.
inputs = processor(
    text=transcription["text"],
    return_tensors="pt",
)

# Re-synthesize the utterance with the speaker's own (non-anonymized) x-vector.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=extracted_xvector,
    vocoder=vocoder,
)

Audio(data=speech, rate=16000)

### Distance anonymization

In [14]:
# Tokenize the transcript for SpeechT5.
inputs = processor(
    text=transcription["text"],
    return_tensors="pt",
)

# Synthesize with the first embedding returned by `anonymize`.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=anon_xvector1,
    vocoder=vocoder,
)

Audio(data=speech, rate=16000)

### Max distance speaker

In [15]:
# Tokenize the transcript for SpeechT5.
inputs = processor(
    text=transcription["text"],
    return_tensors="pt",
)

# Synthesize with the second embedding returned by `anonymize`.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=anon_xvector2,
    vocoder=vocoder,
)

Audio(data=speech, rate=16000)

### Speaker randomization

In [16]:
# Tokenize the transcript for SpeechT5.
inputs = processor(
    text=transcription["text"],
    return_tensors="pt",
)

# Synthesize with the third embedding returned by `anonymize`.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=anon_xvector3,
    vocoder=vocoder,
)

Audio(data=speech, rate=16000)

# Archive

## Archival anonymization methods



### Averaged randomization
Choose three random speaker embeddings and average them into an anonymous pseudo-speaker that lies at a random distance from the original speaker.

In [17]:
# Draw three database speakers at random (np.random.randint samples with
# replacement, so a speaker may repeat) and average them into one embedding.
randidx = np.random.randint(0, len(xvector_database), 3)

# Average directly in torch, keeping a leading batch axis. A dedicated name is
# used so this archived experiment does not overwrite `anon_xvector1`, which
# the "Distance anonymization" cell relies on.
anon_xvector_avg_rand = xvector_database[randidx].mean(dim=0, keepdim=True)

inputs = processor(text=transcription["text"], return_tensors="pt")
speech = model.generate_speech(inputs["input_ids"], anon_xvector_avg_rand, vocoder=vocoder)
Audio(speech, rate=16000)