<a href="https://colab.research.google.com/github/kuba-cherryb/voice-anonymization-EN-PL/blob/main/voice_anonymization_PL_CMU_ARCTIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<center><font size="4"><b>Speech technology 2024</b></font></center>
<br />
<center><font size="6"><b><u>Project 2: Speech Anonymization.</u></b></font></center>
<center>Jakub Czernecki, Wojciech Sabała, Bartosz Wąsik, 01/2025</center>

# Environment setup

## Python modules

In [None]:
import os

!git clone https://github.com/kuba-cherryb/voice-anonymization-EN-PL
%cd voice-anonymization-EN-PL
!pip install datasets
!pip install speechbrain
!pip install git+https://github.com/openai/whisper.git

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset, Audio
import torch
import soundfile as sf
from IPython.display import Audio
from speechbrain.inference.speaker import EncoderClassifier
import torchaudio
import numpy as np
import warnings
import whisper
warnings.filterwarnings("ignore")

## Pretrained models

### Speech to Text - [Whisper](https://github.com/openai/whisper)

In [None]:
# Speech-to-text model: the "turbo" Whisper checkpoint.
transcriptor = whisper.load_model(name="turbo")

### x-vector extractor - [SpeechBrain Speaker Verification with xvector embeddings on Voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb)

In [None]:
# Speaker encoder used further down to turn a waveform into an x-vector.
classifier = EncoderClassifier.from_hparams(
    savedir="pretrained_models/spkrec-xvect-voxceleb",
    source="speechbrain/spkrec-xvect-voxceleb",
)


### Synthesizer - [SpeechT5](https://huggingface.co/blog/speecht5), [fine-tuned to Polish](https://huggingface.co/kuba-cherryb/speecht5_tts_voxpopuli_pl_v4)

In [None]:
# Checkpoints of the text-to-speech stack. The processor and the vocoder are
# the stock Microsoft ones; only the acoustic model is the Polish fine-tune.
# Because the processor is the stock one, the transcript has to be rewritten
# into characters it knows before synthesis (see the text clean-up section).
TTS_PROCESSOR_CHECKPOINT = "microsoft/speecht5_tts"
TTS_MODEL_CHECKPOINT = "kuba-cherryb/speecht5_tts_voxpopuli_pl_v4"
TTS_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

processor = SpeechT5Processor.from_pretrained(TTS_PROCESSOR_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(TTS_VOCODER_CHECKPOINT)

## x-vector database - [CMU ARCTIC](https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors)

In [None]:
# Reference pool of speaker embeddings: the "xvector" column of the
# "validation" split of the CMU ARCTIC x-vector dataset, held as one tensor.
xvector_database = torch.tensor(
    load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")["xvector"]
)

# Analysis

## Speech to text

In [None]:
# Run Whisper on the demo recording; the transcript is stored under the
# "text" key of the returned dict.
transcription = transcriptor.transcribe(audio="resynthesis_pl.wav")
print(transcription["text"])

## Text cleanup

The tokenizer does not recognize foreign symbols (here: Polish diacritics) and replaces them with `<unk>`, so each of them has to be mapped to a token the tokenizer does recognize. Digits are not recognized by the model either, so they are replaced as well, with their Polish names.

In [None]:
# Ordered (source, target) substitutions applied to the lower-cased transcript.
#
# * Polish diacritics are mapped to stand-in characters. Three of the
#   stand-ins are UPPER-CASE ('X', 'Q', 'V') and must stay that way:
#   'Q'/'q' separate 'ń' from 'ź', and 'V'/'v' separate 'ś' from 'ż'.
# * Digits are spelled out in Polish, already written with the stand-in
#   alphabet (e.g. 'piéê' = "pięć", 'szeVê' = "sześć"). Replacement is
#   per digit with no separator, so "12" becomes "jedendwa".
replacements = [
    ('ó', 'œ'),
    ('ą', 'æ'),
    ('ć', 'ê'),
    ('ę', 'é'),
    ('ł', 'X'),
    ('ń', 'Q'),
    ('ś', 'V'),
    ('ź', 'q'),
    ('ż', 'v'),
    ('1', 'jeden'),
    ('2', 'dwa'),
    ('3', 'trzy'),
    ('4', 'cztery'),
    ('5', 'piéê'),
    ('6', 'szeVê'),
    ('7', 'siedem'),
    ('8', 'osiem'),
    ('9', 'dziewiéê'),
    ('0', 'zero'),
    ('%', 'procent'),
]


def cleanup_text(inputs):
    """Rewrite ``inputs["text"]`` into the character set used for synthesis.

    The text is lower-cased once and then every ``(src, dst)`` pair from
    ``replacements`` is applied in order.

    Lower-casing must happen before the loop, not on every pass: several
    targets are upper-case placeholders, and lower-casing again after they
    have been inserted would merge 'ń' with 'ź' and 'ś' with 'ż'. For the
    same reason the function should be applied only once to a given text.

    Args:
        inputs: Mapping with a ``"text"`` entry holding the raw transcript.

    Returns:
        The same ``inputs`` object, with ``"text"`` replaced in place.
    """
    text = inputs["text"].lower()
    for src, dst in replacements:
        text = text.replace(src, dst)
    inputs["text"] = text
    return inputs

In [None]:
# cleanup_text rewrites transcription["text"] in place and returns the same
# dict, so after this cell the raw Whisper transcript is no longer available
# under this name; the printed text is what the synthesizer will receive.
transcription = cleanup_text(transcription)
print(transcription["text"])

## x-vector extraction

In [None]:
# The SpeechBrain x-vector encoder expects 16 kHz audio and encode_batch takes
# the tensor as given, so resample the recording first if it uses another rate.
XVECTOR_SAMPLE_RATE = 16000

signal, fs = torchaudio.load('resynthesis_pl.wav')
if fs != XVECTOR_SAMPLE_RATE:
    signal = torchaudio.functional.resample(
        signal, orig_freq=fs, new_freq=XVECTOR_SAMPLE_RATE
    )
    fs = XVECTOR_SAMPLE_RATE  # keep `fs` describing the data held in `signal`

# Only the first channel is encoded; squeeze(0) drops the leading size-1
# dimension of the encoder output (assumed to leave a (1, D) embedding,
# which is what anonymize() and generate_speech() are given below).
extracted_xvector = classifier.encode_batch(signal[0]).squeeze(0)

# Anonymization

In [None]:
def anonymize(xvector, database=None, n_farthest=3, rng=None):
  """Derive three replacement speaker embeddings for ``xvector``.

  The Euclidean distance between ``xvector`` and every row of the reference
  database is computed, and three alternative embeddings are built:

  1. the mean of the ``n_farthest`` most distant database entries,
  2. the single most distant database entry,
  3. one database entry drawn uniformly at random. No distance constraint is
     applied, so this one may happen to be close to the original speaker.

  Args:
    xvector: Speaker embedding to anonymize. Anything that flattens to a
      single vector of the database's embedding size, e.g. shape ``(1, D)``
      or ``(D,)``.
    database: Optional ``(N, D)`` tensor of reference embeddings. Defaults to
      the notebook-level ``xvector_database``.
    n_farthest: Number of most distant entries averaged for the first result.
      Values larger than the database size are clamped to it.
    rng: Optional ``numpy.random.Generator`` used for the random pick, for
      reproducible runs. When omitted, NumPy's global random state is used.

  Returns:
    Tuple ``(anon_xvector1, anon_xvector2, anon_xvector3)``; each is a new
    tensor of shape ``(1, D)``.

  Raises:
    ValueError: If the database is not a non-empty 2-D tensor, if
      ``n_farthest`` is smaller than 1, or if ``xvector`` does not have the
      database's embedding size.
  """
  if database is None:
    database = xvector_database
  database = torch.as_tensor(database)
  if database.ndim != 2 or database.shape[0] == 0:
    raise ValueError("database must be a non-empty (N, D) tensor")
  if n_farthest < 1:
    raise ValueError("n_farthest must be at least 1")

  query = torch.as_tensor(
      xvector, dtype=database.dtype, device=database.device
  ).reshape(1, -1)
  if query.shape[1] != database.shape[1]:
    raise ValueError("xvector size does not match the database embedding size")

  # Distance from the query to every database speaker, as a flat (N,) vector.
  distances = torch.cdist(database, query, p=2).squeeze(1)

  # Indices of the most distant speakers, farthest first.
  top_k = min(n_farthest, database.shape[0])
  farthest_idx = torch.topk(distances, top_k).indices

  # 1) Average of the least similar speakers.
  anon_xvector1 = database[farthest_idx].mean(dim=0, keepdim=True)

  # 2) The single least similar speaker (tensor indexing returns a copy).
  anon_xvector2 = database[farthest_idx[:1]]

  # 3) A uniformly random speaker from the database.
  if rng is None:
    rand_idx = int(np.random.randint(0, database.shape[0]))
  else:
    rand_idx = int(rng.integers(0, database.shape[0]))
  anon_xvector3 = database[rand_idx].clone().unsqueeze(0)

  return anon_xvector1, anon_xvector2, anon_xvector3

In [None]:
# Three anonymized embeddings for the recording's speaker:
#   anon_xvector1 - mean of the three most distant database speakers
#   anon_xvector2 - the single most distant database speaker
#   anon_xvector3 - a randomly chosen database speaker
(
    anon_xvector1,
    anon_xvector2,
    anon_xvector3,
) = anonymize(xvector=extracted_xvector)

# Synthesis

## Original signal

In [None]:
# Play the recording at the sample rate that belongs to `signal` (the `fs`
# returned when it was loaded) instead of assuming 16 kHz.
Audio(signal, rate=fs)

## Model

### Original speech resynthesis

In [None]:
# Baseline: synthesize the cleaned transcript with the speaker's own x-vector.
inputs = processor(text=transcription["text"], return_tensors="pt")
speech = model.generate_speech(
    input_ids=inputs["input_ids"],
    speaker_embeddings=extracted_xvector,
    vocoder=vocoder,
)
Audio(speech, rate=16000)

### Distance anonymization

In [None]:
# Same transcript, voiced with the mean of the three most distant speakers.
inputs = processor(text=transcription["text"], return_tensors="pt")
speech = model.generate_speech(
    input_ids=inputs["input_ids"],
    speaker_embeddings=anon_xvector1,
    vocoder=vocoder,
)
Audio(speech, rate=16000)

### Max distance speaker

In [None]:
# Same transcript, voiced with the single most distant database speaker.
inputs = processor(text=transcription["text"], return_tensors="pt")
speech = model.generate_speech(
    input_ids=inputs["input_ids"],
    speaker_embeddings=anon_xvector2,
    vocoder=vocoder,
)
Audio(speech, rate=16000)

### Speaker randomization

In [None]:
# Same transcript, voiced with a randomly selected database speaker.
inputs = processor(text=transcription["text"], return_tensors="pt")
speech = model.generate_speech(
    input_ids=inputs["input_ids"],
    speaker_embeddings=anon_xvector3,
    vocoder=vocoder,
)
Audio(speech, rate=16000)