In [4]:
import os

# Proof of Concept

The code in this notebook will be used to run different tests to identify what strategy would get us the best results in terms of imperceptible changes to audio that can poison music gen models. Each test will have either one or several experiments to determine the approach with the best results given each test's goal. 

### Overview

**Test1**:

- The goal of this test is to determine whether the proof of concept can be reached if we work on modifying the spectrogram domain. Each experiment is a different implementation of perturbations in the spectrogram domain. The goal is to try to find the implementation with the best results that allows us to add imperceptible perturbations in the spectrogram domain that can poison a music gen model

**Test2**:
- The goal of this test is to determine whether the proof of concept can be reached if we work on modifying the numpy representation of the audio inputs loaded with librosa. Each experiment is a different implementation of perturbations applied to the numpy representation. The goal is to try to find the implementation with the best results that allows us to add imperceptible perturbations applied to the numpy representation of a song that can poison a music gen model

**Test3**: 
- Same but in the raw audio domain 

**Test4**: 
- Find a way to change audio based on features 
- extract features 
- reconstruct audio from features 

All these experiments assume that a set of heavily altered vocals, generated by RVC from the input audio, already exists.

In [5]:
# Terminology: "perturbed" = slightly modified audio; "changed" = a completely different voice.

# Directory layout. Every sub-folder below is joined onto DATA_CHECKPOINT.
DATA_CHECKPOINT = '../data'
INPUT_SONG_CHECKPOINT = 'original_wavs'
CHANGED_SONG_CHECKPOINT = 'changed_wavs/mixes'      # full mixes whose vocals were completely changed
CHANGED_VOX_CHECKPOINT = 'changed_wavs/vocals'      # isolated vocals that were completely changed
PERTURBED_SONG_CHECKPOINT = 'perturbed_wavs/mixes'  # full mixes carrying the perturbation
PERTURBED_VOX_CHECKPOINT = 'perturbed_wavs/vocals'  # isolated vocals carrying the perturbation
SPLIT_SONG_CHECKPONT = 'split_wavs'                 # (sic) spelling kept: other cells use this name

# Stems available for the song under test
source_names = ['drums', 'bass', 'other', 'vocals']
song_name = 'Westy - KING OF THE NIGHT'

# File names (without extension) are derived from the song name
file_format = '.wav'
filename1 = f'{song_name}_{source_names[3]}'  # the separated vocals stem
filename2 = f'{song_name}_changed'            # the completely changed vocals
perturbed_output_filename = f'{song_name}_perturbed'

In [15]:
# Quick check of the folder-selection rule: "song" maps to the mixes folder,
# anything else maps to the vocals folder.
audio_type = "vocals"
if audio_type == "song":
    folder = PERTURBED_SONG_CHECKPOINT
else:
    folder = PERTURBED_VOX_CHECKPOINT
print(folder)

perturbed_wavs/vocals


In [19]:
def get_output_path(audio_type, test_number, experiment_number, perturbed_output_filename=perturbed_output_filename, file_format=file_format):
    """Build the output path for one experiment's perturbed audio.

    inputs:
    audio_type: "song" selects the perturbed-mixes folder; any other value selects
    the perturbed-vocals folder
    test_number: identifier inserted after "_test_" in the file name
    experiment_number: identifier inserted after "_exp" in the file name
    perturbed_output_filename: file name stem (defaults to the notebook-level value)
    file_format: file extension including the dot (defaults to the notebook-level value)

    returns:
    DATA_CHECKPOINT/<folder>/<stem>_test_<test_number>_exp<experiment_number><file_format>
    """
    if audio_type == "song":
        subdir = PERTURBED_SONG_CHECKPOINT
    else:
        subdir = PERTURBED_VOX_CHECKPOINT

    name_parts = (
        perturbed_output_filename,
        f"_test_{test_number}",
        f"_exp{experiment_number}",
        file_format,
    )
    return os.path.join(DATA_CHECKPOINT, subdir, "".join(name_parts))

In [6]:
# Inputs: the separated clean vocals and their completely changed counterpart
clean_input_audio = os.path.join(DATA_CHECKPOINT, SPLIT_SONG_CHECKPONT, f"{filename1}{file_format}")
changed_input_audio = os.path.join(DATA_CHECKPOINT, CHANGED_VOX_CHECKPOINT, f"{filename2}{file_format}")

# Default outputs: the same file name placed in the vocals folder and in the mixes folder
perturbed_vox_output_audio = os.path.join(DATA_CHECKPOINT, PERTURBED_VOX_CHECKPOINT, f"{perturbed_output_filename}{file_format}")
perturbed_mix_output_audio = os.path.join(DATA_CHECKPOINT, PERTURBED_SONG_CHECKPOINT, f"{perturbed_output_filename}{file_format}")

In [8]:
# Fail fast, naming the missing file, before any experiment tries to load it
assert os.path.exists(clean_input_audio), f"Clean input audio not found: {clean_input_audio}"
assert os.path.exists(changed_input_audio), f"Changed input audio not found: {changed_input_audio}"

## Test 1

This test will focus on the **vocals** only. The input will be westy's vocals and the changed westy's vocals to ice spice. The output will be vocals that were created by placing some sort of "ice spice" envelope on the westy's vocals. This will be done on the spectrogram itself. 

## Overview
---

**Experiment 1**: change the entire audio based on a linear combination of the spectrograms of the input audio. The importance of each song is specified through the parameter overlay_weight. 

---

### Experiment 1

1) preprocess audio (normalize the input, truncate so they are exactly the same length)
2) Extract and prepare audio features. This experiment works in the STFT spectrogram domain
3) Blend the changes in the spectrogram domain based on the overlay_weight parameter
4) Convert the blended spectrogram back to the time domain

In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf

# --- Inputs and hyperparameters ---
song1_path = clean_input_audio    # the original (clean) audio
song2_path = changed_input_audio  # the same audio with different vocals

overlay_weight = 0.2  # share of song2 in the blend; song1 receives (1 - overlay_weight)
output_path = get_output_path("vocals", test_number="1", experiment_number="1")

print("[INFO] HYPERPARAMETERS:\n")
print(f"overlay_weight = {overlay_weight}")
print()

# --- Step 1: load both recordings ---
# sr=None keeps song1's native sample rate; song2 is then loaded at that same rate
song1, sr = librosa.load(song1_path, sr=None)
song2, _ = librosa.load(song2_path, sr=sr)

print("[INFO] INPUTS:\n")
print(f"Original audio found at {song1_path}")
print(f"Corresponding changed audio found at {song2_path}")
print(f"Sample Rate: {sr}")
print()

# Truncate both signals to the shorter one so they can be combined element-wise
min_length = min(len(song1), len(song2))
song1, song2 = song1[:min_length], song2[:min_length]

# Step 2: Convert to spectrogram (frequency domain)
def compute_spectrogram(audio, sr, n_fft=2048, hop_length=512):
    """Compute the complex STFT of a signal and its magnitude in decibels.

    inputs:
    audio: time-domain signal passed to librosa.stft
    sr: sample rate; accepted for call-site symmetry but not used by the computation
    n_fft: FFT window size passed to librosa.stft (default 2048)
    hop_length: hop between frames passed to librosa.stft (default 512). Anything that
    later inverts or plots the result must use the same hop_length

    returns:
    (S, S_db): the complex STFT matrix, and its magnitude converted to dB relative to
    the matrix maximum (ref=np.max)
    """
    S = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
    return S, S_db

print("[INFO] Generating spectrograms for each song...:\n")

# Compute spectrograms for both songs (complex STFT plus its dB magnitude)
S1, S1_db = compute_spectrogram(song1, sr)
S2, S2_db = compute_spectrogram(song2, sr)

print("Spectrograms generated successfully!\n")

def plot_spectrogram(S_db, title):
    """Display a dB-scaled spectrogram on a log-frequency axis.

    inputs:
    S_db: spectrogram in decibels, as returned by compute_spectrogram
    title: figure title

    Reads the notebook-level variable `sr` for the time axis and assumes a hop length
    of 512, matching compute_spectrogram's default.
    """
    # librosa.display is a submodule that `import librosa` alone does not expose on
    # every librosa version; importing it here keeps the function self-sufficient.
    import librosa.display

    plt.figure(figsize=(10, 6))
    librosa.display.specshow(S_db, sr=sr, hop_length=512, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.show()
# Visualize both spectrograms (comment these two calls out to skip plotting)
plot_spectrogram(S1_db, 'Original Song Spectrogram')
plot_spectrogram(S2_db, 'Overlay Song Spectrogram')

# Step 4: Blending in the frequency domain (spectrogram domain)
# Uniform linear mix of the two complex STFTs: every time-frequency bin uses the same weights
blended_spectrogram = (1 - overlay_weight) * S1 + overlay_weight * S2

# Step 5: Convert blended spectrogram back to time domain
# Without `length`, the reconstruction can come out a few samples shorter than the input
# (fewer than one hop); pinning it keeps the output sample-aligned with song1.
blended_audio = librosa.istft(blended_spectrogram, hop_length=512, length=len(song1))

# Step 6: Save the output
sf.write(output_path, blended_audio, sr)
print(f'Perturbed song saved to {output_path}')


---

## Test 2

This test will focus on the **vocals** only. The input will be westy's vocals and the changed westy's vocals to ice spice. The output will be vocals that were created by placing some sort of "ice spice" envelope on the westy's vocals. This will be done by loading the song using librosa and converting it to a numpy array. We will segment the song based on specified segment_length and we will experiment with different sampling techniques when choosing the segments to change.

### Overview 
---

**Experiment1**: segment the audio input into segments of length segment_length, loop over every segment and change it based on a linear combination of both input audios for that segment. For a given segment the value of the segment will be computed as follows: new_segment = (1 - overlay_weight) x segment_in_song1 + overlay_weight x segment_in_song2

**Experiment2**: Instead of looping through all segments computing new segments based on a linear combination, we will randomly sample a percentage specified by the hyperparameter selected_segments_size from the total number of segments. We will then only modify the sampled segments and not all possible segments. If selected_segments_size is set to 1, we will get the output of experiment 1.

**Experiment3**: Instead of changing all of the selected segment, we now only change the initial portion of the segment. The size of the portion is specified through the hyperparameter segment_pct, as a fraction of the segment length. We will then only modify the selected portion of the segment. If segment_pct is set to 1, we will get the output of experiment 2. 

---

### Experiment 1

1) preprocess audio (normalize the input, truncate so they are exactly the same length)
2) load the audio inputs using librosa and change them into numpy arrays
3) Split the audio inputs into segments of length segment_length
4) Loop over the selected segments and overlay the audio based on the overlay_weight:
new_segment = (1 - overlay_weight) x segment_in_song1 + overlay_weight x segment_song2
5) Save the blended numpy array (which is already in the time domain) as an audio file

In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf

def perturb_audio1(clean_audio_path, changed_audio_path, perturbed_audio_path, segment_length, overlay_weight):
    """Blend every full segment of the clean audio with the changed audio and save it.

    Both files are loaded with librosa (the changed audio at the clean audio's sample
    rate) and truncated to the shorter of the two. The audio is split into consecutive
    segments of segment_length seconds and each full segment is replaced by
    (1 - overlay_weight) * clean_segment + overlay_weight * changed_segment.
    A trailing remainder shorter than one segment is left unchanged.

    inputs:
    clean_audio_path: path to the original audio
    changed_audio_path: path to the audio with different vocals
    perturbed_audio_path: path the blended audio is written to
    segment_length: number of seconds per segment; must be positive
    overlay_weight: weight given to the changed audio in each blended segment

    raises:
    ValueError: if segment_length is not positive or is shorter than one sample
    """
    if segment_length <= 0:
        raise ValueError(f"segment_length must be positive, got {segment_length}")

    print("[INFO] HYPERPARAMETERS:\n")
    print(f"segment_length = {segment_length}")
    print(f"overlay_weight = {overlay_weight}")
    print()

    # Step 1: Load the audio files
    song1_path = clean_audio_path  # Path to the original song
    song2_path = changed_audio_path  # Path to the song with different vocals

    song1, sr = librosa.load(song1_path, sr=None)
    song2, _ = librosa.load(song2_path, sr=sr)  # Ensure same sample rate
    print("[INFO] INPUTS:\n")
    print(f"Original audio found at {song1_path}")
    print(f"Corresponding changed audio found at {song2_path}")
    print(f"Sample Rate: {sr}")
    print()

    # Ensure both songs are the same length
    min_length = min(len(song1), len(song2))
    song1 = song1[:min_length]
    song2 = song2[:min_length]

    # Work in whole samples so consecutive segments are contiguous; mixing float start
    # offsets with an integer segment size can leave one-sample gaps between segments.
    segment_samples = int(segment_length * sr)
    if segment_samples < 1:
        raise ValueError(
            f"segment_length={segment_length} is shorter than one sample at sample rate {sr}"
        )

    print("[INFO] Retrieving Number of Segments\n")
    # Step 2: Divide song1 into segments and overlay song2 segments
    num_segments = len(song1) // segment_samples
    print(f"SONG LENGTH: {len(song1)}")
    print(f"NUMBER OF SEGMENTS OF {segment_length} FOUND: {num_segments}")

    # Initialize the blended song with a copy of song1
    blended_audio = np.copy(song1)

    for i in range(num_segments):
        # Sample range covered by the current segment
        start_sample = i * segment_samples
        end_sample = start_sample + segment_samples

        # Blending in time domain
        blended_audio[start_sample:end_sample] = (
            (1 - overlay_weight) * song1[start_sample:end_sample]
            + overlay_weight * song2[start_sample:end_sample]
        )

    # Save the output
    sf.write(perturbed_audio_path, blended_audio, sr)
    print(f'Perturbed song saved to {perturbed_audio_path}')


In [None]:
# Test 2 / Experiment 1: blend every full 2-second segment with a 20% overlay
experiment1_output_path = get_output_path("vocals", "2", "1")
perturb_audio1(
    clean_audio_path=clean_input_audio,
    changed_audio_path=changed_input_audio,
    perturbed_audio_path=experiment1_output_path,
    segment_length=2.0,
    overlay_weight=0.2,
)

### Experiment 2

1) preprocess audio (normalize the input, truncate so they are exactly the same length)
2) Split the original audio into segments of length segment_length (hyperparameter)
3) use the hyperparameter selected_segments_size to determine the percentage of segments that will be changed. If the input has 1000 segments and percentage was set to 0.15, only 150 segments will be changed (chosen randomly)
4) Loop over the selected segments and overlay the audio based on the overlay_weight:
new_segment = (1 - overlay_weight) x segment_in_song1 + overlay_weight x segment_song2
5) Convert the blended np array back to the time domain

In [31]:
import random
def perturb_audio2(clean_audio_path, changed_audio_path, perturbed_audio_path, segment_length, overlay_weight, selected_segments_size, seed=None):
    """Blend a random subset of segments of the clean audio with the changed audio and save it.

    Both files are loaded with librosa (the changed audio at the clean audio's sample
    rate) and truncated to the shorter of the two. Only the sampled segments are
    replaced by (1 - overlay_weight) * clean_segment + overlay_weight * changed_segment;
    everything else, including a trailing remainder shorter than one segment, is left
    unchanged.

    inputs:
    clean_audio_path: path to the original audio
    changed_audio_path: path to the audio with different vocals
    perturbed_audio_path: path the blended audio is written to
    segment_length: number of seconds per segment; must be positive
    overlay_weight: weight for song2 overlay. Higher values will give more weight to the changed_audio_path
    in the segments overlay step (audio more similar to the changed audio input)
    selected_segments_size: fraction (0 to 1) of segments that will be changed. If the audio has 1000 time
    segments and selected_segments_size was set to 0.15, then only 150 time segments will be changed and
    overlayed. The remaining 850 time segments will remain unchanged
    seed: optional seed for the segment sampling. With the default (None) the global `random`
    module state is used, as before; pass a value to make the selection reproducible

    raises:
    ValueError: if segment_length is not positive or shorter than one sample, or if
    selected_segments_size is outside [0, 1]
    """
    if segment_length <= 0:
        raise ValueError(f"segment_length must be positive, got {segment_length}")
    if not 0 <= selected_segments_size <= 1:
        raise ValueError(
            f"selected_segments_size must be between 0 and 1, got {selected_segments_size}"
        )

    # Configuration hyperparameters
    output_path = perturbed_audio_path

    print("[INFO] HYPERPARAMETERS:\n")
    print(f"segment_length = {segment_length}")
    print(f"overlay_weight = {overlay_weight}")
    print(f"selected_segments_size = {selected_segments_size}")
    print()

    # Step 1: Load the audio files
    song1_path = clean_audio_path  # Path to the original song
    song2_path = changed_audio_path  # Path to the song with different vocals

    song1, sr = librosa.load(song1_path, sr=None)
    song2, _ = librosa.load(song2_path, sr=sr)  # Ensure same sample rate
    print("[INFO] INPUTS:\n")
    print(f"Original audio found at {song1_path}")
    print(f"Corresponding changed audio found at {song2_path}")
    print(f"Sample Rate: {sr}")
    print()

    # Ensure both songs are the same length
    min_length = min(len(song1), len(song2))
    song1 = song1[:min_length]
    song2 = song2[:min_length]

    # Work in whole samples so segment boundaries never overlap or leave gaps
    segment_samples = int(segment_length * sr)
    if segment_samples < 1:
        raise ValueError(
            f"segment_length={segment_length} is shorter than one sample at sample rate {sr}"
        )

    print("[INFO] Retrieving Number of Segments\n")
    # Step 2: Divide song1 into segments
    num_segments = len(song1) // segment_samples
    print(f"SONG LENGTH: {len(song1)}")
    print(f"NUMBER OF SEGMENTS OF {segment_length} COMPUTED: {num_segments}")
    print()

    # Initialize the blended song with a copy of song1
    blended_audio = np.copy(song1)

    print("[INFO] Sampling segments to perturb...\n")
    # A dedicated generator is only created when a seed is given, so the default
    # behaviour still follows the global random state.
    rng = random if seed is None else random.Random(seed)
    selected_segments = rng.sample(range(num_segments), int(num_segments * selected_segments_size))
    print(f"TOTAL TIME SEGMENTS: {num_segments}")
    print(f"SEGMENTS SELECTED: {len(selected_segments)}")
    print()

    for i in selected_segments:
        # Sample range covered by the current segment
        start_sample = i * segment_samples
        end_sample = start_sample + segment_samples

        # Blending in time domain
        blended_audio[start_sample:end_sample] = (
            (1 - overlay_weight) * song1[start_sample:end_sample]
            + overlay_weight * song2[start_sample:end_sample]
        )

    # Save the output
    sf.write(output_path, blended_audio, sr)
    print(f'Perturbed song saved to {output_path}')

In [34]:
# Test 2 / Experiment 2: blend a random 15% of the 2-second segments with a 20% overlay
experiment2_output_path = get_output_path("vocals", "2", "2")
perturb_audio2(
    clean_input_audio,
    changed_input_audio,
    experiment2_output_path,
    segment_length=2.0,
    overlay_weight=0.2,
    selected_segments_size=0.15,
)

[INFO] HYPERPARAMETERS:

segment_length = 2.0
overlay_weight = 0.2

[INFO] INPUTS:

Original audio found at ../data/split_wavs/Westy - KING OF THE NIGHT_vocals.wav
Corresponding changed audio found at ../data/changed_wavs/vocals/Westy - KING OF THE NIGHT_changed.wav
Sample Rate: 44100

[INFO] Retrieving Number of Segments

SONG LENGTH: 7990156
NUMBER OF SEGMENTS OF 2.0 COMPUTED: 90

[INFO] Sampling segments to perturb...

TOTAL TIME SEGMENTS: 90
SEGMENTS SELECTED: 13

Perturbed song saved to ../data/perturbed_wavs/vocals/Westy - KING OF THE NIGHT_perturbed_exp2.wav


### Experiment 3

1) preprocess audio (normalize the input, truncate so they are exactly the same length)
2) Split the original audio into segments of length segment_length (hyperparameter)
3) use the hyperparameter selected_segments_size to determine the percentage of segments that will be changed. If the input has 1000 segments and percentage was set to 0.15, only 150 segments will be changed (chosen randomly)
4) Loop over the selected segments and select a portion of the segment to modify. The size of the portion is determined by the hyperparameter segment_pct. If segment_pct is set to 1, we will get the output of Experiment 2
5) overlay the audio for the selected portion based on the overlay_weight:
new_portion = (1 - overlay_weight) x portion_in_song1 + overlay_weight x portion_in_song2
6) Convert the blended np array back to the time domain

In [35]:
import random
def perturb_audio3(
        clean_audio_path, 
        changed_audio_path, 
        perturbed_audio_path, 
        segment_length, 
        overlay_weight, 
        selected_segments_size,
        segment_pct,
        seed=None):
    """Blend only the leading portion of a random subset of segments and save the result.

    Both files are loaded with librosa (the changed audio at the clean audio's sample
    rate) and truncated to the shorter of the two. For each sampled segment, only its
    first segment_pct fraction is replaced by
    (1 - overlay_weight) * clean_portion + overlay_weight * changed_portion.
    Everything else is left unchanged.

    inputs:
    clean_audio_path: path to the original audio
    changed_audio_path: path to the audio with different vocals
    perturbed_audio_path: path the blended audio is written to
    segment_length: number of seconds per segment; must be positive
    overlay_weight: weight for song2 overlay. Higher values will give more weight to the changed_audio_path
    in the segments overlay step (audio more similar to the changed audio input)
    selected_segments_size: fraction (0 to 1) of segments that will be changed. If the audio has 1000 time
    segments and selected_segments_size was set to 0.15, then only 150 time segments will be changed and
    overlayed. The remaining 850 time segments will remain unchanged
    segment_pct: fraction (0 to 1) of each selected segment that is changed, measured from the segment
    start. For segment_length=2.0 and segment_pct=0.10, the first 200ms of the segment is changed
    seed: optional seed for the segment sampling. With the default (None) the global `random`
    module state is used, as before; pass a value to make the selection reproducible

    raises:
    ValueError: if segment_length is not positive or shorter than one sample, or if
    selected_segments_size or segment_pct is outside [0, 1]
    """
    if segment_length <= 0:
        raise ValueError(f"segment_length must be positive, got {segment_length}")
    if not 0 <= selected_segments_size <= 1:
        raise ValueError(
            f"selected_segments_size must be between 0 and 1, got {selected_segments_size}"
        )
    if not 0 <= segment_pct <= 1:
        # A value above 1 would make the portion spill into the following segment
        raise ValueError(f"segment_pct must be between 0 and 1, got {segment_pct}")

    # Configuration hyperparameters
    output_path = perturbed_audio_path

    print("[INFO] HYPERPARAMETERS:\n")
    print(f"segment_length = {segment_length}")
    print(f"overlay_weight = {overlay_weight}")
    print(f"selected_segments_size = {selected_segments_size}")
    print(f"segment_pct = {segment_pct}")
    print()

    # Step 1: Load the audio files
    song1_path = clean_audio_path  # Path to the original song
    song2_path = changed_audio_path  # Path to the song with different vocals

    song1, sr = librosa.load(song1_path, sr=None)
    song2, _ = librosa.load(song2_path, sr=sr)  # Ensure same sample rate
    print("[INFO] INPUTS:\n")
    print(f"Original audio found at {song1_path}")
    print(f"Corresponding changed audio found at {song2_path}")
    print(f"Sample Rate: {sr}")
    print()

    # Ensure both songs are the same length
    min_length = min(len(song1), len(song2))
    song1 = song1[:min_length]
    song2 = song2[:min_length]

    # Work in whole samples so segment boundaries never overlap or leave gaps
    segment_samples = int(segment_length * sr)
    if segment_samples < 1:
        raise ValueError(
            f"segment_length={segment_length} is shorter than one sample at sample rate {sr}"
        )
    # Number of leading samples of each selected segment that get blended
    portion_length = int(segment_pct * segment_length * sr)

    print("[INFO] Retrieving Number of Segments\n")
    # Step 2: Divide song1 into segments
    num_segments = len(song1) // segment_samples
    print(f"SONG LENGTH: {len(song1)}")
    print(f"NUMBER OF SEGMENTS OF {segment_length} COMPUTED: {num_segments}")
    print()

    # Initialize the blended song with a copy of song1
    blended_audio = np.copy(song1)

    print("[INFO] Sampling segments to perturb...\n")
    # A dedicated generator is only created when a seed is given, so the default
    # behaviour still follows the global random state.
    rng = random if seed is None else random.Random(seed)
    selected_segments = rng.sample(range(num_segments), int(num_segments * selected_segments_size))
    print(f"TOTAL TIME SEGMENTS: {num_segments}")
    print(f"SEGMENTS SELECTED: {len(selected_segments)}")
    print()

    for i in selected_segments:
        # Only the leading portion of the segment is modified
        start_sample = i * segment_samples
        end_portion = start_sample + portion_length

        # Blending in time domain
        blended_audio[start_sample:end_portion] = (
            (1 - overlay_weight) * song1[start_sample:end_portion]
            + overlay_weight * song2[start_sample:end_portion]
        )

    #Save the output
    sf.write(output_path, blended_audio, sr)
    print(f'Perturbed song saved to {output_path}')

In [36]:
# Test 2 / Experiment 3: blend the first 10% of a random 15% of the 2-second segments
experiment3_output_path = get_output_path("vocals", "2", "3")
perturb_audio3(
    clean_input_audio,
    changed_input_audio,
    experiment3_output_path,
    segment_length=2.0,
    overlay_weight=0.2,
    selected_segments_size=0.15,
    segment_pct=0.10,
)

[INFO] HYPERPARAMETERS:

segment_length = 2.0
overlay_weight = 0.2

[INFO] INPUTS:

Original audio found at ../data/split_wavs/Westy - KING OF THE NIGHT_vocals.wav
Corresponding changed audio found at ../data/changed_wavs/vocals/Westy - KING OF THE NIGHT_changed.wav
Sample Rate: 44100

[INFO] Retrieving Number of Segments

SONG LENGTH: 7990156
NUMBER OF SEGMENTS OF 2.0 COMPUTED: 90

[INFO] Sampling segments to perturb...

TOTAL TIME SEGMENTS: 90
SEGMENTS SELECTED: 13

Perturbed song saved to ../data/perturbed_wavs/vocals/Westy - KING OF THE NIGHT_perturbed_exp3.wav


---