<a href="https://colab.research.google.com/github/dodorlee1210/deepfake_audio_detection/blob/main/HybridAudio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Processing: LibriSpeech

In [1]:
import os
import urllib.request
import tarfile
import librosa
import soundfile as sf

In [2]:
# LibriSpeech URL [test-clean]
LIBRISPEECH_URL = "https://www.openslr.org/resources/12/test-clean.tar.gz"
DATA_DIR = "/content/real"
AUDIO_DIR = os.path.join(DATA_DIR, "LibriSpeech/test-clean")

# Download LibriSpeech test-clean dataset.
# The archive is fetched only when it is not already on disk, so re-running
# this cell does not download it again. It is written to a ".part" file first
# and renamed afterwards, so an interrupted download never leaves a truncated
# archive at tar_path.
os.makedirs(DATA_DIR, exist_ok=True)
tar_path = os.path.join(DATA_DIR, "test-clean.tar.gz")

if not os.path.exists(tar_path):
    partial_tar_path = tar_path + ".part"
    urllib.request.urlretrieve(LIBRISPEECH_URL, partial_tar_path)
    os.replace(partial_tar_path, tar_path)

# Extract files.
# Use the "data" extraction filter when this Python provides it: it rejects
# archive members that would be written outside DATA_DIR.
with tarfile.open(tar_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        tar.extractall(DATA_DIR, filter="data")
    else:
        tar.extractall(DATA_DIR)

# Convert FLAC to WAV
wav_dir = os.path.join(DATA_DIR, "wav")
os.makedirs(wav_dir, exist_ok=True)

# Collect (source FLAC, destination WAV) pairs; all WAVs go into one flat directory.
flac_files = []
for root, _, files in os.walk(AUDIO_DIR):
    for file in files:
        if file.endswith(".flac"):
            flac_path = os.path.join(root, file)
            # Swap only the extension, not any other ".flac" inside the name.
            wav_name = os.path.splitext(file)[0] + ".wav"
            wav_path = os.path.join(wav_dir, wav_name)
            flac_files.append((flac_path, wav_path))

for flac_path, wav_path in flac_files:
    # Load FLAC at its native sample rate (sr=None disables resampling)
    audio, sr = librosa.load(flac_path, sr=None)
    # Save as WAV
    sf.write(wav_path, audio, sr)

print(f"Saved in {wav_dir}")


Saved in /content/real/wav


# Manipulate Data: Deepfake

In [3]:
# https://pypi.org/project/pydub/
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [4]:
import librosa.effects
import numpy as np
import random
from pydub import AudioSegment

In [32]:
def apply_pitch_shift(audio_path, output_path, pitch_factor):
    """Write a pitch-shifted copy of an audio file.

    Args:
        audio_path: Path of the audio file to read (loaded at its native sample rate).
        output_path: Path the shifted audio is written to.
        pitch_factor: Number of steps to shift by, on a scale of 24 steps per octave.
    """
    samples, sample_rate = librosa.load(audio_path, sr=None)
    shifted = librosa.effects.pitch_shift(
        samples,
        sr=sample_rate,
        n_steps=pitch_factor,
        bins_per_octave=24,
    )
    sf.write(output_path, shifted, sample_rate)

def time_stretch(audio_path, output_path, stretch_factor=0.8):
    """Write a time-stretched copy of an audio file.

    Args:
        audio_path: Path of the audio file to read (loaded at its native sample rate).
        output_path: Path the stretched audio is written to.
        stretch_factor: Rate passed to librosa's time_stretch.
    """
    y, sr = librosa.load(audio_path, sr=None)
    # `rate` must be passed by keyword: it is keyword-only in recent librosa
    # releases, where a positional value raises TypeError.
    y_stretched = librosa.effects.time_stretch(y, rate=stretch_factor)
    sf.write(output_path, y_stretched, sr)

def add_noise(audio_path, output_path, noise_level=0.02):
    """Write a copy of an audio file with Gaussian white noise added.

    Args:
        audio_path: Path of the audio file to read (loaded at its native sample rate).
        output_path: Path the noisy audio is written to.
        noise_level: Scale applied to the standard-normal noise samples.
    """
    samples, sample_rate = librosa.load(audio_path, sr=None)
    # One standard-normal draw per input sample, scaled by noise_level.
    perturbation = noise_level * np.random.randn(len(samples))
    sf.write(output_path, samples + perturbation, sample_rate)

def add_reverb(audio_path, output_path, room_scale=50):
    """Write a filtered, faded copy of a WAV file as a stand-in for reverb.

    The audio is run through pydub's low-pass filter with ``room_scale`` as the
    cutoff argument, then a fade-in and a fade-out of 500 each are applied
    (NOTE(review): pydub durations are presumably milliseconds and the cutoff
    presumably Hz - confirm). No echo or convolution is performed.

    Args:
        audio_path: Path of the WAV file to read.
        output_path: Path the processed WAV file is written to.
        room_scale: Cutoff value handed to ``low_pass_filter``.
    """
    source = AudioSegment.from_file(audio_path, format="wav")
    filtered = source.low_pass_filter(room_scale)
    faded = filtered.fade_in(500).fade_out(500)
    faded.export(output_path, format="wav")

def create_deepfake_audio(original_audio_path, deepfake_output_path):
    """Simulate a deepfake by pitch-shifting a recording by a random amount.

    Only the pitch-shift stage is active; the noise and reverb stages
    (add_noise / add_reverb through temporary files) are currently disabled.

    Args:
        original_audio_path: Path of the genuine recording to transform.
        deepfake_output_path: Path the transformed audio is written to.
    """
    # Random shift between 5 and 20 steps, drawn afresh on every call.
    shift_steps = random.uniform(5, 20)
    apply_pitch_shift(
        original_audio_path,
        deepfake_output_path,
        pitch_factor=shift_steps,
    )

    print(f"Deepfake Saved at: {deepfake_output_path}")

# Hybrid

In [6]:
def split_audio(audio, segment_length, sample_rate=None):
    """Split a mono waveform into consecutive pydub segments.

    Args:
        audio: 1-D NumPy array of samples. Floating-point input (as returned
            by librosa.load) is treated as lying in [-1.0, 1.0] and converted
            to 16-bit PCM; integer input is passed through unchanged.
        segment_length: Length of each segment in milliseconds. The final
            segment may be shorter.
        sample_rate: Sample rate of ``audio`` in Hz. When omitted, the
            module-level ``sr`` is used, so existing two-argument callers
            keep working.

    Returns:
        A list of AudioSegment objects covering the input in order.
    """
    if sample_rate is None:
        # Fall back to the global sample rate published by the caller.
        sample_rate = sr

    samples = np.asarray(audio)
    if np.issubdtype(samples.dtype, np.floating):
        # pydub reads raw bytes as integer PCM. Handing it float bytes with
        # sample_width=4 would reinterpret them as int32 and produce garbage,
        # so convert to int16 first (clipping guards against overflow).
        samples = (np.clip(samples, -1.0, 1.0) * 32767.0).astype(np.int16)

    audio_segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,
    )
    # len() and slicing on an AudioSegment are both in milliseconds.
    return [
        audio_segment[start:start + segment_length]
        for start in range(0, len(audio_segment), segment_length)
    ]

def mix_segments(deepfake_seg, real_seg):
    """Overlay each real segment onto the deepfake segment at the same position.

    Segments are paired in order; pairing stops at the end of the shorter
    list, so any unmatched trailing segments are dropped.

    Args:
        deepfake_seg: Sequence of segments exposing ``overlay``.
        real_seg: Sequence of segments to overlay onto ``deepfake_seg``.

    Returns:
        A list with one overlaid segment per pair.
    """
    return [
        fake_part.overlay(real_part)
        for fake_part, real_part in zip(deepfake_seg, real_seg)
    ]

def create_hybrid_audio(deepfake_path, real_path, mixed_path, segment_length=2000):
    """Build a hybrid WAV by overlaying real audio onto deepfake audio segment by segment.

    Args:
        deepfake_path: Path of the deepfake audio; its native sample rate is used throughout.
        real_path: Path of the real audio; it is loaded at the deepfake's sample rate.
        mixed_path: Path the hybrid WAV file is written to.
        segment_length: Segment length in milliseconds passed to split_audio.
    """
    # split_audio looks up the sample rate from the module-level `sr`,
    # so it has to be published globally before the audio is split.
    global sr
    deepfake, sr = librosa.load(deepfake_path, sr=None)
    real, _ = librosa.load(real_path, sr=sr)

    mixed_audio = mix_segments(
        split_audio(deepfake, segment_length),
        split_audio(real, segment_length),
    )

    # Join the mixed segments back together in order, starting from silence of zero length.
    final_audio = AudioSegment.empty()
    for piece in mixed_audio:
        final_audio = final_audio + piece
    final_audio.export(mixed_path, format="wav")

# Load, Mix, Label: 1 file

In [33]:
# Process a single utterance. All three paths are derived from one ID so the
# real, fake and hybrid files always refer to the same recording.
UTTERANCE_ID = "1089-134686-0000"
REAL_WAV_DIR = "/content/real/wav"
FAKE_WAV_DIR = "/content/fake/wav"
HYBRID_WAV_DIR = "/content/hybrid/wav"

os.makedirs(FAKE_WAV_DIR, exist_ok=True)
os.makedirs(HYBRID_WAV_DIR, exist_ok=True)

REAL_AUDIO = os.path.join(REAL_WAV_DIR, f"{UTTERANCE_ID}.wav")
FAKE_AUDIO = os.path.join(FAKE_WAV_DIR, f"{UTTERANCE_ID}.wav")
HYBRID_AUDIO = os.path.join(HYBRID_WAV_DIR, f"{UTTERANCE_ID}.wav")

# Write the simulated deepfake version of the real recording to FAKE_AUDIO.
create_deepfake_audio(REAL_AUDIO, FAKE_AUDIO)

Deepfake Saved at: /content/fake/wav/1089-134686-0000.wav


In [31]:
# Overlay the real recording onto its deepfake counterpart and write the result to HYBRID_AUDIO.
create_hybrid_audio(FAKE_AUDIO, REAL_AUDIO, HYBRID_AUDIO)