# Deepfake detection using wav2vec 2.0 via HuggingFace

Install necessary packages

In [1]:
!pip install datasets
!pip install transformers
!pip install librosa
!pip install evaluate
!pip install kagglehub

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

Our dataset comes from kaggle; we can download it using the kagglehub API.

In [2]:
import kagglehub

# Fetch the latest published version of the dataset; `path` is the local directory
# kagglehub extracted it into.
DATASET_HANDLE = "birdy654/deep-voice-deepfake-voice-recognition"
path = kagglehub.dataset_download(DATASET_HANDLE)

Downloading from https://www.kaggle.com/api/v1/datasets/download/birdy654/deep-voice-deepfake-voice-recognition?dataset_version_number=2...


100%|██████████| 3.69G/3.69G [00:33<00:00, 117MB/s]


Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/birdy654/deep-voice-deepfake-voice-recognition/versions/2


The dataset comes with two resources: a .csv containing metadata for each second of the audio, and the raw audio files. The original paper focuses on classification using the metadata features. Here we're going to use the raw audio directly via wav2vec2, so we need to process the audio.

For this POC, we take the first 3 files from each of the `REAL` and `FAKE` directories. We split each audio file into 1-second chunks (`split_audio_into_seconds`), and shuffle the chunks together.

In [4]:
import librosa
import numpy as np
import random
import os
# Root of the extracted Kaggle archive (the AUDIO/ directory lives under it). Derived from
# the location kagglehub returned rather than a hardcoded cache path, so the notebook keeps
# working if the cache directory or the dataset version changes. The trailing "" makes
# os.path.join end the result with a path separator, which the string concatenations that
# build file paths from `r` rely on.
r = os.path.join(path, "KAGGLE", "")

def split_audio_into_seconds(audio_path, duration=1.0, sample_rate=16_000):
    """Load an audio file and cut it into consecutive fixed-length chunks.

    Args:
        audio_path: Path of the audio file; passed straight to ``librosa.load``.
        duration: Length of each chunk in seconds (default 1.0).
        sample_rate: Sample rate requested from ``librosa.load``. Defaults to
            16 kHz, the value that was previously hardcoded here.

    Returns:
        A list of array slices in file order. Every chunk holds
        ``int(sr * duration)`` samples except possibly the last one, which holds
        whatever samples are left over. An empty signal yields an empty list.

    Raises:
        ValueError: If ``duration`` is too small to cover at least one sample
            (previously this surfaced as an opaque ZeroDivisionError).
    """
    # y is the audio time-series, sr is the sample rate actually used.
    y, sr = librosa.load(audio_path, sr=sample_rate)

    chunk_len = int(sr * duration)
    if chunk_len < 1:
        raise ValueError(
            f"duration={duration!r} covers less than one sample at {sr} Hz"
        )

    # Stepping by chunk_len yields every full chunk and, because slicing clamps
    # at the end of the array, the shorter trailing remainder as well.
    return [y[start:start + chunk_len] for start in range(0, len(y), chunk_len)]

def _first_audio_files(subdir, limit=3):
    """Return the paths of the first `limit` files (by sorted name) in AUDIO/<subdir>."""
    folder = os.path.join(r, "AUDIO", subdir)
    # os.listdir returns entries in arbitrary order; sorting makes the choice of
    # "first 3 files" the same on every run and every machine.
    return [os.path.join(folder, name) for name in sorted(os.listdir(folder))[:limit]]


real_paths = _first_audio_files("REAL")
fake_paths = _first_audio_files("FAKE")
audio_chunks, labels = [], []

# Label convention: 0 = REAL, 1 = FAKE.
for label, paths in ((0, real_paths), (1, fake_paths)):
    for p in paths:
        chunks = split_audio_into_seconds(p)
        audio_chunks.extend(chunks)
        labels.extend([label] * len(chunks))

# Pair every chunk with its label so both are shuffled in lockstep.
combined = list(zip(audio_chunks, labels))
if not combined:
    # zip(*[]) below would otherwise fail with an unhelpful unpacking error.
    raise RuntimeError("No audio chunks were loaded; check the dataset directory.")

# A dedicated, seeded RNG keeps the shuffle (and therefore the train/eval split)
# reproducible without touching the global `random` state.
random.Random(42).shuffle(combined)

# Unzip the shuffled pairs back into two parallel tuples.
shuffled_chunks, shuffled_labels = zip(*combined)

Finally, we build datasets from the shuffled chunks. For Colab memory reasons we use only the first 2000 chunks as training data, and the next 100 as evaluation data.

In [5]:
#!pip install -U datasets

from datasets import Dataset, Audio

# Split sizes: the first N_TRAIN shuffled chunks train the model, the next N_EVAL evaluate it.
N_TRAIN, N_EVAL = 2000, 100


def _dataset_from_range(lo, hi):
    """Build a Dataset from shuffled chunks/labels in the half-open range [lo, hi)."""
    return Dataset.from_dict(
        {"audio": shuffled_chunks[lo:hi], "label": shuffled_labels[lo:hi]}
    )


train = _dataset_from_range(0, N_TRAIN)
# NOTE: `eval` shadows the Python builtin; the name is kept because later cells use it.
eval = _dataset_from_range(N_TRAIN, N_TRAIN + N_EVAL)

We're going to use Hugging Face's `AutoFeatureExtractor` to load the feature extractor (the audio preprocessor) that accompanies the wav2vec2 checkpoint.

In [6]:
from transformers import AutoFeatureExtractor

# Hub identifier of the pretrained checkpoint; reused later when the classification model is loaded.
model_checkpoint = "facebook/wav2vec2-base"
# Load the feature extractor (input preprocessor) published with that checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



We take the datasets and preprocess them using this feature extractor.

In [7]:
def preprocess_function(examples):
    """Run a batch of raw audio arrays through the feature extractor.

    Args:
        examples: Batch mapping with an "audio" entry holding the raw waveforms.

    Returns:
        Whatever the feature extractor returns for the batch, with each clip
        truncated to at most one second's worth of samples.
    """
    rate = feature_extractor.sampling_rate
    return feature_extractor(
        examples["audio"],
        sampling_rate=rate,
        max_length=int(rate),  # one second of samples
        truncation=True,
    )

# Same mapping options for both splits: process in batches and drop the raw "audio"
# column once the extracted features have been added.
_map_options = dict(remove_columns=["audio"], batched=True)
encoded_train = train.map(preprocess_function, **_map_options)
encoded_eval = eval.map(preprocess_function, **_map_options)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_values'],
    num_rows: 2000
})


Finally, we use the transformers `Trainer` to automate the training process. We load the same wav2vec2-base checkpoint and fine-tune it on the 2000 training samples. For evaluation, we rely on accuracy: whether each 1-second chunk was correctly classified as coming from a `REAL` or `FAKE` clip of speech.

In [None]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
import os
import evaluate

# Turn off Weights & Biases logging for this run.
# NOTE(review): the training output reports this environment variable as deprecated in
# favour of the `report_to` setting — consider migrating.
os.environ["WANDB_DISABLED"] = "true"

# Accuracy metric object from the `evaluate` library; used by compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the accuracy metric.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference labels).

    Returns:
        The result of `metric.compute` for the arg-max class of each row.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

# Pretrained wav2vec2 checkpoint with a newly initialised 2-way classification head
# (the labels built earlier use 0 = REAL, 1 = FAKE).
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

# Name the output directory after the checkpoint, e.g. "wav2vec2-base-finetuned".
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned",
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version before changing it.
    evaluation_strategy = "steps",
    eval_steps = 5,
    save_strategy = "steps",
    # Checkpoint on the same cadence as evaluation. With the default save_steps=500 and
    # only a few hundred optimisation steps in this run, no intermediate checkpoint was
    # ever written, so load_best_model_at_end could not restore the best-scoring step.
    save_steps=5,
    # Keep disk usage bounded despite the frequent checkpoints.
    save_total_limit=2,
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Explicitly disable external logging integrations; this is the replacement the
    # deprecation warning for the WANDB_DISABLED environment variable recommends.
    report_to="none",
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    # Passing the feature extractor here lets the Trainer use it when collating batches.
    # NOTE(review): recent transformers releases appear to prefer `processing_class=` for
    # this argument — confirm against the installed version.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
5,0.6949,0.696217,0.36
