In [1]:
!pip install torchaudio librosa

import os
import librosa
import numpy as np
import torch
import torchaudio
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
from glob import glob
from tqdm import tqdm


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch==2.6.0->torchaudio)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (

In [2]:
class FusionAudioDataset(Dataset):
    def __init__(self, real_files, fake_files):
        self.samples = [(f, 0) for f in real_files] + [(f, 1) for f in fake_files]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        # Raw waveform
        waveform, sr = torchaudio.load(path)
        waveform = waveform[:, :16000]  # Truncate or pad to 1 sec
        if waveform.shape[1] < 16000:
            pad = 16000 - waveform.shape[1]
            waveform = torch.nn.functional.pad(waveform, (0, pad))

        # Mel spectrogram
        mel = audio_to_mel(path)
        mel = torch.tensor(mel).unsqueeze(0)

        return waveform.float(), mel.float(), torch.tensor(label).long()

In [3]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio
import librosa
import numpy as np

class DeepfakeAudioDataset(Dataset):
    def __init__(self, root_dir, sample_rate=16000, duration=1.0):
        self.root_dir = root_dir
        self.sample_rate = sample_rate
        self.duration = duration
        self.audio_paths = []
        self.labels = []

        # change the label based on the file name.
        for label_str, label in [('real', 0), ('fake', 1)]:
            folder = os.path.join(root_dir, label_str)
            for file in os.listdir(folder):
                #follow what the file format
                if file.endswith(".wav"):
                    self.audio_paths.append(os.path.join(folder, file))
                    self.labels.append(label)

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        path = self.audio_paths[idx]
        label = self.labels[idx]

        waveform, sr = torchaudio.load(path)
        waveform = waveform.mean(dim=0)  # mono
        waveform = torchaudio.functional.resample(waveform, sr, self.sample_rate)

        # Pad/crop to fixed duration
        max_len = int(self.sample_rate * self.duration)
        if waveform.shape[0] < max_len:
            waveform = torch.nn.functional.pad(waveform, (0, max_len - waveform.shape[0]))
        else:
            waveform = waveform[:max_len]

        # Mel spectrogram
        mel = librosa.feature.melspectrogram(y=waveform.numpy(), sr=self.sample_rate, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        mel_tensor = torch.tensor(mel_db).unsqueeze(0).float()  # (1, 128, T)

        return waveform.unsqueeze(0).float(), mel_tensor, torch.tensor(label).long()


In [4]:
class SpecRNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((32, 32))
        )
        self.fc = nn.Linear(64 * 32 * 32, 128)

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)


In [5]:
class RawGATST(nn.Module):
    def __init__(self):
        super(RawGATST, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm1d(64), nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm1d(128), nn.ReLU()
        )
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(128, 128)

    def forward(self, x):
        x = self.conv(x)
        x = self.pool(x).squeeze(-1)
        x = self.fc(x)
        return x

In [6]:
class FusionNet(nn.Module):
    def __init__(self):
        super(FusionNet, self).__init__()
        self.spec_model = SpecRNet()
        self.raw_model = RawGATST()
        self.classifier = nn.Sequential(
            nn.Linear(128 + 128, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)
        )

    def forward(self, raw_wave, mel_spec):
        raw_feat = self.raw_model(raw_wave)
        spec_feat = self.spec_model(mel_spec)
        fused = torch.cat((raw_feat, spec_feat), dim=1)
        return self.classifier(fused)

In [7]:
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get dataset
dataset = DeepfakeAudioDataset("/kaggle/input/in-the-wild-audio-deepfake/release_in_the_wild")

# Create index list
indices = list(range(len(dataset)))

# Split into train + temp (val+test)
train_idx, temp_idx = train_test_split(indices, test_size=0.4, random_state=42)  # 60% train, 40% temp

# Split temp into validation + test (each 20%)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)  # 20% val, 20% test

# Wrap subsets
train_set = Subset(dataset, train_idx)
val_set   = Subset(dataset, val_idx)
test_set  = Subset(dataset, test_idx)

# DataLoaders
train_loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_set, batch_size=8, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_set, batch_size=8, shuffle=True, num_workers=2)


In [8]:
model = FusionNet().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(20):
    model.train()
    total_loss = 0
    correct = 0
    for raw, mel, labels in train_loader:
        raw, mel, labels = raw.to(device), mel.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(raw, mel)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (output.argmax(1) == labels).sum().item()
    acc = correct / len(train_loader.dataset)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Train Acc: {acc:.4f}")


Epoch 1, Loss: 501.4216, Train Acc: 0.9413
Epoch 2, Loss: 112.8951, Train Acc: 0.9858
Epoch 3, Loss: 80.1846, Train Acc: 0.9890
Epoch 4, Loss: 56.1395, Train Acc: 0.9927
Epoch 5, Loss: 41.3158, Train Acc: 0.9951
Epoch 6, Loss: 38.1364, Train Acc: 0.9950
Epoch 7, Loss: 31.1283, Train Acc: 0.9961
Epoch 8, Loss: 27.8921, Train Acc: 0.9972
Epoch 9, Loss: 26.5467, Train Acc: 0.9969
Epoch 10, Loss: 20.7419, Train Acc: 0.9973
Epoch 11, Loss: 22.1530, Train Acc: 0.9974
Epoch 12, Loss: 19.7598, Train Acc: 0.9973
Epoch 13, Loss: 11.6636, Train Acc: 0.9983
Epoch 14, Loss: 15.2707, Train Acc: 0.9982
Epoch 15, Loss: 15.0848, Train Acc: 0.9982
Epoch 16, Loss: 12.6915, Train Acc: 0.9985
Epoch 17, Loss: 15.1656, Train Acc: 0.9983
Epoch 18, Loss: 8.1618, Train Acc: 0.9991
Epoch 19, Loss: 18.5750, Train Acc: 0.9980
Epoch 20, Loss: 16.3234, Train Acc: 0.9983


In [9]:
torch.save(model.state_dict(), "specRNet_rawGATST_fusion_adam_fulldataset_deep-voice-recog.pth")

#save checkpoint model state 

#tukar nama model everytime tukar nama model or anything.. baca paper

In [10]:
import torch
import torch.nn.functional as F

model.eval()

misclassified = []
correct_pred = []

with torch.no_grad():
    for raw_batch, mel_batch, labels_batch in test_loader:
        raw_batch, mel_batch, labels_batch = raw_batch.to(device), mel_batch.to(device), labels_batch.to(device)

        outputs = model(raw_batch, mel_batch)
        probs = F.softmax(outputs, dim=1)
        preds = torch.argmax(probs, dim=1)

        for i in range(len(labels_batch)):
            true_label = labels_batch[i].item()
            pred_label = preds[i].item()
            confidence = probs[i][pred_label].item()

            entry = {
                "true": "FAKE" if true_label == 1 else "REAL",
                "pred": "FAKE" if pred_label == 1 else "REAL",
                "confidence": confidence
            }

            if true_label != pred_label:
                misclassified.append(entry)
            else:
                correct_pred.append(entry)

print(f"✅ Total Correct: {len(correct_pred)}")
print(f"❌ Total Misclassified: {len(misclassified)}")

# Show some misclassifications
print("\n🔎 Sample Misclassifications:")
for item in misclassified[:10]:  # show first 10
    print(f"True={item['true']} | Pred={item['pred']} | Conf={item['confidence']*100:.2f}%")


✅ Total Correct: 6314
❌ Total Misclassified: 42

🔎 Sample Misclassifications:
True=FAKE | Pred=REAL | Conf=100.00%
True=FAKE | Pred=REAL | Conf=96.51%
True=REAL | Pred=FAKE | Conf=99.82%
True=FAKE | Pred=REAL | Conf=99.75%
True=FAKE | Pred=REAL | Conf=99.85%
True=REAL | Pred=FAKE | Conf=71.97%
True=REAL | Pred=FAKE | Conf=100.00%
True=FAKE | Pred=REAL | Conf=99.76%
True=FAKE | Pred=REAL | Conf=85.55%
True=REAL | Pred=FAKE | Conf=97.57%


In [11]:
import time
import torch.nn.functional as F

# Move model to device (if not already)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Get one batch from the validation loader
data_iter = iter(test_loader)
raw_batch, mel_batch, labels_batch = next(data_iter)

# Pick one sample (e.g. index 0)
raw_sample = raw_batch[2].unsqueeze(0).to(device)  # move to GPU
mel_sample = mel_batch[2].unsqueeze(0).to(device)  # move to GPU
label = labels_batch[2].item()

# Start timing
start_time = time.time()

# Run inference
model.eval()
with torch.no_grad():
    output = model(raw_sample, mel_sample)
    prediction = torch.argmax(output, dim=1).item()
    probs = F.softmax(output, dim=1)
    confidence = probs[0][prediction].item()

end_time = time.time()
inference_time = end_time - start_time

# Print results
print(f"True Label: {'FAKE' if label == 1 else 'REAL'}")
print(f"Prediction: {'FAKE' if prediction == 1 else 'REAL'} ({confidence*100:.2f}%)")
print(f"Inference Time: {inference_time:.4f} seconds")

True Label: REAL
Prediction: REAL (100.00%)
Inference Time: 0.0533 seconds
