<a href="https://colab.research.google.com/github/ken00H/Csharp-windows-form-sample/blob/master/Voice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import kagglehub
import shutil

# Define the target directory
target_dir = "/content/DataSet"
os.makedirs(target_dir, exist_ok=True)

# Download and extract the dataset to the default KaggleHub cache location
# The 'path' argument in dataset_download seems to be causing issues when trying to set the destination directly
downloaded_path_in_cache = kagglehub.dataset_download("birdy654/deep-voice-deepfake-voice-recognition")

# Move the contents from the downloaded cache path to the target directory
for item in os.listdir(downloaded_path_in_cache):
    source_item = os.path.join(downloaded_path_in_cache, item)
    destination_item = os.path.join(target_dir, item)
    # Use shutil.move to move files and directories
    shutil.move(source_item, destination_item)

# Optional: Remove the now empty cache directory if it's not the root of the cache
# Check if the directory is empty before removing
if not os.listdir(downloaded_path_in_cache):
    os.rmdir(downloaded_path_in_cache)

# The final path where the dataset contents are located is the target_dir
path = target_dir

print(f"Dataset downloaded and extracted to: {path}")
print(f"Contents of {path}: {os.listdir(path)}")

### Install Optuna for Hyperparameter Tuning

In [4]:
pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


### Modified `train_model` for Optuna Tuning

I've modified the `train_model` function to accept an `optuna.trial.Trial` object. This allows Optuna to suggest a learning rate (`lr`) for each trial. The objective function will now return the validation accuracy, which Optuna will try to maximize.

In [14]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import librosa
import numpy as np
import warnings
import random
import time
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MFCC, FrequencyMasking, TimeMasking
from torchvision.models import resnet18
import optuna

# --- Configuration ---
warnings.filterwarnings("ignore")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class UltimateForensicDataset(Dataset):
    def __init__(self, file_list, sample_rate=16000, duration=3, is_train=True):
        self.file_list = file_list
        self.sample_rate = sample_rate
        self.num_samples = sample_rate * duration
        self.is_train = is_train

        # Feature Extractor
        self.mfcc_transform = MFCC(
            sample_rate=sample_rate, n_mfcc=40,
            melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 40}
        ).to(device)

        # SpecAugment for generalization
        self.freq_mask = FrequencyMasking(freq_mask_param=10)
        self.time_mask = TimeMasking(time_mask_param=30)

    def __len__(self):
        return len(self.file_list)

    def simulate_compression(self, audio):
        """Simulates MP3/Lossy compression artifacts"""
        if random.random() > 0.5:
            temp_sr = random.randint(8000, 12000)
            audio = librosa.resample(audio, orig_sr=self.sample_rate, target_sr=temp_sr)
            audio = librosa.resample(audio, orig_sr=temp_sr, target_sr=self.sample_rate)
        return audio

    def __getitem__(self, idx):
        path, label = self.file_list[idx]
        try:
            # DEBUG: Print current file being processed
            print(f"Processing audio file: {path}")
            # Robust loading for both .wav and .mp3
            audio, _ = librosa.load(path, sr=self.sample_rate)

            # Random Cropping (Temporal Variability)
            if len(audio) > self.num_samples:
                start = random.randint(0, len(audio) - self.num_samples) if self.is_train else 0
                audio = audio[start: start + self.num_samples]
            else:
                audio = np.pad(audio, (0, max(0, self.num_samples - len(audio))))

            if self.is_train:
                audio = self.simulate_compression(audio)  # MP3-Robustness
                audio = audio + np.random.normal(0, 0.002, audio.shape)  # Noise-Robustness

            audio = audio / (np.max(np.abs(audio)) + 1e-9)
            waveform = torch.from_numpy(audio).float().unsqueeze(0).to(device)

            with torch.no_grad():
                mfcc = self.mfcc_transform(waveform)
                if self.is_train:
                    mfcc = self.freq_mask(mfcc)
                    mfcc = self.time_mask(mfcc)

                delta = torchaudio.functional.compute_deltas(mfcc)
                delta2 = torchaudio.functional.compute_deltas(delta)
                features = torch.cat([mfcc, delta, delta2], dim=0)

            return features, torch.tensor(label, dtype=torch.float32)
        except Exception as e:
            print(f"Error processing file {path}: {e}")
            return torch.zeros((3, 40, 301)).to(device), torch.tensor(label, dtype=torch.float32)


class UltimateResNet(nn.Module):
    def __init__(self):
        super(UltimateResNet, self).__init__()
        self.resnet = resnet18(weights=None)
        self.resnet.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Sequential(
            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(),
            nn.Dropout(0.7),  # Strong dropout to stop memorization
            nn.Linear(256, 1), nn.Sigmoid()
        )

    def forward(self, x): return self.resnet(x)


def train_model_optuna(trial, data_path):
    all_files = []
    for root, _, files in os.walk(data_path):
        for file in files:
            if file.lower().endswith(('.wav', '.mp3')):
                file_path = os.path.join(root, file)
                label = 0 if 'REAL' in file_path.upper() else 1 if 'FAKE' in file_path.upper() else None
                if label is not None: all_files.append((file_path, label))

    random.shuffle(all_files)
    split = int(0.8 * len(all_files))
    train_loader = DataLoader(UltimateForensicDataset(all_files[:split], is_train=True), batch_size=4, shuffle=True)
    val_loader = DataLoader(UltimateForensicDataset(all_files[split:], is_train=False), batch_size=4)

    model = UltimateResNet().to(device)

    # Suggest a learning rate to Optuna
    lr = trial.suggest_float("lr", 1e-6, 1e-3, log=True)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
    criterion = nn.BCELoss()

    print("\n" + "=" * 40)
    print("      FORENSIC TRAINING PROGRESS (Optuna Trial)")
    print("=" * 40)

    best_acc = 0
    for epoch in range(10): # Limiting epochs for Optuna trials to save time
        start_time = time.time()
        model.train()
        train_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        correct, val_loss = 0, 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                correct += ((outputs > 0.5).float() == labels).sum().item()

        acc = 100 * correct / len(all_files[split:])
        scheduler.step(val_loss)
        duration = time.time() - start_time
        current_lr = optimizer.param_groups[0]['lr']

        # --- PROGRESS TRACKER OUTPUT ---
        print(
            f"Epoch {epoch + 1:02d} | Loss: {train_loss / len(train_loader):.4f} | Acc: {acc:.2f}% | LR: {current_lr:.1e} | Time: {duration:.1f}s")

        # Report intermediate accuracy to Optuna
        trial.report(acc, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return acc # Optuna optimizes for this return value (using the last epoch's accuracy)

def train_model_final(data_path, lr, num_epochs=5, model_save_path="best_model.pth"):
    all_files = []
    for root, _, files in os.walk(data_path):
        for file in files:
            if file.lower().endswith(('.wav', '.mp3')):
                file_path = os.path.join(root, file)
                label = 0 if 'REAL' in file_path.upper() else 1 if 'FAKE' in file_path.upper() else None
                if label is not None: all_files.append((file_path, label))

    random.shuffle(all_files)
    split = int(0.8 * len(all_files))
    train_loader = DataLoader(UltimateForensicDataset(all_files[:split], is_train=True), batch_size=4, shuffle=True)
    val_loader = DataLoader(UltimateForensicDataset(all_files[split:], is_train=False), batch_size=4)

    model = UltimateResNet().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
    criterion = nn.BCELoss()

    print("\n" + "=" * 40)
    print("      FORENSIC TRAINING PROGRESS (Final Training)")
    print("=" * 40)

    best_acc = 0
    for epoch in range(num_epochs):
        start_time = time.time()
        model.train()
        train_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        correct, val_loss = 0, 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                correct += ((outputs > 0.5).float() == labels).sum().item()

        acc = 100 * correct / len(all_files[split:])
        scheduler.step(val_loss)
        duration = time.time() - start_time
        current_lr = optimizer.param_groups[0]['lr']

        print(
            f"Epoch {epoch + 1:02d} | Loss: {train_loss / len(train_loader):.4f} | Acc: {acc:.2f}% | LR: {current_lr:.1e} | Time: {duration:.1f}s")

        if acc >= best_acc:
            best_acc = acc
            torch.save(model.state_dict(), model_save_path)
            print(f"   >>> SAVED BEST MODEL ({acc:.2f}%) to {model_save_path}")

    return best_acc

### Run Optuna Study

This will run multiple trials, each with a different suggested learning rate, and find the best performing one. I'm setting `n_trials=10` for demonstration purposes, but you can increase this for a more thorough search. Also, note that the `train_model_optuna` function now runs for fewer epochs to speed up the tuning process.

In [15]:
# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: train_model_optuna(trial, "/content/DataSet/KAGGLE/AUDIO"), n_trials=3)

print("\n--- Optuna Study Results ---")
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")
print(f"Best validation accuracy: {study.best_value:.2f}%")

# Retrain the model with the best hyperparameters found by Optuna
best_lr = study.best_trial.params['lr']
print(f"\nRetraining model with best learning rate: {best_lr:.1e}")
train_model_final(data_path="/content/DataSet/KAGGLE/AUDIO", lr=best_lr, num_epochs=5)

print("\nModel trained and saved as 'best_model.pth'. You can now run the inference cell.")

[I 2026-01-07 16:25:34,597] A new study created in memory with name: no-name-b535f43d-ea1f-4c33-87ae-56225beac7bd



      FORENSIC TRAINING PROGRESS (Optuna Trial)
Processing audio file: /content/DataSet/KAGGLE/AUDIO/FAKE/trump-to-taylor.wav
Processing audio file: /content/DataSet/KAGGLE/AUDIO/FAKE/musk-to-obama.wav
Processing audio file: /content/DataSet/KAGGLE/AUDIO/FAKE/obama-to-linus.wav


[W 2026-01-07 16:25:40,891] Trial 0 failed with parameters: {'lr': 1.3797410950052377e-05} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipython-input-1051394554.py", line 3, in <lambda>
    study.optimize(lambda trial: train_model_optuna(trial, "/content/DataSet/KAGGLE/AUDIO"), n_trials=3)
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-1365596503.py", line 132, in train_model_optuna
    for inputs, labels in train_loader:
                          ^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 732, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py

KeyboardInterrupt: 

In [16]:
import torch
import torch.nn as nn
import torchaudio
import librosa
import numpy as np
from torchaudio.transforms import MFCC
from torchvision.models import resnet18

# --- 1. CONFIGURATION (Must Match Training) ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = "best_model.pth"
SAMPLE_RATE = 16000
WINDOW_DURATION = 3
STRIDE = 1.0  # Slightly smaller stride for more detailed scanning
NUM_SAMPLES = int(SAMPLE_RATE * WINDOW_DURATION)


# --- 2. UPDATED MODEL DEFINITION (Must Match Training) ---
class UltimateResNet(nn.Module):
    def __init__(self):
        super(UltimateResNet, self).__init__()
        self.resnet = resnet18(weights=None)
        self.resnet.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Sequential(
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.7),  # Updated to match your new training script
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.resnet(x)


# --- 3. UPDATED ANALYSIS LOGIC ---
def analyze_audio_forensics(file_path):
    model = UltimateResNet().to(DEVICE)
    try:
        # Load the weights from your robust training
        model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    except Exception as e:
        print(f"Error loading model: {e}")
        return
    model.eval()

    mfcc_transform = MFCC(
        sample_rate=SAMPLE_RATE,
        n_mfcc=40,
        melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 40}
    ).to(DEVICE)

    try:
        # Load with Librosa to handle .mp3 correctly
        audio, _ = librosa.load(file_path, sr=SAMPLE_RATE)
        total_len = len(audio)

        chunk_scores = []
        step_size = int(STRIDE * SAMPLE_RATE)

        print(f"Forensic Scan: {file_path}")

        # Sliding Window Loop
        for start in range(0, total_len - NUM_SAMPLES + 1, step_size):
            chunk = audio[start: start + NUM_SAMPLES]

            # Same normalization as training
            chunk = chunk / (np.max(np.abs(chunk)) + 1e-9)
            waveform = torch.from_numpy(chunk).float().unsqueeze(0).to(DEVICE)

            with torch.no_grad():
                mfcc = mfcc_transform(waveform)
                # Feature sync: MFCC + Delta + Delta2
                delta = torchaudio.functional.compute_deltas(mfcc)
                delta2 = torchaudio.functional.compute_deltas(delta)
                features = torch.cat([mfcc, delta, delta2], dim=0).unsqueeze(0)

                prob = model(features).item()
                chunk_scores.append(prob)

        if not chunk_scores:
            print("Audio too short for 3-second window analysis.")
            return

        # Verdict Calculation
        final_average = np.mean(chunk_scores)
        # 0 = REAL, 1 = FAKE
        verdict = "FAKE (AI Generated)" if final_average > 0.5 else "REAL (Human Voice)"
        confidence = max(final_average, 1 - final_average) * 100

        print("\n" + "=" * 35)
        print(f"RESULT: {verdict}")
        print(f"AI Detection Confidence: {confidence:.2f}%")
        print(f"Total Segments Scanned: {len(chunk_scores)}")
        print("=" * 35)

    except Exception as e:
        print(f"Inference Error: {e}")


if __name__ == "__main__":
    # Point this to your actual file path
    TEST_FILE = "/content/DataSet/DEMONSTRATION/DEMONSTRATION"
    analyze_audio_forensics(TEST_FILE)

Error loading model: [Errno 2] No such file or directory: 'best_model.pth'
