In [18]:
import os
import subprocess
import zipfile
import hashlib
import requests
import random
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

import librosa

from utils.config import load_config 

## Get Data Paths from Config

In [51]:
config = load_config()
config

{'data_paths': {'raw_data_path': '/app/data/raw',
  'processed_data_path': '/app/data/processed',
  'train_raw_path': '/app/data/raw/codecfake/train',
  'label_raw_path': '/app/data/raw/codecfake/label'}}

## Helper Functions

In [3]:
def calculate_md5(filename, block_size=4096):
    """Calculate the MD5 checksum of a file."""
    md5 = hashlib.md5()
    with open(filename, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b""):
            md5.update(block)
    return md5.hexdigest()

In [4]:
def download_from_zenodo(direct_url, destination, expected_checksum):
    """Download a large file from Zenodo in chunks to avoid loading it all into memory."""
    if os.path.exists(destination):
        if calculate_md5(destination) == expected_checksum:
            print(f"File already exists and is verified: {destination}")
            return True
        else:
            print(f"Checksum mismatch or file corrupted. Re-downloading: {destination}")
            os.remove(destination) 
    try:
        with requests.get(direct_url, stream=True) as response:
            response.raise_for_status() 
            with open(destination, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Downloaded file to {destination}")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e.response.status_code} {e.response.reason}")
    except requests.exceptions.ConnectionError:
        print("Connection Error. Please check your internet connection.")
    except requests.exceptions.Timeout:
        print("The request timed out.")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

In [5]:
def download_parts(config):
    repo_urls = [
        "https://zenodo.org/record/11171708",
        "https://zenodo.org/records/11171720",
        "https://zenodo.org/records/11171724"
    ]

    files = [
        [f'train_split.z{str(num).zfill(2)}' for num in range(1, 7)] + ['train_split.zip', 'label.zip'],
        [f'train_split.z{str(num).zfill(2)}' for num in range(7, 15)],
        [f'train_split.z{str(num).zfill(2)}' for num in range(15, 20)]
    ]

    checksums = {
        'train_split.z01': '61423ee0c7e9991b7272b9e50b234439',
        'train_split.z02': '938387b24c700fd3167caff5b6c4c2cc',
        'train_split.z03': '6ed3919559200bfa2e09416816a748ab',
        'train_split.z04': 'cffba4cd8a551e1da36e821e3db1137b',
        'train_split.z05': 'c90ea493d8bfda6cf0fb7713e2bdf628',
        'train_split.z06': 'a8363316c2db890f62d9a3f05ffa882b',
        'train_split.z07': '8c89c7b19c2860dc360e53cf484f7844',
        'train_split.z08': '069fb8d4ff70deafe2b23e70f67c255f',
        'train_split.z09': '208fa914647e7519bf93eb04427e94ab',
        'train_split.z10': '3441024afe061775a29d49292d6b94f6',
        'train_split.z11': 'ef9b40ff9145bbe925944aa5a97a6060',
        'train_split.z12': 'c9a30c2d9c4d0fd59c23058990e79c68',
        'train_split.z13': '2fa3c4f13cad47c1a2c8da6b02593197',
        'train_split.z14': 'd4b19b65945532a1192cfdaea45fe6e5',
        'train_split.z15': 'f1416171017fe86806c1642f36865d22',
        'train_split.z16': '4005490382925a7dde0df498831d4595',
        'train_split.z17': '4aabe67a30484ab45919e58250f1d2c7',
        'train_split.z18': '24fc5547fb782d59a8f94e53eb9fd2bc',
        'train_split.z19': '2ded1a7fda786a04743923790a27f39f',
        'train_split.zip': '600a9ab2c5d820004fecc0e67ac2f645',
        'label.zip': '1886fa25a8e018307e709da28bdc57b2'
    }
    
    # Set up the ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        for repo_url, file_group in zip(repo_urls, files):
            for file in file_group:
                download_url = f"{repo_url}/files/{file}"
                zip_file_path = f"{config['data_paths']['raw_data_path']}/codecfake/{file}"
                expected_checksum = checksums.get(file, None)
                if expected_checksum:
                    print(f"Downloading {download_url} to {zip_file_path}")
                    futures.append(executor.submit(download_from_zenodo, download_url, zip_file_path, expected_checksum))
    
    # Wait for all futures to complete
    for future in futures:
        future.result() 
    print("All downloads completed.")

## Download the 19-Part Archived Train Dataset

Skip downloading if files are already present and verified, as verification is time-consuming due to each file being approximately 5GB.

In [9]:
download_parts(config) 

Downloading https://zenodo.org/record/11171708/files/train_split.z01 to /app/data/raw/codecfake/train_split.z01
Downloading https://zenodo.org/record/11171708/files/train_split.z02 to /app/data/raw/codecfake/train_split.z02
Downloading https://zenodo.org/record/11171708/files/train_split.z03 to /app/data/raw/codecfake/train_split.z03
Downloading https://zenodo.org/record/11171708/files/train_split.z04 to /app/data/raw/codecfake/train_split.z04
Downloading https://zenodo.org/record/11171708/files/train_split.z05 to /app/data/raw/codecfake/train_split.z05
Downloading https://zenodo.org/record/11171708/files/train_split.z06 to /app/data/raw/codecfake/train_split.z06
Downloading https://zenodo.org/record/11171708/files/train_split.zip to /app/data/raw/codecfake/train_split.zip
Downloading https://zenodo.org/record/11171708/files/label.zip to /app/data/raw/codecfake/label.zip
Downloading https://zenodo.org/records/11171720/files/train_split.z07 to /app/data/raw/codecfake/train_split.z07
Dow

## Extract Label files from label.zip

In [7]:
label_zip_path =  f"{config['data_paths']['raw_data_path']}/codecfake/label.zip"
label_output_path = f"{config['data_paths']['raw_data_path']}/codecfake/label"
if not os.path.exists(label_output_path):
    unzip_file(label_zip_path, label_output_path)

In [8]:
file_path = f"{label_output_path}/train.txt"

with open(file_path, 'r') as file:
    for _ in range(21):
        line = file.readline()
        if not line: 
            break
        print(line.strip())

SSB13650058.wav real 0
F01_SSB13650058.wav fake 1
F02_SSB13650058.wav fake 2
F03_SSB13650058.wav fake 3
F04_SSB13650058.wav fake 4
F05_SSB13650058.wav fake 5
F06_SSB13650058.wav fake 6
SSB13280206.wav real 0
F01_SSB13280206.wav fake 1
F02_SSB13280206.wav fake 2
F03_SSB13280206.wav fake 3
F04_SSB13280206.wav fake 4
F05_SSB13280206.wav fake 5
F06_SSB13280206.wav fake 6
SSB07860395.wav real 0
F01_SSB07860395.wav fake 1
F02_SSB07860395.wav fake 2
F03_SSB07860395.wav fake 3
F04_SSB07860395.wav fake 4
F05_SSB07860395.wav fake 5
F06_SSB07860395.wav fake 6


**Count Fakes Associated with Each Real Audio File**

In [9]:
file_path = f"{label_output_path}/train.txt"

real_to_fake_count = {}

with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split()
        filename = parts[0]
        label = parts[1]
        if label == "real":
            real_to_fake_count[filename] = 0
        elif label == "fake" and parts[2] != '0':
            # Find the associated real file name by removing the first part (F01_, F02_, etc.)
            real_file = filename[4:]
            if real_file in real_to_fake_count:
                real_to_fake_count[real_file] += 1
            else:
                print(f"Warning: No real file found for {filename}")

consistent = True
for real, count in real_to_fake_count.items():
    if count != 6:
        print(f"Inconsistent count for {real}: {count} fakes found (expected 6).")
        consistent = False

if consistent:
    print("All real files have exactly 6 corresponding fake files.")
else:
    print("Some real files do not have the correct number of corresponding fake files.")


All real files have exactly 6 corresponding fake files.


In [10]:
print(f'{len(real_to_fake_count)} real audio files are there! Each has 6 corresponding fake files. So, totally {len(real_to_fake_count)*6} audio files')

105821 real audio files are there! Each has 6 corresponding fake files. So, totally 634926 audio files


## Extract Specific Audio Files

Extract only specific audio files, as the total size exceeds 101 GB.

In [11]:
def extract_specific_files_with_7z(archive_path, output_path, files_to_extract, verbose=False):
    """Extract specific files from a multi-part zip archive using 7z from a subprocess."""
    try:
        # Prepare the command to include specific files
        command = ['7z', 'x', archive_path, f'-o{output_path}', '-aos'] + files_to_extract
        if verbose:
            subprocess.run(command, check=True)
        else:
            subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        if verbose:
            print(f"Extracted specified files to {output_path}")
    except subprocess.CalledProcessError as e:
        print(f"Failed to extract files: {str(e)}")

In [40]:
archive_path = f"{config['data_paths']['raw_data_path']}/codecfake/train_split.zip"
output_path = f"{config['data_paths']['raw_data_path']}/codecfake/"
files_to_extract = [
    # SSB13650058.wav
    'train/SSB13650058.wav',
    'train/F01_SSB13650058.wav',
    'train/F02_SSB13650058.wav',
    'train/F03_SSB13650058.wav',
    'train/F04_SSB13650058.wav',
    'train/F05_SSB13650058.wav',
    'train/F06_SSB13650058.wav',
    # SSB13280206.wav
    'train/SSB13280206.wav',
    'train/F01_SSB13280206.wav',
    'train/F02_SSB13280206.wav',
    'train/F03_SSB13280206.wav',
    'train/F04_SSB13280206.wav',
    'train/F05_SSB13280206.wav',
    'train/F06_SSB13280206.wav',
]

extract_specific_files_with_7z(archive_path, output_path, files_to_extract, verbose=True)


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=C.UTF-8,Utf16=on,HugeFiles=on,64 bits,10 CPUs LE)

Scanning the drive for archives:
1 file, 1395022381 bytes (1331 MiB)

Extracting archive: /app/data/raw/codecfake/train_split.zip
--
Path = /app/data/raw/codecfake/train_split.zip
Type = zip
Physical Size = 1395022381
Embedded Stub Size = 4
64-bit = +
Total Physical Size = 101009742381
Multivolume = +
Volume Index = 19
Volumes = 20

Everything is Ok

Files: 14
Size:       2948044
Compressed: 101009742381
Extracted specified files to /app/data/raw/codecfake/


## Look at the train folder

In [41]:
train_dir_path = f"{config['data_paths']['raw_data_path']}/codecfake/train"
os.listdir(train_dir_path)

['F03_SSB13280206.wav',
 'F05_SSB13650058.wav',
 'SSB13650058.wav',
 'F02_SSB13650058.wav',
 'F04_SSB13280206.wav',
 'SSB13280206.wav',
 'F05_SSB13280206.wav',
 'F03_SSB13650058.wav',
 'F04_SSB13650058.wav',
 'F02_SSB13280206.wav',
 'F01_SSB13650058.wav',
 'F06_SSB13650058.wav',
 'F01_SSB13280206.wav',
 'F06_SSB13280206.wav']

## Making sure sampling rate is same across all audio files

In [19]:
file_to_sr = {}
archive_path = f"{config['data_paths']['raw_data_path']}/codecfake/train_split.zip"
output_path = f"{config['data_paths']['raw_data_path']}/codecfake/"

def process_file(file_path, output_path):
    """Function to process a single file to extract it and read its sampling rate."""
    extract_specific_files_with_7z(archive_path, output_path, [file_path])
    full_path = os.path.join(output_path, file_path)
    y, sr = librosa.load(full_path, sr=None)
    ################ DELETE THE FILE TO SAVE SPACE ###################
    os.remove(full_path)
    ##################################################################
    return file_path, sr


def process_files_concurrently(all_files_to_process):
    num_files = len(all_files_to_process)
    update_interval = num_files // 100
    count = 0
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        future_to_file = {executor.submit(process_file, file, output_path): file for file in all_files_to_process}
        for future in as_completed(future_to_file):
            file, sr = future.result()
            file_to_sr[file] = sr
            count += 1
            if count % update_interval == 0:
                print(f"Processed {count}/{num_files} files. ({(count/num_files)*100:.2f}%)")

all_files_to_process = []
for real in random.sample(list(real_to_fake_count.keys()), 100): # real_to_fake_count.keys()
    real_file_path = os.path.join('train', real)
    fake_file_paths = [os.path.join('train', f'F{str(num).zfill(2)}_{real}') for num in range(1, 7)]
    all_files_to_process.extend([real_file_path] + fake_file_paths)
    
process_files_concurrently(all_files_to_process)

Processed 7/700 files. (1.00%)
Processed 14/700 files. (2.00%)
Processed 21/700 files. (3.00%)
Processed 28/700 files. (4.00%)
Processed 35/700 files. (5.00%)
Processed 42/700 files. (6.00%)
Processed 49/700 files. (7.00%)
Processed 56/700 files. (8.00%)
Processed 63/700 files. (9.00%)
Processed 70/700 files. (10.00%)
Processed 77/700 files. (11.00%)
Processed 84/700 files. (12.00%)
Processed 91/700 files. (13.00%)
Processed 98/700 files. (14.00%)
Processed 105/700 files. (15.00%)
Processed 112/700 files. (16.00%)
Processed 119/700 files. (17.00%)
Processed 126/700 files. (18.00%)
Processed 133/700 files. (19.00%)
Processed 140/700 files. (20.00%)
Processed 147/700 files. (21.00%)
Processed 154/700 files. (22.00%)
Processed 161/700 files. (23.00%)
Processed 168/700 files. (24.00%)
Processed 175/700 files. (25.00%)
Processed 182/700 files. (26.00%)
Processed 189/700 files. (27.00%)
Processed 196/700 files. (28.00%)
Processed 203/700 files. (29.00%)
Processed 210/700 files. (30.00%)
Proc

In [49]:
sr_map = {
    'p': {
        'real': 48000,
        'F01': 16000,
        'F02': 16000,
        'F03': 16000,
        'F04': 24000,
        'F05': 48000,
        'F06': 16000,
    },
    'SSB': {
        'real': 44100,
        'F01': 16000,
        'F02': 16000,
        'F03': 16000,
        'F04': 24000,
        'F05': 48000,
        'F06': 24000,
    }
}
for file in file_to_sr.keys():
    # Real Audios
    if not file.replace('train/', '').startswith('F0'):
        if file.replace('train/', '').startswith('p') and file_to_sr[file] != sr_map['p']['real']:
            print('Error', file, file_to_sr[file])
        elif file.replace('train/', '').startswith('SSB') and file_to_sr[file] != sr_map['SSB']['real']:
            print('Error', file, file_to_sr[file])

    # Fake Audios
    else:
        fake_number = file.replace('train/', '')[:3]
        if file.replace(f'train/{fake_number}_', '').startswith('p') and file_to_sr[file] != sr_map['p'][fake_number]:
            print('Error', file, file_to_sr[file])
        elif file.replace(f'train/{fake_number}_', '').startswith('SSB') and file_to_sr[file] != sr_map['SSB'][fake_number]:
            print('Error', file, file_to_sr[file])