In [1]:
import os
import pandas as pd

In [2]:
from tqdm            import tqdm
from getpass         import getpass
from datasets        import Dataset, DatasetDict, Audio
from huggingface_hub import HfApi, HfFolder

In [3]:
from utils.config import load_config

In [4]:
# Load the configuration via utils.config.load_config and look up the
# directory that holds the CodecFake audio files.
config          = load_config()
codecfake_paths = config['data_paths']['codecfake']
audio_dir       = codecfake_paths['audio_files']

### Hugging Face Token Setup


This notebook requires access to the Hugging Face API. You have several options to set up your environment with the necessary token.

**Option 1: Using Environment Variables**

1.a Setting Environment Variables Directly

Set the `HF_TOKEN` environment variable with your Hugging Face token. This can be done in your terminal:

**For Windows:**
```cmd
set HF_TOKEN=your_token_here
```

**For macOS/Linux:**
```bash
export HF_TOKEN=your_token_here
```

1.b Using a `.env` File

Create a `.env` file in the root directory of your project and add the following line:
```
HF_TOKEN=your_token_here
```

Then, use a library like `python-dotenv` to load the variables from the `.env` file. Install `python-dotenv` if you haven't already:
```bash
pip install python-dotenv
```

Add the following code to your script to load the environment variables:
```python
from dotenv import load_dotenv
import os

load_dotenv()
token = os.getenv('HF_TOKEN')
```

**Option 2: Using Hugging Face CLI**

Run the following command in your terminal to log in using the Hugging Face CLI:
```bash
huggingface-cli login
```

**What It Does**

The `huggingface-cli login` command prompts you to paste a Hugging Face access token (create one under **Settings → Access Tokens** on huggingface.co). After logging in, the CLI stores the token in a local file (typically `~/.cache/huggingface/token`; older versions used `~/.huggingface/token`). This token is then used to authenticate your API requests without needing to enter it again.

**Example Code to Handle Token Setup in the Notebook**

Here’s an example of how to handle the Hugging Face token setup in your notebook, incorporating all the methods above:

```python
import os
from huggingface_hub import HfFolder, HfApi
from dotenv import load_dotenv

def get_hf_token():
    # Load environment variables from .env file, if it exists
    load_dotenv()
    
    # Check for the token in environment variables
    token = os.getenv('HF_TOKEN')
    if token is None:
        # If not found in environment variables, check the Hugging Face CLI session
        token = HfFolder.get_token()
    return token

# Attempt to retrieve the Hugging Face token
token = get_hf_token()

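# If token is not set, prompt the user to enter it.
# getpass does not echo the input, so the token is not saved in the notebook output.
if token is None:
    from getpass import getpass
    token = getpass("Please enter your Hugging Face token: ")

# Initialize Hugging Face API with the token
# (pass it by keyword: the first positional parameter of HfApi is `endpoint`)
hf_api = HfApi(token=token)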
```

### Uploading CodecFake Audio files to HuggingFace

In [5]:
# Look for a Hugging Face token in the HF_TOKEN environment variable first,
# then fall back to whatever HfFolder.get_token() returns.
token = os.getenv('HF_TOKEN') or HfFolder.get_token()

# No token found (or an empty one): ask for it interactively. getpass does not
# echo the input, so the token does not end up in the saved notebook output.
if not token:
    token = getpass("Please enter your Hugging Face token: ")

# Initialize Hugging Face API with the token. It must be passed by keyword:
# the first positional parameter of HfApi is `endpoint`, not `token`.
hf_api = HfApi(token=token)

In [6]:
def create_df(audio_dir):
    """Index every ``.flac`` file found under ``audio_dir``.

    Files whose parent directory name starts with ``SSB`` are left out; each
    such directory is reported once on stdout.

    Args:
        audio_dir: Root directory that is walked recursively.

    Returns:
        pandas.DataFrame with one row per kept file and the columns
        ``audio`` (path to the file), ``audio_id`` (name of the directory
        containing the file) and ``real_or_fake`` (the first three characters
        of the file name when it starts with ``F0``, otherwise ``'R'``).
        The columns are present even when no file matches.
    """
    columns = ["audio", "audio_id", "real_or_fake"]
    data = []
    skipped_ids = set()

    # os.walk yields one item per directory, so the number of top-level
    # entries is only an estimate for the progress-bar total.
    for root, _, files in tqdm(os.walk(audio_dir), total=len(os.listdir(audio_dir)), desc="Processing audio files"):
        for file in files:
            if not file.endswith('.flac'):
                continue

            file_path = os.path.join(root, file)
            audio_id = os.path.basename(os.path.dirname(file_path))

            if audio_id.startswith('SSB'):
                # Report each skipped directory once instead of once per file
                if audio_id not in skipped_ids:
                    skipped_ids.add(audio_id)
                    print(f'Skipping {audio_id}')
                continue

            real_or_fake = file[:3] if file.startswith('F0') else 'R'
            data.append({"audio": file_path, "audio_id": audio_id, "real_or_fake": real_or_fake})

    # Explicit columns keep the schema stable when `data` is empty, so later
    # lookups such as df['audio_id'] do not raise KeyError.
    return pd.DataFrame(data, columns=columns)

# Build the file index for the audio directory read from the config.
df = create_df(audio_dir)

Processing audio files:  18%|█▊        | 6300/35449 [00:02<00:12, 2263.32it/s]

Skipping SSB00050012
Skipping SSB00050012


Processing audio files:  27%|██▋       | 9737/35449 [00:04<00:14, 1775.18it/s]

Skipping SSB00050007


Processing audio files:  29%|██▊       | 10116/35449 [00:05<00:13, 1830.50it/s]

Skipping SSB00050009
Skipping SSB00050009


Processing audio files:  33%|███▎      | 11564/35449 [00:06<00:13, 1769.57it/s]

Skipping SSB00050008
Skipping SSB00050008


Processing audio files:  34%|███▍      | 12144/35449 [00:06<00:12, 1874.81it/s]

Skipping SSB00050006
Skipping SSB00050006


Processing audio files:  66%|██████▌   | 23241/35449 [00:12<00:06, 1897.90it/s]

Skipping SSB00050011
Skipping SSB00050011


Processing audio files:  70%|██████▉   | 24739/35449 [00:13<00:05, 1802.16it/s]

Skipping SSB00050010
Skipping SSB00050010


Processing audio files:  81%|████████  | 28547/35449 [00:15<00:03, 1906.34it/s]

Skipping SSB00050003
Skipping SSB00050003


Processing audio files:  87%|████████▋ | 30925/35449 [00:16<00:02, 1942.24it/s]

Skipping SSB00050002
Skipping SSB00050002
Skipping SSB00050002
Skipping SSB00050002
Skipping SSB00050002
Skipping SSB00050002


Processing audio files: 100%|██████████| 35449/35449 [00:19<00:00, 1842.38it/s]


In [7]:
# Count the distinct (non-null) audio IDs present in the index
unique_audio_ids = len(df['audio_id'].dropna().unique())
unique_audio_ids

35433

Since the dataset is large, we split it into more manageable chunks, each containing between 90 and 110 unique audio IDs.

To ensure that the dataset divides evenly, we search this range for the largest chunk size that divides the number of unique audio IDs exactly.

In [8]:
# Inclusive range of acceptable chunk sizes, counted in unique audio IDs
desired_chunk_size_min = 90
desired_chunk_size_max = 110

def find_best_chunk_size(total_ids, min_size, max_size):
    """Return the largest size in ``[min_size, max_size]`` that divides ``total_ids`` exactly.

    Args:
        total_ids: Number of items to be split into equally sized chunks.
        min_size: Smallest acceptable chunk size (inclusive).
        max_size: Largest acceptable chunk size (inclusive).

    Returns:
        The largest positive divisor of ``total_ids`` within the range.

    Raises:
        ValueError: If no positive size in the range divides ``total_ids``
            (this includes ranges that contain no positive size at all).
    """
    # Only positive sizes are meaningful: clamp the lower bound to 1 so that a
    # zero or negative candidate never reaches the modulo below.
    lowest = max(min_size, 1)
    for size in range(max_size, lowest - 1, -1):
        if total_ids % size == 0:
            return size
    raise ValueError("No suitable chunk size found within the given range.")

try:
    chunk_size = find_best_chunk_size(unique_audio_ids, desired_chunk_size_min, desired_chunk_size_max)
except ValueError as e:
    # Do not leave chunk_size undefined (or stale from an earlier run): fall
    # back to the largest allowed size and accept a smaller final chunk.
    chunk_size = desired_chunk_size_max
    print(f"{e} Falling back to chunk size {chunk_size}; the last chunk will be smaller.")

# Ceiling division so a partial final chunk is counted as well
num_chunks = (unique_audio_ids + chunk_size - 1) // chunk_size
print(f"Chunk Size: {chunk_size}, Number of Chunks: {num_chunks}")

Chunk Size: 93, Number of Chunks: 381


In [9]:
chunk_index      = 0
datasets         = []     # (partition name, Dataset) pairs appended by save_chunk
current_chunk    = []     # row records of the chunk currently being assembled
audio_ids_so_far = set()  # audio IDs already placed in the current chunk

# Group the DataFrame by audio_id. groupby does not modify df, so no copy is
# needed, and every audio_id is yielded exactly once.
grouped = df.groupby('audio_id')

def read_audio_file(path):
    """Read the file at ``path`` in binary mode and return ``{'bytes': <content>}``.

    Not called anywhere in this cell.
    """
    with open(path, 'rb') as f:
        return {'bytes': f.read()}

def save_chunk(chunk_data, chunk_index):
    """Turn a list of row records into a Dataset and append it to ``datasets``.

    Args:
        chunk_data: List of row dicts; must contain an ``audio`` column.
        chunk_index: Number used to build the partition name ``partition<chunk_index>``.
    """
    chunk_name = f"partition{chunk_index}"
    df_chunk = pd.DataFrame(chunk_data)
    ds_chunk = Dataset.from_pandas(df_chunk)
    ds_chunk = ds_chunk.cast_column("audio", Audio(decode=True))
    datasets.append((chunk_name, ds_chunk))

# Iterate over the grouped data and create chunks of chunk_size audio IDs each
for audio_id, group in tqdm(grouped, desc="Creating chunks"):
    # The current chunk is full: store it and start a new one
    if len(audio_ids_so_far) >= chunk_size:
        save_chunk(current_chunk, chunk_index)
        current_chunk = []
        audio_ids_so_far = set()
        chunk_index += 1

    current_chunk.extend(group.to_dict('records'))
    audio_ids_so_far.add(audio_id)

# Save the last chunk if it has remaining data
if current_chunk:
    save_chunk(current_chunk, chunk_index)

# Sanity check: every indexed row must have landed in exactly one partition
assert sum(ds.num_rows for _, ds in datasets) == len(df), "row count changed while chunking"

# Create a DatasetDict from the chunks
dataset_dict = DatasetDict({chunk_name: ds for chunk_name, ds in datasets})


Creating chunks: 100%|██████████| 35433/35433 [00:06<00:00, 5193.88it/s]


### Push it to HuggingFace Dataset Repository

In [22]:
repo_id = 'ajaykarthick/codecfake-audio'
# Pass the token explicitly: a token typed at the getpass prompt is held only
# in the `token` variable, so push_to_hub cannot discover it on its own.
dataset_dict.push_to_hub(repo_id, token=token)