# Preprocessing
This notebook includes several steps to create a Hugging Face Dataset which then later can be used to finetune our model. The used dataset can be found under the following link: https://mtc.ethz.ch/publications/open-source/swiss-dial.html

## Generate a csv

In [None]:
import os
import json
import pandas as pd

base_path = "../dataset" 
json_file = os.path.join(base_path, "sentences_ch_de_transcribed.json")

with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

dataset = []

dialects = ["ag", "be", "bs", "gr", "lu", "sg", "vs", "zh"]
for item in data:
    german_translation = item["de"]

    for dialect in dialects:
        audio_file = os.path.join(base_path, dialect, f"ch_{dialect}_{str(item['id']).zfill(4)}.wav")
        
        if os.path.exists(audio_file):
            dataset.append({
                "ID": item["id"],
                "Audio File": audio_file,
                "Dialect": dialect,
                "Transcription": item[f"ch_{dialect}"],
                "German Translation": german_translation
            })

df = pd.DataFrame(dataset)

output_csv = os.path.join(base_path, "swiss_german_dataset.csv")
df.to_csv(output_csv, index=False, encoding="utf-8")

print(f"Dataset saved to {output_csv}")

Dataset saved to ../dataset/swiss_german_dataset.csv


## Load the Audio Files and generate a Hugging Face Dataset

In [1]:
from datasets import load_dataset

# Load the CSV into a Hugging Face dataset
dataset = load_dataset("csv", data_files={"train": "../dataset/swiss_german_dataset.csv"})


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torchaudio

def load_audio(batch):
    # Load the audio file from the given path
    audio, sampling_rate = torchaudio.load(batch["Audio File"])
    batch["audio"] = {"array": audio.squeeze(0).numpy(), "sampling_rate": sampling_rate}
    return batch

# Apply the function to the dataset
dataset = dataset.map(load_audio)


Map: 100%|██████████| 30921/30921 [02:42<00:00, 190.55 examples/s]


In [None]:
from datasets import DatasetDict
from huggingface_hub import HfApi
from dotenv import load_dotenv

load_dotenv()

hf_token = os.getenv("HF_TOKEN")
username = os.getenv("HF_USERNAME") 
dataset_name = "swiss_german_dialects_raw"
organization_name = "notebotIE"

assert hf_token is not None, "Make sure to set your HF_TOKEN in the .env file"
assert username is not None, "Make sure to set your username in the .env file"

dataset.push_to_hub(
    f"{organization_name}/{dataset_name}",
    private=True,
    token=hf_token
)

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:08<00:00,  4.45s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:08<00:00,  4.30s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:08<00:00,  4.37s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:09<00:00,  4.83s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:08<00:00,  4.40s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:07<00:00,  3.89s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:08<00:00,  4.38s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:08<00:00,  4.35s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:08<00:00,  4.27s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:10<00:00,  5.11s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:05<00:00,  2.73s/ba]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:09<00:00,  4.74s/ba]
Creating parquet

CommitInfo(commit_url='https://huggingface.co/datasets/notebotIE/swiss_german_dialects_raw/commit/993c99bf66384663a0542185d4bf86e3c5566f09', commit_message='Upload dataset', commit_description='', oid='993c99bf66384663a0542185d4bf86e3c5566f09', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/notebotIE/swiss_german_dialects_raw', endpoint='https://huggingface.co', repo_type='dataset', repo_id='notebotIE/swiss_german_dialects_raw'), pr_revision=None, pr_num=None)

In [6]:
dataset = load_dataset("notebotIE/swiss_german_dialects_raw")

Downloading data: 100%|██████████| 23/23 [04:06<00:00, 10.70s/files]
Generating train split: 100%|██████████| 30921/30921 [01:09<00:00, 444.26 examples/s]


In [7]:
dataset["train"].features

{'ID': Value(dtype='int64', id=None),
 'Audio File': Value(dtype='string', id=None),
 'Dialect': Value(dtype='string', id=None),
 'Transcription': Value(dtype='string', id=None),
 'German Translation': Value(dtype='string', id=None),
 'audio': {'array': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None),
  'sampling_rate': Value(dtype='int64', id=None)}}

In [9]:
sample = dataset["train"][0].copy()
sample

{'ID': 0,
 'Audio File': '../dataset/ag/ch_ag_0000.wav',
 'Dialect': 'ag',
 'Transcription': 'Em Moment escher en "OParasite", de Siegerfelm fo Cannes z\'gseh.',
 'German Translation': 'Derzeit ist er in « Parasite », dem Siegerfilm von Cannes, zu sehen.',
 'audio': {'array': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.

In [10]:
from IPython.display import Audio as IPyAudio


IPyAudio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])