<div style="text-align: center; color: #FFFFFF; font-family: Regular 400; background-color: #8B4000; padding: 10px; border-radius: 5px;">
 Download existing community ASR data (all sources except Kallama) — kudos to GalsenAI
</div>

In [None]:
import os
import csv
import json
import soundfile as sf
from datasets import load_dataset
from tqdm import tqdm

# ------------------------------ #
#     SETTINGS                   #
# ------------------------------ #

# Hub dataset to export and the local folder layout it is written to.
dataset_name = "galsenai/wolof-audio-data"
output_dir = "audio_galsenai"
train_dir, dev_dir = (os.path.join(output_dir, split) for split in ("train", "dev"))

# Values of the 'source' column whose rows are dropped by the filtering step.
EXCLUDED_SOURCES = ['kallama']

# Make sure both split folders exist before anything is written into them.
for split_dir in (train_dir, dev_dir):
    os.makedirs(split_dir, exist_ok=True)

print(" Created output directories:")
for split_dir in (train_dir, dev_dir):
    print(f"  - {split_dir}")

# ------------------------------ #
#     DATASET LOADING            #
# ------------------------------ #

print(f"\n Loading dataset '{dataset_name}'...")
dataset = load_dataset(dataset_name)
split_names = list(dataset.keys())
print(f" Loaded dataset with splits: {split_names}")


# ============================== #
#   FILTER OUT EXCLUDED SOURCES  #
# ============================== #

from collections import Counter

print(f"\n Filtering out sources: {EXCLUDED_SOURCES}")

# Row counts before filtering, kept for the report below.
train_before = len(dataset['train'])
test_before = len(dataset['test'])

# Filter both splits. `input_columns` passes only the 'source' value to the
# predicate, so the (large) audio column is not needed to evaluate it.
for split in ('train', 'test'):
    dataset[split] = dataset[split].filter(
        lambda source: source not in EXCLUDED_SOURCES,
        input_columns=['source'],
    )

train_after = len(dataset['train'])
test_after = len(dataset['test'])

print(f" Filtering complete:")
print(f"   Train: {train_before} → {train_after} ({train_before - train_after} removed)")
print(f"   Test:  {test_before} → {test_after} ({test_before - test_after} removed)")
print(f"   Total removed: {(train_before - train_after) + (test_before - test_after)} files")

# Show remaining sources. Reading the 'source' column directly avoids
# iterating whole examples, which would decode every audio file just to
# count a string field.
train_sources = Counter(dataset['train']['source'])
test_sources = Counter(dataset['test']['source'])

print(f"\n Remaining sources in train split:")
for source, count in train_sources.most_common():
    print(f"   {source}: {count} files")

print(f"\n Remaining sources in test split:")
for source, count in test_sources.most_common():
    print(f"   {source}: {count} files")

# ============================== #
#     SAVE FUNCTION              #
# ============================== #

def _pick_output_filename(audio, index):
    """Return the file name one example is saved under.

    Uses the base name of the audio's original path when one is present
    (forcing a '.wav' extension unless it is already .wav/.flac/.ogg);
    otherwise falls back to the index-based name 'audio_NNNNNN.wav'.
    """
    if 'path' in audio and audio['path']:
        base_name, ext = os.path.splitext(os.path.basename(audio['path']))
        if ext.lower() not in ('.wav', '.flac', '.ogg'):
            ext = '.wav'
        return f"{base_name}{ext}"
    return f"audio_{index:06d}.wav"


def _write_metadata_files(records, csv_path, jsonl_path):
    """Write the metadata records to a CSV file and to a JSON-lines file."""
    # Records may carry different keys (extra dataset fields, 'error' on
    # fallback saves), so the header is the ordered union of all keys rather
    # than the keys of the first record only.
    fieldnames = list(dict.fromkeys(key for record in records for key in record))
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(records)

    with open(jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for record in records:
            jsonl_file.write(json.dumps(record, ensure_ascii=False) + "\n")


def save_split_to_folder(split_name, output_folder):
    """
    Saves audio from a dataset split to a local folder AND creates metadata files.

    Creates:
    - Audio files (.wav, or the original .flac/.ogg extension when present)
    - metadata.csv (filename, transcription, duration, etc.)
    - metadata.jsonl (one JSON object per line)

    Args:
        split_name: Key of the split to read from the module-level `dataset`.
        output_folder: Directory that receives the audio and metadata files.

    Returns:
        Tuple (saved_count, error_count, metadata_records), where
        metadata_records is the list of per-file metadata dicts.
    """
    print(f"\n Processing '{split_name}' split...")
    split_data = dataset[split_name]

    saved_count = 0
    error_count = 0
    metadata_records = []

    # Metadata file locations
    csv_path = os.path.join(output_folder, "metadata.csv")
    jsonl_path = os.path.join(output_folder, "metadata.jsonl")

    # Keys handled explicitly below; everything else is copied if it is a simple type.
    reserved_keys = {'audio', 'filename', 'sentence', 'transcription',
                     'duration', 'sampling_rate', 'source'}

    # Every file lands directly in output_folder, so create it once up front.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Use tqdm for progress bar
    for i, example in enumerate(tqdm(split_data, desc=f"Saving {split_name}")):
        # Step 1: read the example. A failure here means there is nothing to
        # save, so it must not reach the fallback below (which would otherwise
        # reuse values left over from the previous iteration).
        try:
            audio = example["audio"]
            audio_array = audio["array"]
            sampling_rate = audio["sampling_rate"]

            # Transcription field is called 'sentence' in this dataset;
            # normalise a missing/None value to '' so it can be sliced later.
            transcription = example.get('sentence') or ''
            source = example.get('source', '')
            duration = len(audio_array) / sampling_rate
        except Exception as e:
            error_count += 1
            print(f"\n Error reading example {i}: {e}")
            continue

        # Step 2: write the audio, retrying once under a safe index-based name.
        save_error = None
        try:
            filename = _pick_output_filename(audio, i)
            sf.write(os.path.join(output_folder, filename), audio_array, sampling_rate)
        except Exception as e:
            print(f"\n Error saving file {i}: {e}")
            save_error = str(e)
            filename = f"audio_{i:06d}.wav"
            try:
                sf.write(os.path.join(output_folder, filename), audio_array, sampling_rate)
                print(f"    Saved as fallback: {filename}")
            except Exception as fallback_error:
                error_count += 1
                print(f"    Fallback also failed: {fallback_error}")
                continue
        saved_count += 1

        # Step 3: record the metadata for the file that was actually written.
        metadata_record = {
            'filename': filename,
            'sentence': transcription,  # Using 'sentence' to match HuggingFace
            'transcription': transcription,  # Also keep as 'transcription' for clarity
            'source': source,
            'duration': round(duration, 3),
            'sampling_rate': sampling_rate,
        }

        # Add any other fields from the example (simple types only)
        for key, value in example.items():
            if key not in reserved_keys and isinstance(value, (str, int, float, bool)):
                metadata_record[key] = value

        if save_error is not None:
            metadata_record['error'] = save_error

        metadata_records.append(metadata_record)

    # ============================== #
    #     SAVE METADATA FILES        #
    # ============================== #

    if metadata_records:
        print(f"\n Saving metadata to CSV...")
        _write_metadata_files(metadata_records, csv_path, jsonl_path)
        print(f"    Saved: {csv_path}")
        print(f"    Saved: {jsonl_path}")

    # ============================== #
    #     SUMMARY STATISTICS         #
    # ============================== #

    print(f"\n Summary for '{split_name}':")
    print(f"   Successfully saved: {saved_count} files")
    print(f"    Errors: {error_count} files")
    print(f"    Audio location: {output_folder}")
    print(f"    Metadata CSV: {csv_path}")
    print(f"    Metadata JSONL: {jsonl_path}")

    if metadata_records:
        # Calculate statistics
        total_duration = sum(r['duration'] for r in metadata_records)
        avg_duration = total_duration / len(metadata_records)

        print(f"\n Statistics:")
        print(f"   Total duration: {total_duration/3600:.2f} hours")
        print(f"   Average duration: {avg_duration:.2f} seconds")
        print(f"   Total files: {len(metadata_records)}")

        # Show sample transcriptions (truncated to 100 characters)
        print(f"\n Sample transcriptions:")
        for i, record in enumerate(metadata_records[:5]):
            print(f"   {i+1}. [{record['source']}] {record['filename']}")
            trans = record['sentence'][:100]
            print(f"      \"{trans}{'...' if len(record['sentence']) > 100 else ''}\"")

        # Count by source
        from collections import Counter
        source_counts = Counter(r['source'] for r in metadata_records)
        print(f"\n Distribution by source:")
        for source, count in source_counts.most_common():
            print(f"   {source}: {count} files")

    return saved_count, error_count, metadata_records


# ============================== #
#     PROCESS ALL SPLITS         #
# ============================== #

# Export both splits; the dataset's 'test' split is written to the 'dev' folder.
separator_line = "\n" + "="*60

print(separator_line)
train_saved, train_errors, train_metadata = save_split_to_folder("train", train_dir)

print(separator_line)
dev_saved, dev_errors, dev_metadata = save_split_to_folder("test", dev_dir)


In [None]:
# Sanity check: read the exported train metadata back and preview the first rows.
import pandas as pd

metadata_csv = 'audio_galsenai/train/metadata.csv'
train_df = pd.read_csv(metadata_csv)
print(train_df.head())

<div style="text-align: center; color: #FFFFFF; background-color: #8B4000; padding: 10px; border-radius: 5px;">
  Download the Kallama audio files and the annotations reviewed by the CLAD team from the GCP bucket
</div>

In [None]:
!pip install google-cloud-storage tqdm

In [None]:
import os

# Point the Google Cloud client libraries at the service-account key file.
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS="powerful-bounty-463513-j4-e2ab92933732.json"
)

In [None]:
### Download audio files from the GCS bucket

In [None]:
import os
import logging
from concurrent.futures import ThreadPoolExecutor
from google.cloud import storage
from tqdm import tqdm

# ------------------------------ #
#     DOWNLOADER SETTINGS        #
# ------------------------------ #

# Bucket to read from and local root folder the objects are written under.
BUCKET_NAME = 'unchecked-audio-data'
LOCAL_BASE_DIR = 'downloaded_wolof_audio'

# Object-name prefixes to download: wolof-batch2/ through wolof-batch8/.
TARGET_PREFIXES = list(map('wolof-batch{}/'.format, range(2, 9)))

# Alternative: download the entire wolof-audio folder instead.
# TARGET_PREFIXES = ['wolof-audio/']

# Size of the thread pool used for parallel downloads.
MAX_WORKERS = 16

# ------------------------------ #
#     ERROR LOG                  #
# ------------------------------ #

# Only errors are recorded, timestamped, in download_log.log.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='download_log.log',
)

# ============================== #
#     DOWNLOAD FUNCTION          #
# ============================== #

def download_blob(blob, local_folder):
    """
    Downloads a single blob to the local folder structure.

    Args:
        blob: Google Cloud Storage blob object (uses .name, .size and
            .download_to_filename)
        local_folder: Local base directory for downloads

    Returns:
        True if downloaded successfully, False if failed, None if skipped
        (folder placeholder, or a local file of the same size already exists)
    """
    try:
        # Names ending in '/' are folder placeholders, not files; trying to
        # download them would fail and be counted as an error.
        if blob.name.endswith('/'):
            return None

        # Create local path (preserves GCS folder structure)
        local_path = os.path.join(local_folder, blob.name)

        # Create parent directories if they don't exist; the parent is empty
        # when the file goes straight into the current directory.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Skip if file already exists and has correct size (a size mismatch,
        # e.g. from an interrupted run, triggers a fresh download).
        if os.path.exists(local_path) and os.path.getsize(local_path) == blob.size:
            return None  # Already downloaded

        # Download the file
        blob.download_to_filename(local_path)
        return True

    except Exception as e:
        # Best effort: record the failure and let the other downloads continue.
        logging.error(f"Failed to download {blob.name}: {e}")
        return False

# ============================== #
#     MAIN FUNCTION              #
# ============================== #

def main():
    """
    Main function to download files from Google Cloud Storage.

    Lists every blob under each prefix in TARGET_PREFIXES, reports the file
    count and total size, downloads the blobs to LOCAL_BASE_DIR with
    MAX_WORKERS parallel threads via download_blob, and prints how many
    files were downloaded, skipped and failed.
    """
    print("="*60)
    print("Google Cloud Storage Downloader")
    print("="*60)

    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket(BUCKET_NAME)
        print(f" Connected to bucket: {BUCKET_NAME}")
    except Exception as e:
        print(f" Failed to connect to bucket: {e}")
        print("\nMake sure you have:")
        print("1. Installed google-cloud-storage: pip install google-cloud-storage")
        print("2. Set GOOGLE_APPLICATION_CREDENTIALS environment variable")
        print("3. Correct bucket name in BUCKET_NAME variable")
        return

    print(f" Target prefixes: {TARGET_PREFIXES}")
    print(f" Local directory: {os.path.abspath(LOCAL_BASE_DIR)}")
    print(f" Max parallel threads: {MAX_WORKERS}")

    # ============================== #
    #     LIST ALL FILES             #
    # ============================== #

    print("\n Listing files from bucket... (this might take a moment)")
    blobs_to_download = []

    for prefix in TARGET_PREFIXES:
        # A failure on one prefix is logged and reported; the others still run.
        try:
            blobs = list(bucket.list_blobs(prefix=prefix))
            blobs_to_download.extend(blobs)
            print(f"   Found {len(blobs)} files in '{prefix}'")
        except Exception as e:
            logging.error(f"Failed to list blobs for prefix '{prefix}': {e}")
            print(f" Error listing files for '{prefix}': {e}")

    total_files = len(blobs_to_download)

    if total_files == 0:
        print("\n  No files found!")
        print("\nPossible issues:")
        print("1. Check if TARGET_PREFIXES are correct")
        print("2. Verify bucket name is correct")
        print("3. Ensure you have read permissions")
        print("\nTip: List bucket contents with:")
        # Use the configured bucket so the tip stays right if BUCKET_NAME changes.
        print(f"   gsutil ls gs://{BUCKET_NAME}/")
        return

    print(f"\n Found {total_files} total files")

    # Calculate total size (a blob without a known size counts as 0 bytes)
    total_size = sum(blob.size or 0 for blob in blobs_to_download)
    total_size_gb = total_size / (1024**3)
    print(f" Total size: {total_size_gb:.2f} GB")

    # ============================== #
    #     DOWNLOAD FILES             #
    # ============================== #

    print(f"\n Starting download with {MAX_WORKERS} parallel threads...")
    print("="*60)

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Use tqdm for progress bar; list() drains the iterator so every
        # download has finished before the pool is closed.
        results = list(tqdm(
            executor.map(lambda blob: download_blob(blob, LOCAL_BASE_DIR), blobs_to_download),
            total=total_files,
            unit="file",
            desc="Downloading"
        ))

    # download_blob returns True (downloaded), None (skipped) or False (failed).
    downloaded = sum(1 for result in results if result is True)
    skipped = sum(1 for result in results if result is None)
    failed = sum(1 for result in results if result is False)

    # ============================== #
    #     SUMMARY                    #
    # ============================== #

    print("\n" + "="*60)
    print(" Download summary:")
    print(f"   Downloaded: {downloaded} files")
    print(f"   Skipped:    {skipped} files")
    print(f"   Failed:     {failed} files")
    if failed:
        print("   Failures are recorded by the logging setup (error log file).")

if __name__ == "__main__":
    main()


<div style="text-align: center; color: #FFFFFF; background-color: #8B4000; padding: 10px; border-radius: 5px;">
  Checked transcription metadata
</div>

In [None]:
import json
import csv
import logging
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# --- CONFIGURATION ---
# Bucket holding the reviewed annotations, the CSV to produce, and the
# number of threads used to read blobs in parallel.
BUCKET_NAME = 'checked-audio-data'
OUTPUT_CSV = 'dataset_annotations.csv'
MAX_WORKERS = 16


# Object-name prefixes to scan (an empty list means "scan the whole bucket").
TARGET_PREFIXES = [
    'wolof-audio/',
    'wolof-batch2/', 'wolof-batch3/', 'wolof-batch4/',
    'wolof-batch5/', 'wolof-batch6/', 'wolof-batch7/', 'wolof-batch8/',

]

# --- LOGGING ---
# force=True replaces any handlers already installed on the root logger.
# Without it, basicConfig is a silent no-op once logging has been configured
# (e.g. by an earlier cell in the same kernel) and errors would go to that
# earlier log file instead of extraction_errors.log.
logging.basicConfig(filename='extraction_errors.log', level=logging.ERROR, force=True)

def extract_info_from_blob(blob):
    """
    Try to read a blob as a JSON annotation, whatever its file extension.

    Args:
        blob: Storage blob object (uses .name, .bucket.name and
            .download_as_text()).

    Returns:
        {'path': <audio URL>, 'text': <transcription>} for an accepted
        annotation that has a transcription, otherwise None.
    """
    try:
        # Folder placeholders are ignored.
        if blob.name.endswith('/'):
            return None

        # Download the content as text and parse it. Anything that is not
        # JSON (a log file, or content that cannot be decoded as text) is
        # skipped silently rather than logged as an error.
        try:
            data = json.loads(blob.download_as_text())
        except (UnicodeDecodeError, json.JSONDecodeError):
            return None

        # Valid JSON that is not an object (list, number, ...) is not an annotation.
        if not isinstance(data, dict):
            return None

        # --- 1. QUALITY FILTER ---
        # Keep only accepted/validated annotations.
        if data.get('last_action') != 'accepted':
            return None

        # --- 2. AUDIO LOCATION ---
        # The payload lives under task.data or, failing that, top-level data.
        # 'task' may also be a bare value (e.g. an id), hence the type check.
        task = data.get('task')
        if isinstance(task, dict) and 'data' in task:
            payload = task['data']
        else:
            payload = data.get('data')

        audio_url = None
        if isinstance(payload, dict):
            audio_url = payload.get('audio') or payload.get('url')

        # Fall back to the blob's own location when the URL is missing.
        if not audio_url:
            audio_url = f"gs://{blob.bucket.name}/{blob.name}"

        # --- 3. TRANSCRIPTION ---
        results = data.get('result') or []

        # Nested format: take the results of the last annotation.
        if not results:
            annotations = data.get('annotations') or []
            if annotations:
                results = annotations[-1].get('result') or []

        transcription = None
        for res in results:
            value = res.get('value') if isinstance(res, dict) else None
            if isinstance(value, dict) and 'text' in value:
                text = value['text']
                # 'text' is normally a list of strings; take its first entry.
                # A plain string is kept whole (indexing it would return only
                # its first character) and an empty list yields no text.
                if isinstance(text, (list, tuple)):
                    text = text[0] if text else None
                transcription = text
                break

        if transcription and audio_url:
            return {'path': audio_url, 'text': transcription}

        return None

    except Exception as e:
        # Log the error without stopping the script.
        logging.error(f"Erreur sur {blob.name}: {str(e)}")
        return None

def main():
    """
    Build OUTPUT_CSV from the accepted annotations stored in the bucket.

    Lists the blobs under every prefix in TARGET_PREFIXES (or the whole
    bucket when the list is empty), parses them in parallel with
    extract_info_from_blob, and writes the resulting 'path'/'text' rows to
    OUTPUT_CSV.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(BUCKET_NAME)

    print(f"📡 Connexion au bucket : {BUCKET_NAME}")

    unique_blobs = []
    seen_blob_names = set()

    print("Listing des fichiers (sans filtre d'extension)...")

    # An empty TARGET_PREFIXES means the whole bucket is scanned.
    prefixes_to_scan = TARGET_PREFIXES if TARGET_PREFIXES else [None]

    for prefix in prefixes_to_scan:
        blobs = bucket.list_blobs(prefix=prefix)
        for b in blobs:
            # Keep each blob once, even if several prefixes match it.
            if b.name not in seen_blob_names:
                unique_blobs.append(b)
                seen_blob_names.add(b.name)

    print(f" {len(unique_blobs)} fichiers potentiels trouvés. Analyse en cours...")

    valid_rows = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_blob = {executor.submit(extract_info_from_blob, blob): blob for blob in unique_blobs}

        for future in tqdm(as_completed(future_to_blob), total=len(unique_blobs)):
            result = future.result()
            if result:
                valid_rows.append(result)

    # Futures complete in arbitrary order; sort so the CSV is identical from
    # one run to the next.
    valid_rows.sort(key=lambda row: row['path'])

    print(f" Écriture du CSV : {OUTPUT_CSV}")
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['path', 'text'])
        writer.writeheader()
        writer.writerows(valid_rows)

    print(f" Terminé ! {len(valid_rows)} segments validés récupérés.")

if __name__ == "__main__":
    main()

 <div style="text-align: center; color: #FFFFFF; background-color: #8B4000; padding: 10px; border-radius: 5px;">
  Collect the audio files from the batch subfolders
</div>

In [None]:
import os
import shutil

# Root of the downloaded data, the batch folders inside it, and the flat
# folder that receives the .wav files.
BASE_DIR = "downloaded_wolof_audio"
SUBFOLDERS = ['wolof-audio']
SUBFOLDERS.extend(f'wolof-batch{batch}' for batch in range(2, 9))
DEST_DIR = 'data'

# Full path of every batch folder to scan.
SOURCE_FOLDERS = [os.path.join(BASE_DIR, name) for name in SUBFOLDERS]


print(SOURCE_FOLDERS)
def move_wav_files():
    """
    Move every .wav file found directly inside SOURCE_FOLDERS into DEST_DIR.

    Missing source folders are skipped. When a file name already exists in
    DEST_DIR, the incoming file is stored as '<name>_dup<N><ext>' using the
    first free N. Prints a summary of moved and failed files.
    """
    # Create the destination folder on first use.
    if not os.path.exists(DEST_DIR):
        os.makedirs(DEST_DIR)
        print(f" Created folder: {DEST_DIR}")

    moved = 0
    failed = 0

    print(" Starting move operation...")

    for source_dir in SOURCE_FOLDERS:
        if not os.path.exists(source_dir):
            print(f" Skipping missing folder: {source_dir}")
            continue

        for entry in os.listdir(source_dir):
            # Only .wav files are moved (extension compared case-insensitively).
            if not entry.lower().endswith('.wav'):
                continue

            origin = os.path.join(source_dir, entry)
            target = os.path.join(DEST_DIR, entry)

            if os.path.exists(target):
                # Name clash: probe _dup1, _dup2, ... until a free name is found.
                stem, suffix = os.path.splitext(entry)
                attempt = 1
                while True:
                    renamed = f"{stem}_dup{attempt}{suffix}"
                    target = os.path.join(DEST_DIR, renamed)
                    if not os.path.exists(target):
                        break
                    attempt += 1
                print(f"   Note: Renamed duplicate {entry} -> {renamed}")

            try:
                shutil.move(origin, target)
            except Exception as e:
                print(f" Error moving {entry}: {e}")
                failed += 1
            else:
                moved += 1

    print("-" * 30)
    print(f" Moved {moved} .wav files to '/{DEST_DIR}'")
    if failed > 0:
        print(f" {failed} files failed to move.")


if __name__ == "__main__":
    move_wav_files()

 <div style="text-align: center; color: #FFFFFF; background-color: #8B4000; padding: 10px; border-radius: 5px;">
  Prepare metadata
</div>

In [None]:
###  from clad
# Load the train/test metadata tables and tag every row with its origin.
train_clad = pd.read_csv("train/metadata.csv").assign(source="clad_review")
test_clad = pd.read_csv("test/metadata.csv").assign(source="clad_review")

In [None]:
# Keep only the bare file name (drop any directory part) in 'file_name'.
for clad_frame in (train_clad, test_clad):
    clad_frame['file_name'] = clad_frame['file_name'].map(os.path.basename)

In [None]:
###  from galsenai
# The file-name column is called 'filename' in a fresh export and 'file_name'
# once this notebook has rewritten the CSV. Selecting columns with a callable
# accepts either header, so the cell works on both and the rename below
# normalises the name (it is a no-op when the column is already 'file_name').
GALSEN_COLUMNS = {"filename", "file_name", "sentence", "source"}
train_galsenai = pd.read_csv(
    "train_galsen_metadata.csv",
    usecols=lambda column: column in GALSEN_COLUMNS,
)
test_galsenai = pd.read_csv(
    "test_galsen_metadata.csv",
    usecols=lambda column: column in GALSEN_COLUMNS,
)
train_galsenai = train_galsenai.rename(columns={'filename': 'file_name'})
test_galsenai = test_galsenai.rename(columns={'filename': 'file_name'})
train_galsenai.head(3)

In [None]:
## keep metadata
# Write both GalsenAI metadata tables back to disk without the index column.
for frame, csv_name in (
    (train_galsenai, "train_galsen_metadata.csv"),
    (test_galsenai, "test_galsen_metadata.csv"),
):
    frame.to_csv(csv_name, index=False)

In [None]:
###  From the Kallama linguists (not loaded yet; the reads below are commented out)
#train_kallama = pd.read_csv("audio_galsenai/train/metadata.csv")
#test_kallama = pd.read_csv("audio_galsenai/test/metadata.csv")


In [None]:
# Stack the CLAD and GalsenAI tables. ignore_index=True renumbers the rows
# 0..n-1; without it both inputs keep their own 0-based labels and the result
# has duplicate index values, which makes label-based selection ambiguous.
global_train = pd.concat([train_clad, train_galsenai], ignore_index=True)
global_test = pd.concat([test_clad, test_galsenai], ignore_index=True)

In [None]:
# Reload the train and dev metadata tables from the 'gobal_corpus' folder.
global_train, global_test = (
    pd.read_csv(csv_file)
    for csv_file in ("gobal_corpus/train/metadata.csv", "gobal_corpus/dev/metadata.csv")
)