# Hindi ASR Dataset Preprocessing
Simple step-by-step preprocessing

In [3]:
import pandas as pd
import json
import requests
import os
from tqdm import tqdm

In [4]:
# Load dataset
df = pd.read_csv('../dataset/FT Data - data.csv')
print(f"Total samples: {len(df)}")
print(f"Total duration: {df['duration'].sum() / 3600:.2f} hours")
df.head()

Total samples: 104
Total duration: 21.89 hours


Unnamed: 0,user_id,recording_id,language,duration,rec_url_gcp,transcription_url_gcp,metadata_url_gcp
0,245746,825780,hi,443,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
1,291038,825727,hi,443,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
2,246004,988596,hi,475,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
3,93626,990175,hi,475,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...
4,286851,526266,hi,522,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...,https://storage.googleapis.com/joshtalks-data-...


In [5]:
# Fix URLs
df['rec_url_gcp'] = df['rec_url_gcp'].str.replace('joshtalks-data-collection', 'upload_goai')
df['transcription_url_gcp'] = df['transcription_url_gcp'].str.replace('joshtalks-data-collection', 'upload_goai')
print("URLs fixed")
print("Sample URL:", df['transcription_url_gcp'].iloc[0])

URLs fixed
Sample URL: https://storage.googleapis.com/upload_goai/hq_data/hi/967179/825780_transcription.json


In [6]:
# Create directories
os.makedirs('audio', exist_ok=True)
os.makedirs('transcriptions', exist_ok=True)
print("Directories created")

Directories created


In [7]:
# Download and process
processed_data = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    recording_id = row['recording_id']
    audio_path = f"audio/{recording_id}.wav"
    trans_path = f"transcriptions/{recording_id}.json"
    
    # Download audio
    if not os.path.exists(audio_path):
        try:
            r = requests.get(row['rec_url_gcp'], timeout=30)
            with open(audio_path, 'wb') as f:
                f.write(r.content)
        except:
            continue
    
    # Download transcription
    if not os.path.exists(trans_path):
        try:
            r = requests.get(row['transcription_url_gcp'], timeout=30)
            with open(trans_path, 'wb') as f:
                f.write(r.content)
        except:
            continue
    
    # Process transcription
    try:
        with open(trans_path, 'r', encoding='utf-8') as f:
            trans_data = json.load(f)
        text = ' '.join([seg['text'] for seg in trans_data if 'text' in seg])
        
        processed_data.append({
            'audio': audio_path,
            'text': text.strip(),
            'recording_id': recording_id,
            'user_id': row['user_id'],
            'duration': row['duration']
        })
    except:
        continue

print(f"Processed {len(processed_data)} samples")

100%|██████████| 104/104 [01:24<00:00,  1.24it/s]

Processed 0 samples





In [10]:
!pip install transformers





In [12]:
import pandas as pd
import json
import os
from datasets import Dataset, Audio, DatasetDict, load_dataset
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import torch
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import numpy as np




In [13]:
df = pd.read_csv('../dataset/FT Data - data.csv')
print(f"Total samples in CSV: {len(df)}")

# Process transcriptions and create dataset
data_list = []

for idx, row in df.iterrows():
    recording_id = row['recording_id']
    audio_path = f"audio/{recording_id}.wav"
    trans_path = f"transcriptions/{recording_id}.json"
    
    if not os.path.exists(audio_path) or not os.path.exists(trans_path):
        continue
    try:
        with open(trans_path, 'r', encoding='utf-8') as f:
            trans_data = json.load(f)
        
        # Combine all text segments
        text = ' '.join([seg['text'] for seg in trans_data if 'text' in seg])
        
        if text.strip():
            data_list.append({
                'audio': audio_path,
                'text': text.strip(),
                'user_id': row['user_id']
            })
    except Exception as e:
        print(f"Error processing {recording_id}: {e}")
        continue

print(f"Valid samples: {len(data_list)}")


Total samples in CSV: 104
Error processing 825780: Expecting value: line 1 column 1 (char 0)
Error processing 825727: Expecting value: line 1 column 1 (char 0)
Error processing 988596: Expecting value: line 1 column 1 (char 0)
Error processing 990175: Expecting value: line 1 column 1 (char 0)
Error processing 526266: Expecting value: line 1 column 1 (char 0)
Error processing 520199: Expecting value: line 1 column 1 (char 0)
Error processing 542785: Expecting value: line 1 column 1 (char 0)
Error processing 494019: Expecting value: line 1 column 1 (char 0)
Error processing 523045: Expecting value: line 1 column 1 (char 0)
Error processing 522951: Expecting value: line 1 column 1 (char 0)
Error processing 254219: Expecting value: line 1 column 1 (char 0)
Error processing 253253: Expecting value: line 1 column 1 (char 0)
Error processing 351501: Expecting value: line 1 column 1 (char 0)
Error processing 350606: Expecting value: line 1 column 1 (char 0)
Error processing 629904: Expecting v

In [2]:
audio = []
transcription = []
import os
for fname in sorted(os.listdir('audio')):
    if not fname.lower().endswith(('.wav', '.mp3', '.flac')):
        continue
    rid = os.path.splitext(fname)[0]
    a_path = f'audio/{fname}'
    t_path = f'transcriptions/{rid}.json'
    if os.path.exists(t_path):
        audio.append(a_path)
        transcription.append(t_path)

audio, transcription

(['audio/1020918.wav',
  'audio/1021370.wav',
  'audio/238079.wav',
  'audio/238123.wav',
  'audio/239492.wav',
  'audio/240907.wav',
  'audio/240909.wav',
  'audio/240994.wav',
  'audio/241695.wav',
  'audio/243702.wav',
  'audio/253253.wav',
  'audio/254219.wav',
  'audio/255349.wav',
  'audio/255381.wav',
  'audio/269383.wav',
  'audio/269794.wav',
  'audio/269907.wav',
  'audio/270037.wav',
  'audio/270150.wav',
  'audio/270153.wav',
  'audio/270291.wav',
  'audio/270296.wav',
  'audio/272241.wav',
  'audio/282447.wav',
  'audio/301057.wav',
  'audio/301080.wav',
  'audio/302503.wav',
  'audio/302506.wav',
  'audio/305308.wav',
  'audio/305347.wav',
  'audio/319105.wav',
  'audio/319126.wav',
  'audio/319431.wav',
  'audio/330457.wav',
  'audio/350297.wav',
  'audio/350347.wav',
  'audio/350606.wav',
  'audio/351501.wav',
  'audio/365033.wav',
  'audio/365059.wav',
  'audio/366972.wav',
  'audio/367249.wav',
  'audio/400490.wav',
  'audio/400503.wav',
  'audio/443952.wav',
  'audio

In [3]:
audio[0], transcription[0]

('audio/1020918.wav', 'transcriptions/1020918.json')

In [5]:
# conert into the df formate
import pandas as pd
df = pd.DataFrame({'audio': audio, 'transcription': transcription})
df.head()

Unnamed: 0,audio,transcription
0,audio/1020918.wav,transcriptions/1020918.json
1,audio/1021370.wav,transcriptions/1021370.json
2,audio/238079.wav,transcriptions/238079.json
3,audio/238123.wav,transcriptions/238123.json
4,audio/239492.wav,transcriptions/239492.json


In [20]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['user_id']
)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.