In [None]:
!pip install pandas matplotlib datasets librosa soundfile evaluate datasets
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
CSV_PATH = "data/indian_accents.tsv"
OUTPUT_PATH = "data/train_clean.csv"
OUTPUT_UNZIP = "data/train_unzip.csv"

# Load the tab-separated metadata and keep only the columns used downstream.
df = pd.read_csv(CSV_PATH, sep="\t")
df = df[['path', 'sentence', 'age', 'gender', 'accents']]

# Keep rows that have a non-empty accent label and a known gender and age.
has_labels = (
    df['accents'].notna()
    & (df['accents'] != '')
    & df['gender'].notna()
    & df['age'].notna()
)

# Prepend 'dataset/' to the 'path' column.
# .loc + .assign builds a new frame instead of writing into a filtered slice,
# which avoids pandas' SettingWithCopyWarning and keeps the cell idempotent.
df_filtered = df.loc[has_labels].assign(
    path=lambda frame: "dataset/" + frame['path'].astype(str)
)
df_filtered.to_csv(OUTPUT_PATH, index=False)

# One archive member path per row, written without index or header.
# NOTE(review): these entries are built from the already-prefixed 'path', so they
# read '.../clips/dataset/<file>'. Confirm that this matches the archive layout;
# if the archive stores clips directly under 'clips/', build this from df['path'].
df_files = 'cv-corpus-21.0-2025-03-14/en/clips/' + df_filtered['path']
df_files.to_csv(OUTPUT_UNZIP, index=False, header=False)

In [None]:
# Chart the distributions of accents, gender, and age as pie charts
# Create a function to display the actual counts
def make_autopct(values):
    """Build a pie-chart label formatter that shows absolute counts.

    Args:
        values: Iterable of the numeric slice sizes passed to the pie chart.

    Returns:
        A function taking a slice's percentage and returning the matching
        count (percentage of the sum of ``values``, rounded) as a string.
    """
    def format_count(pct):
        # Convert the percentage back into a count of the overall total.
        count = int(round(pct * sum(values) / 100.0))
        return f"{count:d}"

    return format_count

# Maximum number of categories shown per pie chart.
top_values = 10

# Create a figure with one row of three subplots: accents, gender, age.
fig, axs = plt.subplots(1,3, figsize=(30,20))

# Category frequencies, most common first.
accents_value_counts = df_filtered['accents'].value_counts()
gender_value_counts = df_filtered['gender'].value_counts()
age_value_counts = df_filtered['age'].value_counts()

# (counts, title, start angle, legend location) for each panel, left to right.
pie_specs = [
    (accents_value_counts, "Distribution of Accents", 120, 'lower right'),
    (gender_value_counts, "Distribution of Gender", 30, 'lower center'),
    (age_value_counts, "Distribution of Age", 30, 'lower center'),
]

for ax, (counts, title, start_angle, legend_loc) in zip(axs, pie_specs):
    # .iloc makes the "first N rows" selection explicitly positional.
    top_counts = counts.iloc[:top_values]
    ax.pie(top_counts, autopct=make_autopct(top_counts), startangle=start_angle)
    ax.legend(top_counts.index, loc=legend_loc, bbox_to_anchor=(-0.1, 0))
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# NOTE: these rebind CSV_PATH / OUTPUT_PATH from the cleaning cell; from here on
# CSV_PATH is the cleaned CSV and OUTPUT_PATH is the saved-dataset directory.
CSV_PATH = "data/train_clean.csv"
OUTPUT_PATH = "data/encoding"

from datasets import load_dataset, load_from_disk, Audio

# Read the cleaned CSV as a Hugging Face dataset and treat the "path" column
# as audio at a 16 kHz sampling rate.
dataset = (
    load_dataset("csv", data_files=CSV_PATH, split="train")
    .cast_column("path", Audio(sampling_rate=16000))
)

def load_audio(dataset):
    """Return a new 'path' column holding each item's own 'path' entry.

    Args:
        dataset: Batch mapping whose 'path' value is a sequence of mappings,
            each having a 'path' key.

    Returns:
        A dict with a single 'path' key listing those entries in order.
    """
    paths = []
    for audio in dataset['path']:
        paths.append(audio['path'])
    return {"path": paths}

# Run load_audio over the dataset in batches, then write the result to OUTPUT_PATH.
# NOTE(review): load_audio only copies each item's 'path' entry back into the
# 'path' column; confirm this pass is actually needed before saving.
dataset = dataset.map(load_audio, batched=True)
dataset.save_to_disk(OUTPUT_PATH)

In [1]:
DATASET_PATH = "data/encoding"

from datasets import load_from_disk
from datasets import ClassLabel

# Reload the dataset from disk.
dataset = load_from_disk(DATASET_PATH)

# Encode the accent strings as integer class ids. The names are sorted so the
# id assignment does not depend on row order.
accent_names = sorted(dataset.unique("accents"))
accent_classes = ClassLabel(names=accent_names)
dataset = dataset.cast_column("accents", accent_classes)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
dataset_split

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['path', 'sentence', 'age', 'gender', 'accents'],
        num_rows: 239
    })
    test: Dataset({
        features: ['path', 'sentence', 'age', 'gender', 'accents'],
        num_rows: 60
    })
})

In [2]:
# Display the first training example (audio entry plus metadata and label id).
dataset_split["train"][0]

{'path': {'path': 'common_voice_en_19310868.mp3',
  'array': array([ 3.27418093e-10, -8.73114914e-11,  9.45874490e-11, ...,
         -1.25746010e-05,  7.54914945e-05,  8.74343095e-05], shape=(69120,)),
  'sampling_rate': 16000},
 'sentence': 'Tepoto Atoll is permanently uninhabited.',
 'age': 'twenties',
 'gender': 'male_masculine',
 'accents': 0}

In [3]:
import warnings

# Class names in id order, taken from the ClassLabel feature of the accents column.
labels = dataset_split["train"].features["accents"].names

# Two-way lookup between class name and integer id, passed to the model config.
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

# With fewer than two classes the classifier has nothing to discriminate:
# every prediction is trivially correct and the reported metrics are meaningless.
if len(labels) < 2:
    warnings.warn(
        f"Only {len(labels)} accent class(es) found; a classifier needs at least "
        "two classes for training and accuracy to be meaningful."
    )

print(label2id)
print(id2label)

{'india and south asia (india, pakistan, sri lanka)': 0}
{0: 'india and south asia (india, pakistan, sri lanka)'}


In [4]:
from transformers import AutoFeatureExtractor
# Load the feature extractor for the wav2vec2 checkpoint. The model cell loads
# the same checkpoint name; keep the two in sync if either is changed.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

In [5]:
def preprocess_function(examples):
    """Turn a batch of audio entries into model inputs.

    Args:
        examples: Batch mapping whose "path" value is a sequence of audio
            entries, each exposing its samples under "array".

    Returns:
        The output of ``feature_extractor`` for the batch, called with the
        extractor's own sampling rate and with inputs truncated to at most
        16000 samples.
    """
    waveforms = []
    for audio in examples["path"]:
        waveforms.append(audio["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

# Extract model inputs for every split in batches and drop the raw "path" column.
encoded_dataset = dataset_split.map(preprocess_function, remove_columns="path", batched=True)
# Rename the target column from "accents" to "label".
encoded_dataset = encoded_dataset.rename_column("accents", "label")
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'age', 'gender', 'label', 'input_values'],
        num_rows: 239
    })
    test: Dataset({
        features: ['sentence', 'age', 'gender', 'label', 'input_values'],
        num_rows: 60
    })
})

In [6]:
import evaluate
import numpy as np

# Accuracy metric object, loaded once and reused for every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids).

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions.
    """
    scores, reference_ids = eval_pred.predictions, eval_pred.label_ids
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=reference_ids)

In [7]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping built earlier.
num_labels = len(id2label)

# Load the pretrained wav2vec2 checkpoint with an audio-classification head
# configured for this notebook's labels.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base-960h",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# NOTE(review): output_dir mentions "mms-lid-256" while the loaded checkpoint is
# "facebook/wav2vec2-base-960h"; confirm the directory name is intentional.
training_args = TrainingArguments(
    output_dir="mms-lid-256-indian-accents",
    push_to_hub=False,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    # Batching: 32 examples per device per step, gradients accumulated over 4 steps.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch, then reload the most accurate checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=feature_extractor,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,,1.0
2,No log,,1.0
3,No log,,1.0
4,No log,,1.0
5,0.000000,,1.0
6,0.000000,,1.0
7,0.000000,,1.0
8,0.000000,,1.0
9,0.000000,,1.0
10,0.000000,,1.0


TrainOutput(global_step=20, training_loss=0.0, metrics={'train_runtime': 149.5289, 'train_samples_per_second': 15.984, 'train_steps_per_second': 0.134, 'total_flos': 2.169787304352e+16, 'train_loss': 0.0, 'epoch': 10.0})