# Importing libraries, loading and transforming data

In [1]:
!pip install librosa soundfile




In [2]:
!pip install evaluate transformers
!pip install -U datasets





In [3]:
from datasets import load_dataset
from collections import Counter
import itertools
import pandas as pd
import librosa
import numpy as np
import tqdm


In [4]:
from datasets import load_dataset
import itertools

# Clear local cache by resetting the cache dir OR force remote streaming
dataset = load_dataset(
    "mozilla-foundation/common_voice_16_0",
    "en",
    split="test",
    streaming=True,
)



In [None]:
# Filter: keep only samples where accent is a non-empty string
filtered_stream = (example for example in dataset if example["accent"].strip() != "")

# Count the number of valid examples
count = sum(1 for _ in itertools.islice(filtered_stream, None))  # No limit

print(f"Number of valid examples with non-empty accent: {count}")

Reading metadata...: 16390it [00:01, 12730.22it/s]


KeyboardInterrupt: 

In [5]:
# Filter: keep only samples where accent is a non-empty string
filtered_stream = (example for example in dataset if example["accent"].strip() != "")
accent_counter = Counter()

examples_buffer = []

for example in itertools.islice(filtered_stream, None):
    accent = example["accent"].strip()
    accent_counter[accent] += 1
    examples_buffer.append(example)

# Step 3: Select top 6 accents
top_accents = set([accent for accent, _ in accent_counter.most_common(6)])
print("Top accents:", top_accents)

Reading metadata...: 16390it [00:01, 10814.70it/s]


Top accents: {'United States English', 'India and South Asia (India, Pakistan, Sri Lanka)', 'Southern African (South Africa, Zimbabwe, Namibia)', 'Canadian English', 'England English', 'Australian English'}


In [8]:
import gc
gc.collect()

2336

In [None]:
top_accent_samples = [ex for ex in examples_buffer if ex["accent"].strip() in top_accents]

top_accent_samples

In [None]:
example = top_accent_samples[0]  # You can pick any index here
# Inspect array and rate
raw_array = example["audio"]["array"]
raw_sr = example["audio"]["sampling_rate"]
print("Original sample rate:", raw_sr)
print("Array shape:", raw_array.shape)

# Listen to raw Hugging Face audio
display(Audio(raw_array, rate=raw_sr))


In [9]:
import torchaudio
import torch
import torchaudio.transforms as T
import pandas as pd
import numpy as np
import tqdm

# Parameters
RATE_HZ = 16000
MAX_LENGTH = 80000  # ~5 seconds

data = []

for idx, example in enumerate(tqdm.tqdm(examples_buffer)):
    try:
        # Raw audio and metadata
        raw_array = example["audio"]["array"]
        raw_sr = example["audio"]["sampling_rate"]
        label = example["accent"].strip()

        # Convert raw_array to torch.Tensor and add channel dim
        waveform = torchaudio.functional.resample(
            torch.tensor(raw_array).unsqueeze(0),
            orig_freq=raw_sr,
            new_freq=RATE_HZ
        )

        # Trim and flatten to 1D NumPy array
        waveform = waveform.squeeze(0).numpy()
        waveform = waveform[:MAX_LENGTH]

        data.append({
            "label": label,
            "audio": waveform
        })

    except Exception as e:
        print(f"Failed on index {idx}: {e}")

# Create DataFrame
df = pd.DataFrame(data)

print(f"DataFrame created with shape: {df.shape}")
df.head()


100%|██████████| 2197/2197 [00:40<00:00, 53.90it/s]


DataFrame created with shape: (2197, 2)


Unnamed: 0,label,audio
0,England English,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,United States English,"[-1.9372989087549362e-13, -3.987597456528895e-..."
2,Midatlantic,"[1.540944636640868e-12, -2.5851442987768314e-1..."
3,England English,"[2.7428736767212937e-14, 1.717624576596339e-14..."
4,"England English,Esturine, from the region arou...","[7.748188492444908e-11, 2.7364676404044055e-10..."


In [10]:
accent_mapping = {
    "England English": "British",
    "United States English": "American",
    "India and South Asia (India, Pakistan, Sri Lanka)": "Indian",
    "Australian English": "Australian",
    "Southern African (South Africa, Zimbabwe, Namibia)": "South_African",
    "Canadian English": "Canadian"
}

df["label"] = df["label"].replace(accent_mapping)
df["label"].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
American,928
Indian,444
British,226
Canadian,76
Australian,57
...,...
"English Cumbrian,English Northern",1
indian,1
"Dutch English,United States English",1
"hey you are you ok ,hello my friend,i love you,yes i am,ho are you",1


In [11]:
# Your keyword list
keywords = ['United', 'India', 'Pakistan', 'England', 'Africa', 'States','American', 'Canada', 'Canadian', 'African','Australian','British','Indian','South_African']

# Helper function to assign label or 'Other'
def relabel_accent(accent: str) -> str:
    if any(keyword in accent for keyword in keywords):
        return accent
    else:
        return "Other"


# Relabel accents
df["label"] = df["label"].apply(relabel_accent)

# Check value counts
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
American,928
Indian,444
Other,288
British,226
Canadian,76
...,...
"London, England",1
"Filipino,Canadian English",1
"England English,I think I speak clearly with an accent that is easy to understand.",1
"United States English,Norwegian",1


In [13]:
df.shape

(1977, 2)

In [12]:
allowed_labels = [
    "British",
    "American",
    "Indian",
    "Australian",
    "South_African",
    "Other"
]
df = df[df["label"].isin(allowed_labels)].reset_index(drop=True)
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
American,928
Indian,444
Other,288
British,226
Australian,57
South_African,34


In [14]:
from sklearn.utils import resample

# Target sample size
TARGET_SIZE = 200

# Balanced dataset placeholder
balanced_df = pd.DataFrame()

# Group by label and apply sampling
for label, group in df.groupby("label"):
    if len(group) > TARGET_SIZE:
        sampled = group.sample(n=TARGET_SIZE, random_state=42)  # Undersample
    else:
        sampled = resample(group,
                           replace=True,
                           n_samples=TARGET_SIZE,
                           random_state=42)  # Oversample
    balanced_df = pd.concat([balanced_df, sampled], axis=0)

# Shuffle the final DataFrame
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check results
print(balanced_df["label"].value_counts())
print(f"Final balanced shape: {balanced_df.shape}")

label
South_African    200
Other            200
American         200
British          200
Australian       200
Indian           200
Name: count, dtype: int64
Final balanced shape: (1200, 2)


In [20]:
balanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1200 non-null   object
 1   audio   1200 non-null   object
dtypes: object(2)
memory usage: 18.9+ KB


In [15]:
# Keep only balanceDF
for name in dir():
    if name != "balanced_df" and not name.startswith("_"):
        del globals()[name]

# Force garbage collection
import gc
gc.collect()

5

In [16]:
# Save df to use again later
balanced_df.to_parquet("balanceDF.parquet")

In [4]:
import pandas as pd
balanced_df = pd.read_parquet("balanceDF.parquet")

In [5]:
balanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1200 non-null   object
 1   audio   1200 non-null   object
dtypes: object(2)
memory usage: 18.9+ KB


In [32]:
labels = [
    "British",
    "American",
    "Indian",
    "Australian",
    "South_African",
    "Other"
]
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

print(id2label, '\n\n', label2id)

{0: 'British', 1: 'American', 2: 'Indian', 3: 'Australian', 4: 'South_African', 5: 'Other'} 

 {'British': 0, 'American': 1, 'Indian': 2, 'Australian': 3, 'South_African': 4, 'Other': 5}


In [7]:
from datasets import Dataset, ClassLabel
dd = Dataset.from_pandas(balanced_df)

In [33]:
def encode_labels(example):
    example["label"] = label2id[example["label"]]
    return example

dd["train"] = dd["train"].map(encode_labels)
dd["test"] = dd["test"].map(encode_labels)


Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [34]:
from collections import Counter
Counter(dd['label']).items()

KeyError: 'label'

In [9]:
dd = dd.train_test_split(test_size=0.1)
dd

DatasetDict({
    train: Dataset({
        features: ['label', 'audio'],
        num_rows: 1080
    })
    test: Dataset({
        features: ['label', 'audio'],
        num_rows: 120
    })
})

# Load facebook/wav2vec2-base model

In [10]:
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

model_str = "facebook/wav2vec2-base"
feature_extractor=AutoFeatureExtractor.from_pretrained(model_str)
model=AutoModelForAudioClassification.from_pretrained(model_str,num_labels=len(labels))
model.config.id2label = id2label
# number of trainable parameters
print(model.num_parameters(only_trainable=True)/1e6)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


94.570118


In [11]:
dd["test"]

Dataset({
    features: ['label', 'audio'],
    num_rows: 120
})

In [12]:
RATE_HZ=16000
MAX_LENGTH=80000

In [13]:
def preprocess_function(batch):
    inputs = feature_extractor(batch['audio'], sampling_rate=RATE_HZ, max_length=MAX_LENGTH, truncation=True)
    inputs['input_values'] = inputs['input_values'][0]
    return inputs

dd['test'] = dd['test'].map(preprocess_function, remove_columns="audio", batched=False)
dd['train'] = dd['train'].map(preprocess_function, remove_columns="audio", batched=False)

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

In [15]:
import gc
gc.collect()

509

In [16]:
import evaluate

accuracy = evaluate.load("accuracy")

from sklearn.metrics import roc_auc_score
def compute_metrics(eval_pred):
    # Compute the ROC AUC score
    predictions = eval_pred.predictions
    predictions = np.exp(predictions)/np.exp(predictions).sum(axis=1, keepdims=True)
    label_ids = eval_pred.label_ids
    roc_auc = roc_auc_score(label_ids, predictions, average='macro', multi_class='ovr')

    # Calculate accuracy using the loaded accuracy metric
    acc_score = accuracy.compute(predictions=predictions.argmax(axis=1), references=label_ids)['accuracy']

    return {
        "roc_auc": roc_auc,
        "accuracy": acc_score
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [29]:
print(type(dd['train'][0]['input_values']))
print(type(dd['train'][0]['label']))

<class 'list'>
<class 'str'>


In [23]:
from transformers import TrainingArguments, Trainer
batch_size=4
warmup_steps=50
weight_decay=0.02
num_train_epochs=1
model_name = "accent_classification"
training_args = TrainingArguments(
    output_dir=model_name,
    logging_dir='./logs',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-5, # 3e-5
    logging_strategy='steps',
    logging_first_step=True,
    load_best_model_at_end=True,
    logging_steps=1,
    eval_strategy='epoch',
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    eval_steps=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    save_strategy='epoch',
    save_total_limit=1,
    report_to=[]# save fewer checkpoints to limit used space
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dd["train"],
    eval_dataset=dd["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [24]:
trainer.train()

AttributeError: 'str' object has no attribute 'dtype'

In [None]:
trainer.evaluate()

{'eval_loss': 0.14365944266319275,
 'eval_roc_auc': 0.9945890081183189,
 'eval_accuracy': 0.9769784172661871,
 'eval_runtime': 85.5118,
 'eval_samples_per_second': 8.128,
 'eval_steps_per_second': 2.035,
 'epoch': 10.0}

In [None]:
trainer.save_model()

In [None]:
from transformers import pipeline

pipe=pipeline('audio-classification',model=model_name,device=0)

In [None]:
# foreign example
audio,rate=torchaudio.load('/kaggle/input/speech-accent-archive/recordings/recordings/azerbaijani3.mp3')
transform=torchaudio.transforms.Resample(rate,16000)
audio=transform(audio).numpy().reshape(-1)
# make a classification pipeline
pipe(audio)

[{'score': 0.9846792817115784, 'label': 'other'},
 {'score': 0.0051173982210457325, 'label': 'french'},
 {'score': 0.004047855269163847, 'label': 'arabic'},
 {'score': 0.002300234977155924, 'label': 'spanish'},
 {'score': 0.0022628363221883774, 'label': 'mandarin'}]

In [None]:
# english example
audio,rate=torchaudio.load('/kaggle/input/speech-accent-archive/recordings/recordings/english102.mp3')
transform=torchaudio.transforms.Resample(rate,16000)
audio=transform(audio).numpy().reshape(-1)
# make a classification pipeline
pipe(audio)

[{'score': 0.9932954907417297, 'label': 'english'},
 {'score': 0.0018747301073744893, 'label': 'mandarin'},
 {'score': 0.0018277985509485006, 'label': 'arabic'},
 {'score': 0.0015415801899507642, 'label': 'spanish'},
 {'score': 0.0008816014742478728, 'label': 'french'}]

In [None]:
# spanish example
audio,rate=torchaudio.load('/kaggle/input/speech-accent-archive/recordings/recordings/spanish10.mp3')
transform=torchaudio.transforms.Resample(rate,16000)
audio=transform(audio).numpy().reshape(-1)
# make a classification pipeline
pipe(audio)

[{'score': 0.9844197630882263, 'label': 'spanish'},
 {'score': 0.005216183606535196, 'label': 'arabic'},
 {'score': 0.004913764074444771, 'label': 'french'},
 {'score': 0.0024457182735204697, 'label': 'english'},
 {'score': 0.0021028988994657993, 'label': 'other'}]

In [None]:
audio.shape

(566260,)

In [None]:
from IPython.display import Audio
Audio(audio,rate=16000)

# Send model to Huggingface

In [None]:
# finally, save the model to Huggingface
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi
api = HfApi()
repo_id = f"dima806/{model_name}"
try:
    api.create_repo(repo_id)
except:
    print(f"Repo {repo_id} already exists")

In [None]:
#!rm -rv */checkpoint-*

In [None]:
api.upload_folder(
    folder_path=model_name,
    path_in_repo = ".",
    repo_id=repo_id,
    repo_type="model"
)

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

Upload 52 LFS files:   0%|          | 0/52 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/757M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/623 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

'https://huggingface.co/dima806/multiple_accent_classification/tree/main/.'