In [None]:
!pip install datasets
!pip install wandb
!pip install evaluate

Collecting wandb
  Obtaining dependency information for wandb from https://files.pythonhosted.org/packages/e0/71/7b7050ecab7288782ae0c7560f1ca06f4cf854a5ae08abeaf643785af1a0/wandb-0.19.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading wandb-0.19.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Obtaining dependency information for docker-pycreds>=0.4.0 from https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Obtaining dependency information for gitpython!=3.1.29,>=1.0.0 from https://files.pythonhosted.org/packages/1d/9a/4114a9057db2f1462d5c8f8390ab7383925fe1ac012eaa42402ad65c2963/GitPython-3.1.44-py3-none-any.whl.metadata
  Downloading GitPython-3.1.44-py3

In [1]:
import pandas as pd
import numpy as np
import os
import evaluate
import torchaudio
from datasets import load_dataset, DatasetDict, Audio
from sklearn.model_selection import train_test_split
import random
from IPython.display import Audio, display
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer

In [2]:
import os
import pandas as pd
from datasets import load_dataset, Audio
from sklearn.model_selection import train_test_split
from pathlib import Path

# Step 1: Get File Paths and Labels
directory = "/home/joregan/dialect"  # Ensure this is correct
files = [f for f in os.listdir(directory) if f.endswith(".wav")]
print("Found files:", files)

df = pd.DataFrame({
    "file": files,  # Use only filenames, not full paths
    "label": [f.split("_")[0] for f in files]
})

# Step 2: Split into Train and Validation Sets
train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Step 3: Save CSV Files
train_csv = "train.csv"
eval_csv = "eval.csv"
train_df.to_csv(train_csv, index=False)
eval_df.to_csv(eval_csv, index=False)

# Step 4: Load Dataset from CSV
dataset = load_dataset("csv", data_files={"train": train_csv, "validation": eval_csv})

# Step 5: Set `data_dir` and Cast Audio
dataset = dataset.cast_column("file", Audio(sampling_rate=16000))

# Verify Dataset
print(dataset)


Found files: ['Finland_19.wav', 'Gotaland_55.wav', 'Svealand_93.wav', 'Norrland_26.wav', 'Finland_27.wav', 'Gotaland_120.wav', 'Finland_28.wav', 'Svealand_10.wav', 'Gotaland_89.wav', 'Svealand_104.wav', 'Norrland_2.wav', 'Norrland_11.wav', 'Gotaland_40.wav', 'Svealand_92.wav', 'Svealand_80.wav', 'Norrland_100.wav', 'Norrland_18.wav', 'Norrland_112.wav', 'Norrland_19.wav', 'Gotaland_94.wav', 'Norrland_106.wav', 'Svealand_17.wav', 'Gotaland_85.wav', 'Gotaland_7.wav', 'Finland_35.wav', 'Gotaland_27.wav', 'Svealand_57.wav', 'Norrland_67.wav', 'Gotaland_155.wav', 'Norrland_119.wav', 'Svealand_51.wav', 'Svealand_75.wav', 'Gotaland_150.wav', 'Norrland_95.wav', 'Norrland_110.wav', 'Norrland_94.wav', 'Svealand_87.wav', 'Gotaland_22.wav', 'Gotaland_50.wav', 'Gotaland_82.wav', 'Finland_17.wav', 'Gotaland_60.wav', 'Gotaland_92.wav', 'Gotaland_68.wav', 'Norrland_40.wav', 'Norrland_49.wav', 'Svealand_59.wav', 'Svealand_74.wav', 'Gotaland_145.wav', 'Norrland_6.wav', 'Gotaland_42.wav', 'Norrland_115.w

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'label'],
        num_rows: 342
    })
    validation: Dataset({
        features: ['file', 'label'],
        num_rows: 86
    })
})


In [3]:
def get_labels(df):
    labels = list(set(df['label']))
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {i: label for label, i in label2id.items()}
    return labels, label2id, id2label

labels, label2id, id2label = get_labels(df)

In [4]:
model_checkpoint = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [5]:
max_duration = 10.0  # seconds
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["file"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
    )
    # Add label conversion to numerical IDs using label2id
    inputs["labels"] = [label2id[x] for x in examples["label"]]
    return inputs

In [6]:
# Update the dataset mapping to remove 'file' and 'label'
encoded_dataset = dataset.map(preprocess_function, remove_columns=["file", "label"], batched=True)
encoded_dataset

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_values', 'attention_mask', 'labels'],
        num_rows: 342
    })
    validation: Dataset({
        features: ['input_values', 'attention_mask', 'labels'],
        num_rows: 86
    })
})

In [7]:
print(len(encoded_dataset["train"][0]['input_values']))

160000


In [8]:
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

  return self.fget.__get__(instance, owner)()
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"jimregan/{model_name}-dialect-classifier",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    num_train_epochs=30,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    report_to=["tensorboard"]
)

In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = np.mean(predictions == labels)
    return {"accuracy": accuracy}

In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,1.3719,1.369342,0.360465
2,1.3638,1.358849,0.360465
4,1.3446,1.344011,0.360465
6,1.3303,1.333987,0.360465
8,1.334,1.326732,0.360465
10,1.3241,1.320493,0.360465
12,1.3239,1.316107,0.360465
14,1.3129,1.312827,0.360465
16,1.3231,1.310691,0.360465
18,1.3082,1.30923,0.360465


Checkpoint destination directory jimregan/wav2vec2-large-xlsr-53-dialect-classifier/checkpoint-21 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory jimregan/wav2vec2-large-xlsr-53-dialect-classifier/checkpoint-43 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=630, training_loss=1.322001112075079, metrics={'train_runtime': 3811.1747, 'train_samples_per_second': 2.692, 'train_steps_per_second': 0.165, 'total_flos': 3.03740825016576e+18, 'train_loss': 1.322001112075079, 'epoch': 29.3})

In [24]:
trainer.evaluate()



{'eval_loss': 1.3693418502807617,
 'eval_accuracy': 0.36046511627906974,
 'eval_runtime': 14.5009,
 'eval_samples_per_second': 5.931,
 'eval_steps_per_second': 0.414,
 'epoch': 29.3}

In [25]:
trainer.push_to_hub()

events.out.tfevents.1741363856.sbtaldeep22.1481335.4:   0%|          | 0.00/411 [00:00<?, ?B/s]

events.out.tfevents.1741355065.sbtaldeep22.1481335.3:   0%|          | 0.00/29.8k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/jimregan/wav2vec2-large-xlsr-53-dialect-classifier/commit/195e419197214ad6cfec59212b1ac0d0f53b9139', commit_message='End of training', commit_description='', oid='195e419197214ad6cfec59212b1ac0d0f53b9139', pr_url=None, pr_revision=None, pr_num=None)