# Importing libraries, loading and transforming data

In [None]:
# Install the 'evaluate' library and pin 'transformers' to version 4.28.1, quietly (-q).
!pip install -q evaluate transformers==4.28.1

# Upgrade the 'datasets' library to the latest version quietly (-q).
!pip install -U -q datasets

# Install the 'torchaudio' library with the specified version (0.12.0+cu113) from the provided CUDA version repository.
!pip install -q torchaudio==0.12.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

# Add the 'ffmpeg4' repository to the package manager's sources list (-y for yes).
!add-apt-repository -y ppa:savoury1/ffmpeg4 

# Install the 'ffmpeg' package quietly (-qq).
!apt-get -qq install -y ffmpeg

# Install the 'mlflow' library quietly (-q).
!pip install -q mlflow

In [None]:
# Import necessary libraries
import pandas as pd  # Pandas for data manipulation
import gc  # Garbage collection module
import re  # Regular expressions for text processing
import numpy as np  # NumPy for numerical operations

# Suppress warnings
import warnings 
warnings.filterwarnings("ignore")

# Import tqdm for progress tracking
from tqdm import tqdm
tqdm.pandas()

# Import Path from pathlib for working with file paths
from pathlib import Path

# Import oversampling and undersampling methods from imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Import class_weight calculation function from scikit-learn
from sklearn.utils.class_weight import compute_class_weight

# Import matplotlib for data visualization
import matplotlib.pyplot as plt

# Import itertools for working with iterators
import itertools

# Import various metrics from scikit-learn
from sklearn.metrics import (
    accuracy_score,  # For calculating accuracy
    roc_auc_score,   # For ROC AUC score
    confusion_matrix,  # For confusion matrix
    classification_report,  # For classification report
    f1_score  # For F1 score
)

# Import PyTorch for deep learning
import torch

# Import the Hugging Face Transformers library
import transformers

# Print the version of the transformers library
print(transformers.__version__)

# Import torchaudio for audio processing with PyTorch
import torchaudio

# Print the version of torchaudio
print(torchaudio.__version__)

# Import the Hugging Face 'evaluate' library, which provides evaluation metrics
import evaluate

# Import Audio for displaying audio clips in the notebook
from IPython.display import Audio

# Import various classes and modules from Hugging Face Transformers and Datasets
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, pipeline, TrainingArguments, Trainer
from datasets import Dataset, Image, ClassLabel  # Import custom 'Dataset', 'ClassLabel', and 'Image' classes

In [None]:
# Define the resampling rate in Hertz (Hz) for audio data
RATE_HZ = 16000

# Define the maximum audio interval length to consider in seconds
MAX_SECONDS = 1

# Calculate the maximum audio interval length in samples by multiplying the rate and seconds
MAX_LENGTH = RATE_HZ * MAX_SECONDS

# Define the minimum number of records per label required for the dataset
MIN_RECORDS_PER_LABEL = 25

# Define the fraction of records to be used for testing data
TEST_SIZE = 0.1

# Enforce (instead of only stating) that the product of MIN_RECORDS_PER_LABEL and TEST_SIZE
# is greater than 2, so that changing either constant cannot silently violate the requirement.
assert MIN_RECORDS_PER_LABEL * TEST_SIZE > 2, (
    "MIN_RECORDS_PER_LABEL * TEST_SIZE must be greater than 2 "
    "to leave a sufficient number of samples for testing"
)

In [None]:
# Read only the 'filename' and 'accent' columns of the three metadata CSV files.
df0 = pd.read_csv('/kaggle/input/common-voice/cv-valid-train.csv', usecols=['filename', 'accent'])
df1 = pd.read_csv('/kaggle/input/common-voice/cv-valid-dev.csv', usecols=['filename', 'accent'])
df2 = pd.read_csv('/kaggle/input/common-voice/cv-valid-test.csv', usecols=['filename', 'accent'])

# Stack the three frames, keep only rows that have an accent value, and drop exact duplicates.
dd = pd.concat([df0, df1, df2], axis=0)
dd = dd.dropna(subset=['accent']).drop_duplicates()
print(dd.shape)

# Show five random rows, transposed.
dd.sample(5).T

In [None]:
# Count how many metadata rows carry each accent value.
dd['accent'].value_counts()

In [None]:
# Build an index of the .mp3 files found under the Common Voice input directory.
def load_data():
    """Return a DataFrame with one row per .mp3 file matched on disk.

    Columns:
        filename: the last two path components joined by '/'
                  (parent directory name and file name); used later as the merge key.
        file:     the full path of the audio file as a string.
    """
    # Materialise the glob results once; tqdm reports progress while the paths are collected.
    audio_paths = list(tqdm(Path('/kaggle/input/common-voice/').glob('cv-valid-*/*/*.mp3')))

    # Assemble the two columns from the collected paths.
    df = pd.DataFrame()
    df['filename'] = [f"{path.parent.name}/{path.name}" for path in audio_paths]
    df['file'] = [str(path) for path in audio_paths]

    return df

In [None]:
# Index the audio files found on disk and display the shape of the resulting frame.
df = load_data()
df.shape

In [None]:
# Attach the accent to every audio file via an inner join on 'filename',
# then expose the accent under the column name 'label'.
df = df.merge(dd, on='filename', how='inner').rename(columns={'accent': 'label'})

In [None]:
# Show five random rows of the merged frame, transposed.
df.sample(5).T

In [None]:
from collections import Counter
# Keep the five most frequent values of the 'label' column, most common first.
labels = [accent for accent, _count in Counter(df['label']).most_common(5)]
print(labels)

In [None]:
# Keep only the rows whose label is one of the selected labels.
df = df.loc[df['label'].isin(labels)]
print(df.shape)

In [None]:
# Retrieve the distinct values remaining in the 'label' column of 'df' after filtering
# and display them.
unique_labels = df['label'].unique()
unique_labels

In [None]:
# random undersampling of a majority class
# sampling_strategy='majority' limits the resampling to the majority class;
# random_state=83 fixes the random selection.
rus = RandomUnderSampler(random_state=83, sampling_strategy='majority')
# Separate the target column from the remaining columns before resampling.
y = df[['label']]
df = df.drop(['label'], axis=1)
# NOTE: this reassigns 'df', so re-running the cell resamples the already resampled frame.
df, y_resampled = rus.fit_resample(df, y)
del y
# Re-attach the resampled labels to the resampled rows.
df['label'] = y_resampled
del y_resampled
# # random oversampling of all minority classes
# y = df[['label']]
# df = df.drop(['label'], axis=1)
# ros = RandomOverSampler(random_state=83)
# df, y_resampled = ros.fit_resample(df, y)
# del y
# df['label'] = y_resampled
# del y_resampled

# Release the temporary objects deleted above.
gc.collect()

print(df.shape)

In [None]:
# # This function takes a file path as input and performs several audio transformations.
# def get_transform_audio(file):
#     try:
#         # Load the audio file using torchaudio and get its sample rate.
#         audio, rate = torchaudio.load(str(file))
        
#         # Create a transformation to resample the audio to a specified sample rate (RATE_HZ).
#         transform = torchaudio.transforms.Resample(rate, RATE_HZ)
        
#         # Apply the resampling transformation to the audio and convert it to a NumPy array.
#         audio = transform(audio).squeeze(0).numpy().reshape(-1)
        
#         # Truncate the audio to the first MAX_LENGTH samples to save memory.
#         audio = audio[:MAX_LENGTH]
        
#         # Return the preprocessed audio data.
#         return audio
#     except:
#         # If an exception occurs (e.g., file not found), return None.
#         return None

# # Apply the 'get_transform_audio' function to each file path in the 'df' DataFrame
# # and store the preprocessed audio in a new 'audio' column.
# df['audio'] = df['file'].progress_apply(get_transform_audio)

# Split files by chunks with == MAX_LENGTH size
def split_audio(file):
    """Load an audio file and cut it into consecutive chunks of MAX_LENGTH samples at RATE_HZ.

    The first channel of the file is resampled to RATE_HZ *before* chunking, so each
    chunk holds exactly MAX_LENGTH samples (MAX_SECONDS of audio) regardless of the
    file's native sample rate. Chunking the native-rate signal and resampling each
    piece afterwards yields shorter chunks whenever the native rate differs from RATE_HZ.

    Parameters:
        file: path of the audio file; converted with str() before loading.

    Returns:
        pandas.DataFrame with a single 'audio' column holding one 1-D NumPy array per
        complete chunk (a trailing remainder shorter than MAX_LENGTH is discarded),
        or None if the file could not be processed.
    """
    try:
        # Load the audio file using torchaudio and get its sample rate.
        audio, rate = torchaudio.load(str(file))

        # Resample the first channel once for the whole file (the transform is loop-invariant).
        transform = torchaudio.transforms.Resample(rate, RATE_HZ)
        samples = transform(audio[0]).numpy().reshape(-1)

        # Floor division: only complete MAX_LENGTH-sample chunks are kept.
        num_segments = len(samples) // MAX_LENGTH

        # Slice the resampled signal into equally sized chunks.
        segmented_audio = [
            samples[i * MAX_LENGTH:(i + 1) * MAX_LENGTH]
            for i in range(num_segments)
        ]

        # Create a DataFrame from the segmented audio
        df_segments = pd.DataFrame({'audio': segmented_audio})

        return df_segments

    except Exception as e:
        # Best effort: report the failure and let the caller skip this file.
        print(f"Error processing file: {e}")
        return None
    
# Segment every audio file and tag each resulting chunk with the label of its source file.
df_list = []
for input_file, input_label in tqdm(zip(df['file'].values, df['label'].values)):
    resulting_df = split_audio(input_file)
    # split_audio returns None for files it could not process; skip those.
    if resulting_df is None:
        continue
    resulting_df['label'] = input_label
    df_list.append(resulting_df)

# Stack the per-file chunk frames into a single frame that replaces 'df'.
df = pd.concat(df_list, axis=0)
df.sample(5)

In [None]:
# The per-file frames were concatenated into 'df'; drop the list and collect garbage.
del df_list
gc.collect()

In [None]:
# Keep only the rows of 'df' whose 'audio' value is present (not null).
df = df[df['audio'].notnull()]

In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes, and memory usage.
df.info()

In [None]:
# Drop the 'file' column from 'df' when it is present; otherwise leave 'df' untouched.
if 'file' in df.columns:
    df = df.drop(columns=['file'])

In [None]:
# Collect the distinct labels present in the data.
classes = np.unique(df['label'])

print(classes)

# 'balanced' class weights computed from the label column.
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df['label'])

# Pair every class with its weight.
class_weights = {cls: weight for cls, weight in zip(classes, weights)}

# Show the resulting class -> weight mapping.
print(class_weights)

In [None]:
# Create a Hugging Face 'Dataset' from the Pandas DataFrame 'df'
# (columns at this point: 'audio' chunks and their 'label').
dataset = Dataset.from_pandas(df)

In [None]:
# Fixed label order: the position of a label in this list becomes its integer class ID
# in the label2id / id2label mappings and in the ClassLabel feature built from it.
# NOTE(review): the list is hard-coded rather than derived from the data; presumably it
# matches the five most common accents selected earlier in `labels` — verify.
labels_list = ['us', 'england', 'indian', 'australia', 'canada'] #sorted(list(df['label'].unique()))

# Deleting the DataFrame 'df' (its contents now live in 'dataset')
del df

# Performing garbage collection to free up memory
gc.collect()

In [None]:
# Build both lookup tables from the label order in 'labels_list':
# a label's position in the list is its integer ID.
label2id = {label: idx for idx, label in enumerate(labels_list)}
id2label = {idx: label for idx, label in enumerate(labels_list)}

# Print the resulting dictionaries for reference
print("Mapping of IDs to Labels:", id2label, '\n')
print("Mapping of Labels to IDs:", label2id)

In [None]:
# Creating classlabels to match labels to IDs
ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)

# Mapping labels to IDs
def map_label2id(example):
    """Replace the string labels in 'label' with their integer class IDs.

    Called through Dataset.map(batched=True) below, so 'example' is a batch and
    example['label'] holds a list of label strings.
    """
    example['label'] = ClassLabels.str2int(example['label'])
    return example

dataset = dataset.map(map_label2id, batched=True)

# Casting label column to ClassLabel Object
dataset = dataset.cast_column('label', ClassLabels)

# Splitting the dataset into training and testing sets using the predefined train/test split ratio.
# A fixed seed (the same value used for the undersampler) makes the shuffled, stratified
# split — and therefore every reported metric — reproducible across runs.
dataset = dataset.train_test_split(test_size=TEST_SIZE, shuffle=True, stratify_by_column="label", seed=83)

# Load the pretrained audio classification model and feature extractor

In [None]:
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Checkpoint providing both the feature extractor and the classification model.
model_str = "dima806/english_accents_classification"  # "facebook/wav2vec2-base-960h"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_str)

# Size the classification head from labels_list, the list that defines the class IDs,
# so the head and the ID mappings cannot disagree.
model = AutoModelForAudioClassification.from_pretrained(model_str, num_labels=len(labels_list))

# Store both directions of the mapping in the config so they stay consistent with each other.
model.config.id2label = id2label
model.config.label2id = label2id

# number of trainable parameters (in millions)
print(model.num_parameters(only_trainable=True)/1e6)

In [None]:
def preprocess_function(batch):
    """Turn one raw audio example into model inputs.

    Runs the feature extractor on batch['audio'] at RATE_HZ, truncating to
    MAX_LENGTH, and replaces 'input_values' with its first element before
    returning the extractor output.
    """
    features = feature_extractor(
        batch['audio'],
        sampling_rate=RATE_HZ,
        max_length=MAX_LENGTH,
        truncation=True,
    )
    # Keep only the first entry of 'input_values'.
    features['input_values'] = features['input_values'][0]
    return features

# Run feature extraction one example at a time (batched=False) on each split and drop
# the raw 'audio' column afterwards; collect garbage after each split.
dataset['train'] = dataset['train'].map(preprocess_function, remove_columns="audio", batched=False)
gc.collect()
dataset['test'] = dataset['test'].map(preprocess_function, remove_columns="audio", batched=False)
gc.collect()

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics below calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

from sklearn.metrics import roc_auc_score
def compute_metrics(eval_pred):
    """Compute macro one-vs-rest ROC AUC and accuracy for an evaluation pass.

    Parameters:
        eval_pred: object exposing `.predictions` (per-class scores, one row per
            example) and `.label_ids` (integer class IDs).

    Returns:
        dict with the keys "roc_auc" and "accuracy".
    """
    logits = eval_pred.predictions
    label_ids = eval_pred.label_ids

    # Softmax with the row maximum subtracted first: mathematically identical to
    # exp(x) / sum(exp(x)), but large scores can no longer overflow to inf/NaN.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    predictions = exp_scores / exp_scores.sum(axis=1, keepdims=True)

    # Compute the ROC AUC score
    roc_auc = roc_auc_score(label_ids, predictions, average='macro', multi_class='ovr')

    # Calculate accuracy using the loaded accuracy metric
    acc_score = accuracy.compute(predictions=predictions.argmax(axis=1), references=label_ids)['accuracy']

    return {
        "roc_auc": roc_auc,
        "accuracy": acc_score
    }

# Training and validation

In [None]:
from transformers import TrainingArguments, Trainer
# Hyperparameters for the fine-tuning run.
batch_size=8
warmup_steps=50
weight_decay=0.02
num_train_epochs=1
# Also used as the output directory, as the model path for the inference pipeline,
# and as the repository name when uploading to the Hub.
model_name = "english_accents_classification"
training_args = TrainingArguments(
    # Checkpoints and the final model are written under this directory.
    output_dir=model_name,
    logging_dir='./logs',
    num_train_epochs=num_train_epochs,
    # The same batch size is used for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-6, # 3e-5
    # Log at every optimisation step, including the very first one.
    logging_strategy='steps',
    logging_first_step=True,
    load_best_model_at_end=True,
    logging_steps=1,
    # Evaluate and save once per epoch (both strategies are set to 'epoch').
    evaluation_strategy='epoch',
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    # NOTE(review): eval_steps presumably only applies when evaluation_strategy='steps';
    # with 'epoch' it looks like it has no effect — confirm against the TrainingArguments docs.
    eval_steps=1,
    gradient_accumulation_steps=1, 
    gradient_checkpointing=True,
    save_strategy='epoch',
    save_total_limit=1, # save fewer checkpoints to limit used space
    report_to="mlflow",  # log to mlflow
)

# The 'test' split doubles as the evaluation set during training.
# NOTE(review): the feature extractor is passed in the `tokenizer` slot, presumably so that
# it is saved together with the model — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)


In [None]:
# Evaluate on the evaluation split before any training step, as a baseline.
trainer.evaluate()

In [None]:
# Run the training loop configured in training_args.
trainer.train()

In [None]:
# Evaluate again after training to compare against the pre-training baseline.
trainer.evaluate()

In [None]:
# Use the trained 'trainer' to make predictions on the test dataset.
# The result exposes .predictions, .label_ids and .metrics, all used below.
outputs = trainer.predict(dataset["test"])

# Print the metrics obtained from the prediction outputs.
print(outputs.metrics)

In [None]:
# Ground-truth class IDs as reported by the prediction output.
y_true = outputs.label_ids

# Predicted class ID: index of the largest score in each row of the predictions.
y_pred = np.argmax(outputs.predictions, axis=1)

# Helper that renders a confusion matrix as an annotated heat map.
def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues, figsize=(10, 8), is_norm=True):
    """
    Draw a confusion matrix with one annotated cell per (true, predicted) pair.

    Parameters:
        cm (array-like): Confusion matrix as returned by sklearn.metrics.confusion_matrix.
        classes (list): Class names used as tick labels on both axes.
        title (str): Title for the plot.
        cmap (matplotlib colormap): Colormap for the plot.
        figsize (tuple): Figure size passed to plt.figure.
        is_norm (bool): If True, cell values are printed with 3 decimals ('.3f');
            otherwise with no decimals ('.0f').
    """
    plt.figure(figsize=figsize)

    # Heat map of the matrix with title and colour scale.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes; x labels are rotated to stay readable.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=90)
    plt.yticks(positions, classes)

    cell_format = '.3f' if is_norm else '.0f'

    # Cells above half of the maximum get white text, the others black text.
    half_max = cm.max() / 2.0
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format), horizontalalignment="center", color=text_color)

    # Rows are true labels, columns are predicted labels.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    plt.tight_layout()
    plt.show()

# Calculate accuracy and F1 score.
# The result is stored under its own name: the global `accuracy` holds the metric object
# used inside compute_metrics, and overwriting it with a float would break any later
# trainer.evaluate() / trainer.predict() call.
test_accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')

# Display accuracy and F1 score
print(f"Accuracy: {test_accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Class names and IDs are taken from labels_list, the list that defines the integer IDs,
# so every row/column is guaranteed to be shown under the right name.
class_ids = list(range(len(labels_list)))

# Get the confusion matrix if there are a relatively small number of labels
if len(labels_list) <= 120:
    # Compute the confusion matrix (explicit labels keep its shape fixed even if a class is absent)
    cm = confusion_matrix(y_true, y_pred, labels=class_ids) # normalize='true'

    # Plot the confusion matrix using the defined function
    plot_confusion_matrix(cm, labels_list, figsize=(8, 6), is_norm=False)

# Finally, display classification report
print()
print("Classification report:")
print()
print(classification_report(y_true, y_pred, labels=class_ids, target_names=labels_list, digits=4))

In [None]:
# Save the model; the inference pipeline and the Hub upload later use the `model_name`
# directory (the trainer's output_dir) as the model location.
trainer.save_model()

In [None]:
from transformers import pipeline

# Build an audio-classification pipeline from the model stored under `model_name`.
# Use the first GPU when one is available and fall back to the CPU (-1) otherwise,
# instead of failing on a hard-coded device=0.
pipe=pipeline('audio-classification',model=model_name,device=0 if torch.cuda.is_available() else -1)

In [None]:
# us example
# Load the clip, resample it to RATE_HZ and flatten it to a 1-D NumPy array.
audio,rate=torchaudio.load('/kaggle/input/common-voice/cv-valid-test/cv-valid-test/sample-000003.mp3')
transform=torchaudio.transforms.Resample(rate,RATE_HZ)
audio=transform(audio).numpy().reshape(-1)
# Run the classification pipeline on the resampled clip.
pipe(audio)

In [None]:
from IPython.display import Audio
# Play back the resampled clip in the notebook.
Audio(audio,rate=RATE_HZ)

In [None]:
# england example
# Load the clip, resample it to RATE_HZ and flatten it to a 1-D NumPy array.
audio,rate=torchaudio.load('/kaggle/input/common-voice/cv-valid-test/cv-valid-test/sample-000008.mp3')
transform=torchaudio.transforms.Resample(rate,RATE_HZ)
audio=transform(audio).numpy().reshape(-1)
# Run the classification pipeline on the resampled clip.
pipe(audio)

In [None]:
from IPython.display import Audio
# Play back the resampled clip in the notebook.
Audio(audio,rate=RATE_HZ)

In [None]:
# indian example
# Load the clip, resample it to RATE_HZ and flatten it to a 1-D NumPy array.
audio,rate=torchaudio.load('/kaggle/input/common-voice/cv-valid-test/cv-valid-test/sample-000033.mp3')
transform=torchaudio.transforms.Resample(rate,RATE_HZ)
audio=transform(audio).numpy().reshape(-1)
# Run the classification pipeline on the resampled clip.
pipe(audio)

In [None]:
from IPython.display import Audio
# Play back the resampled clip in the notebook.
Audio(audio,rate=RATE_HZ)

In [None]:
# australia example
# Load the clip, resample it to RATE_HZ and flatten it to a 1-D NumPy array.
audio,rate=torchaudio.load('/kaggle/input/common-voice/cv-valid-test/cv-valid-test/sample-000065.mp3')
transform=torchaudio.transforms.Resample(rate,RATE_HZ)
audio=transform(audio).numpy().reshape(-1)
# Run the classification pipeline on the resampled clip.
pipe(audio)

In [None]:
from IPython.display import Audio
# Play back the resampled clip in the notebook.
Audio(audio,rate=RATE_HZ)

In [None]:
# canada example
# Load the clip, resample it to RATE_HZ and flatten it to a 1-D NumPy array.
audio,rate=torchaudio.load('/kaggle/input/common-voice/cv-valid-test/cv-valid-test/sample-000037.mp3')
transform=torchaudio.transforms.Resample(rate,RATE_HZ)
audio=transform(audio).numpy().reshape(-1)
# Run the classification pipeline on the resampled clip.
pipe(audio)

In [None]:
from IPython.display import Audio
# Play back the resampled clip in the notebook.
Audio(audio,rate=RATE_HZ)

# Send model to Huggingface

In [None]:
# Finally, publish the model to the Hugging Face Hub.
# Log in interactively first; the repository creation and upload below come after this step.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import HfApi
# Create the target model repository on the Hugging Face Hub.
# exist_ok=True turns the call into a no-op when the repository already exists, while any
# other failure (authentication, network, invalid name) is raised instead of being
# reported as "already exists" by a bare except.
api = HfApi()
repo_id = f"dima806/{model_name}"
api.create_repo(repo_id, exist_ok=True)

In [None]:
# Upload the contents of the local `model_name` directory to the root of the model repository.
api.upload_folder(
    folder_path=model_name,
    path_in_repo = ".",
    repo_id=repo_id,
    repo_type="model"
)