In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Configuring the Environment

In [2]:
!pip install --upgrade transformers datasets librosa 

In [None]:
import torch
import librosa
from datasets import load_dataset, Dataset
from transformers import Wav2Vec2ForSequenceClassification, HubertForSequenceClassification, Wav2Vec2FeatureExtractor

from sklearn.metrics import accuracy_score
import numpy as np 
import pandas as pd

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
!nvidia-smi

# Downloading & Unpacking the FSC Dataset

In [5]:
# Download the dataset archive and, only if the download succeeded (&&), extract it
# into the current working directory.
!wget http://140.112.21.28:9000/fluent.tar.gz && tar -xf fluent.tar.gz
# Remove the tarball; this line runs whether or not the previous one succeeded.
!rm fluent.tar.gz

In [6]:
import os

# Root of the extracted dataset, relative to the notebook's working directory.
fsc_folder = os.path.join('./', 'fluent_speech_commands_dataset')

# Sub-folders used below; note that data_folder keeps a trailing slash.
data_folder = fsc_folder + '/data/'
speakers_folder = fsc_folder + "/wavs/speakers"

# Studying FSC Dataset

## Speaker Demographics

In [None]:
# Row cap forwarded to read_csv; None reads the whole file.
nRowsRead = None

demographics_csv = f'{data_folder}/speaker_demographics.csv'
df_demographics = pd.read_csv(demographics_csv, sep=',', nrows=nRowsRead)

n_rows, n_cols = df_demographics.shape
print(f'There are {n_rows} rows and {n_cols} columns')

# Show the first row as a quick look at the columns.
df_demographics.head(1)

In [None]:
# Print the value counts of every demographic column except the speaker identifier.
# A blank line is printed after every column, including the skipped one.
for column in df_demographics.columns:
    if column != 'speakerId':
        print(f'Metadata: {column}')
        for value, count in df_demographics[column].value_counts().items():
            print(f'{value}: {count}')
    print()

## DataFrame for DivExplorer

In [None]:
# Row cap forwarded to read_csv; None reads the whole file.
nRowsRead = None

test_csv = f'{data_folder}/test_data.csv'
# The first CSV column is used as the DataFrame index.
df_test = pd.read_csv(test_csv, sep=',', nrows=nRowsRead, index_col=0)
df_test.dataframeName = 'test_data.csv'

nRow, nCol = df_test.shape
print(f'There are {nRow} rows and {nCol} columns')

# Copy the index into an explicit 'ID' column so it is still available as a
# regular column after the merge performed in the next cell.
df_test['ID'] = df_test.index

df_test.head(1)

In [10]:
# Attach the speaker demographics to every test utterance, prefix each audio path
# with the dataset root, then index and order the rows by utterance ID.
df_divExplorer = (
    df_test
    .merge(df_demographics, on='speakerId')
    .assign(path=lambda merged: f'{fsc_folder}/' + merged['path'])
    .set_index('ID')
    .sort_values('ID')
)

# Accuracy Evaluation

## Inference

In [11]:
def map_to_array(example, audio_col = 'path'):
    """Load the audio file of one example and attach its waveform.

    Args:
        example: mapping that holds the audio file path under ``audio_col``.
        audio_col: key of the audio file path inside ``example``.

    Returns:
        The same ``example``, with the waveform stored under ``"speech"``.
    """
    # librosa.load returns (waveform, sampling_rate); only the waveform is kept.
    # The audio is loaded as mono at 16 kHz.
    waveform = librosa.load(example[audio_col], sr=16000, mono=True)[0]
    example["speech"] = waveform
    return example

In [None]:
# Wrap the DataFrame in a datasets.Dataset and add a "speech" column holding the
# decoded waveform of every row's audio file.
dataset = Dataset.from_pandas(df_divExplorer).map(
    lambda row: map_to_array(row, audio_col = 'path')
)

In [52]:
target_model = 'wav2vec2-base'

# Model class to instantiate for each supported checkpoint.
# Every checkpoint is published as "superb/<target_model>-superb-ic".
MODEL_CLASSES = {
    'wav2vec2-large': Wav2Vec2ForSequenceClassification,
    'wav2vec2-base': Wav2Vec2ForSequenceClassification,
    'hubert-large': HubertForSequenceClassification,
    'hubert-base': HubertForSequenceClassification,
}

if target_model not in MODEL_CLASSES:
    raise ValueError(f'{target_model} is not available')

checkpoint = f"superb/{target_model}-superb-ic"

# Move the model to `device` (the same device the inference cell sends its inputs
# to) instead of calling .cuda() unconditionally, which fails on CPU-only machines.
model = MODEL_CLASSES[target_model].from_pretrained(checkpoint).to(device)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

In [53]:
def preprocess_function(examples):
    """Run the notebook-level ``feature_extractor`` on a batch of waveforms.

    Args:
        examples: the waveforms to encode, passed straight to the extractor.

    Returns:
        The extractor output, padded and returned as PyTorch tensors.
    """
    return feature_extractor(
        examples,
        sampling_rate=feature_extractor.sampling_rate,
        padding=True,
        return_tensors="pt",
    )

In [None]:
from tqdm import tqdm

# Number of utterances sent through the model per forward pass.
BATCH_SIZE = 48

# Run the model over the whole dataset in batches, without tracking gradients.
# The feature extractor computes attention masks and normalizes the waveform if needed.
# Per-batch logits are collected in a list and concatenated once at the end, which
# avoids re-copying all earlier logits on every iteration and removes the need to
# special-case the first batch.
batch_logits = []
with torch.no_grad():
    for start in tqdm(range(0, len(dataset), BATCH_SIZE)):
        inputs = preprocess_function(dataset[start:start + BATCH_SIZE]["speech"]).to(device)
        batch_logits.append(model(**inputs).logits)

# One row of logits per utterance, in dataset order.
logits_concatenation = torch.cat(batch_logits)

## Intent Accuracy

In [None]:
# The first 6 logit columns are treated as the action slot.
action_logits = logits_concatenation[:, :6]
action_ids = action_logits.argmax(dim=-1).tolist()
action_labels = [model.config.id2label[label_id] for label_id in action_ids]

# Ground-truth actions, in the same row order as the predictions.
action_gt = list(df_divExplorer['action'].to_numpy())

# Accuracy on the action slot alone.
accuracy_score(action_gt, action_labels)

In [None]:
# Logit columns 6..19 are treated as the object slot; the argmax is relative to
# that slice, so 6 is added back before looking the id up in id2label.
object_logits = logits_concatenation[:, 6:20]
object_ids = object_logits.argmax(dim=-1).tolist()
object_labels = [model.config.id2label[label_id + 6] for label_id in object_ids]

# Ground-truth objects; the value 'none' is rewritten to 'none_object' before
# the comparison with the predicted labels.
object_gt = [
    f'{name}_object' if name == 'none' else name
    for name in df_divExplorer['object'].values
]

# Accuracy on the object slot alone.
accuracy_score(object_gt, object_labels)

In [None]:
# Logit columns 20..23 are treated as the location slot; the argmax is relative to
# that slice, so 20 is added back before looking the id up in id2label.
location_logits = logits_concatenation[:, 20:24]
location_ids = location_logits.argmax(dim=-1).tolist()
location_labels = [model.config.id2label[label_id + 20] for label_id in location_ids]

# Ground-truth locations; the value 'none' is rewritten to 'none_location' before
# the comparison with the predicted labels.
location_gt = [
    f'{name}_location' if name == 'none' else name
    for name in df_divExplorer['location'].values
]

# Accuracy on the location slot alone.
accuracy_score(location_gt, location_labels)

In [58]:
# An intent is the space-separated triple "action object location".
sample_range = range(len(df_divExplorer))

intents_predicted = [
    " ".join((action_labels[k], object_labels[k], location_labels[k]))
    for k in sample_range
]

intents_gt = [
    " ".join((action_gt[k], object_gt[k], location_gt[k]))
    for k in sample_range
]

# 1 where the whole predicted intent equals the ground truth, 0 otherwise.
is_correct = (np.asarray(intents_predicted) == np.asarray(intents_gt)).astype(int)

In [59]:
# Store the per-utterance correctness flag (1 = full intent predicted correctly,
# 0 = otherwise) computed in the previous cell as the 'prediction' column.
df_divExplorer['prediction'] = is_correct

In [60]:
# to_csv does not create missing directories, so make sure the target folder exists.
output_dir = './data_precomputed'
os.makedirs(output_dir, exist_ok=True)

# NOTE: despite its name, output_folder holds the path of the CSV file itself.
output_folder = os.path.join(output_dir, f'FSC_for_DivExplorer_{target_model}.csv')

# index=False: the 'ID' index is not written to the file.
df_divExplorer.to_csv(output_folder, index=False)

# Extracting Signal Metadata

In [61]:
from signal_metadata_extraction import MetadataExtractor

In [None]:
from tqdm import tqdm

metadata_extractor = MetadataExtractor()
audio_col = 'path'

# One entry per row of df_divExplorer, in row order.
metadatas = []

# Walk the audio path and transcription of every row side by side.
rows = zip(df_divExplorer[audio_col], df_divExplorer['transcription'])
for audio_file, sentence in tqdm(rows, total=len(df_divExplorer)):
    signal_metadata = metadata_extractor.signal_metadata(audio_file)
    text_metadata = metadata_extractor.text_metadata(sentence)
    # The mixed metadata is derived from the two results above.
    mixed_metadata = metadata_extractor.mixed_metadata(signal_metadata, text_metadata)

    # The three results are concatenated into one flat record per utterance.
    metadatas.append(signal_metadata + text_metadata + mixed_metadata)

In [None]:
# Column names for the extracted metadata, in the same order in which the three
# groups were concatenated for every row of `metadatas`.
meta_cols = (
    metadata_extractor.list_signal_metadata
    + metadata_extractor.list_text_metadata
    + metadata_extractor.list_sig_text_metadata
)

# pd.concat(axis=1) aligns rows on index labels, not on position. `metadatas` was
# filled row by row in df_divExplorer order, so give the metadata frame the same
# index; a default RangeIndex would only line up if the IDs were exactly 0..n-1.
df_metas = pd.DataFrame(metadatas, columns=meta_cols, index=df_divExplorer.index)
df_divExplorer_metas = pd.concat([df_divExplorer, df_metas], axis=1)

print("Metadata columns: ", meta_cols)
print("---")
print("Df for DivExplorer Columns: ", df_divExplorer_metas.columns)

## Save Metadata in a CSV

In [64]:
# to_csv does not create missing directories, so make sure the target folder exists.
output_dir = './data_precomputed'
os.makedirs(output_dir, exist_ok=True)

# Path of the output file without its extension. The resulting file name is the
# same one used by the predictions-only export, so this write replaces that file.
output_folder = os.path.join(output_dir, f'FSC_for_DivExplorer_{target_model}')

# Strip the local dataset-root prefix so the stored paths start at the dataset folder.
df_divExplorer_metas['path'] = df_divExplorer_metas['path'].str.replace(
    fsc_folder, "fluent_speech_commands_dataset", regex = False
)
df_divExplorer_metas.to_csv(f'{output_folder}.csv', index=False)