# Per Question

In [None]:
import argparse
import audiofile
import audtorch
import json
import glob
import os
import pandas as pd
import tqdm
import torch
import torchaudio

from transformers import (
    Wav2Vec2Model, 
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor
)

In [None]:
# Default values for the cell parameters of the per-question run
src_default = '../data/cropped_data/cropped_interview_data/per_question'
dst_default = '../data/features/features_interview/facebook-wav2vec2.csv'
dst_wdw_default = '../data/features/features_interview/windowed_facebook-wav2vec2.csv'
model_default = 'facebook/wav2vec2-large-xlsr-53-german'
# Run on the GPU when CUDA is available, otherwise on the CPU
device_default = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def parse_cell_parameters(src=src_default, dst=dst_default, dst_wdw=dst_wdw_default, model=model_default, device=device_default):
    """Collect the cell parameters into a single dictionary.

    Every argument falls back to the matching ``*_default`` value defined
    above. The result maps the keys ``'src'``, ``'dst'``, ``'dst_wdw'``,
    ``'model'`` and ``'device'`` (in that order) to the given values.
    """
    return dict(src=src, dst=dst, dst_wdw=dst_wdw, model=model, device=device)


# Resolve the parameters once, using the defaults
parameters = parse_cell_parameters()

In [None]:
# Nothing to do when the output CSV already exists: stop the run here
dst = parameters['dst']
if os.path.isfile(dst):
    exit()
# A bare file name has an empty dirname, which os.makedirs rejects, so only
# create the output directory when there is one
dst_dir = os.path.dirname(dst)
if dst_dir:
    os.makedirs(dst_dir, exist_ok=True)

# glob returns matches in arbitrary order; sort them so the rows of the
# resulting feature table come out in a reproducible order
files = sorted(glob.glob(os.path.join(parameters['src'], '*.wav')))

In [None]:
# Write an empty vocabulary to vocab.json and build a CTC tokenizer from it
vocab_dict = {}
with open('vocab.json', 'w') as handle:
    handle.write(json.dumps(vocab_dict))
tokenizer = Wav2Vec2CTCTokenizer('./vocab.json')
# Store the tokenizer files under ./tokenizer
tokenizer.save_pretrained('./tokenizer')

In [None]:
# Try to load the feature extractor that belongs to the checkpoint; when that
# raises OSError, fall back to a manually configured extractor
try:
    extractor = Wav2Vec2FeatureExtractor.from_pretrained(parameters['model'])
except OSError:
    fallback_config = dict(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    extractor = Wav2Vec2FeatureExtractor(**fallback_config)
# Combine the extractor with the tokenizer created in the previous cell
processor = Wav2Vec2Processor(feature_extractor=extractor, tokenizer=tokenizer)
# Load the pretrained model and move it to the configured device
model = Wav2Vec2Model.from_pretrained(parameters['model'])
model = model.to(parameters['device'])
# Put the model into evaluation mode
model.eval()

In [None]:
# Width of one embedding vector. Read it from the loaded model's configuration
# instead of guessing it from the checkpoint name, so that any checkpoint
# (including local paths) yields a matching buffer size.
num_features = model.config.hidden_size
# One row per audio file, filled in by the loop below
embeddings = torch.zeros(len(files), num_features)
for counter, file in tqdm.tqdm(
    enumerate(files),
    total=len(files),
    desc=parameters['model']
):
    # always_2d=True keeps a leading axis even for single-channel files
    # (presumably (channels, samples) -- confirm against the audiofile docs)
    audio, fs = audiofile.read(
        file,
        always_2d=True
    )
    # NOTE(review): presumably pads very short signals up to 4000 samples so
    # the model receives enough input -- confirm against the audtorch docs
    audio = audtorch.transforms.Expand(4000)(audio)
    audio = torch.from_numpy(audio)
    # The processor below is called with sampling_rate=16000
    if fs != 16000:
        audio = torchaudio.transforms.Resample(fs, 16000)(audio)
    # Collapse the first axis by averaging (mono downmix, assuming the first
    # axis holds the channels)
    if len(audio.shape) == 2:
        audio = audio.mean(0)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        # Take the first model output, average it over dimension 1 and drop
        # the leading batch dimension to get one vector per file
        embeddings[counter, :] = model(
            inputs.input_values.to(parameters['device']),
        )[0].cpu().mean(1).squeeze(0)

In [None]:
# Build the feature table: one 'Neuron_<i>' column per embedding dimension,
# preceded by a 'file' column that holds the base name of each audio file
neuron_columns = [f'Neuron_{x}' for x in range(num_features)]
features = pd.DataFrame(embeddings.numpy(), columns=neuron_columns)
features.insert(0, 'file', [os.path.basename(path) for path in files])
# Write the table to the destination CSV without the row index
features.to_csv(parameters['dst'], index=False)

# With windowing

In [None]:
# Default values for the cell parameters of the windowed run
src_default = '../data/cropped_data/cropped_interview_data/per_question/windowed_2000_500'
dst_default = '../data/features/features_interview/windowed_facebook-wav2vec2.csv'
model_default = 'facebook/wav2vec2-large-xlsr-53-german'
# Run on the GPU when CUDA is available, otherwise on the CPU
device_default = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def parse_cell_parameters(src=src_default, dst=dst_default, model=model_default, device=device_default):
    """Collect the cell parameters into a single dictionary.

    Every argument falls back to the matching ``*_default`` value defined
    above. The result maps the keys ``'src'``, ``'dst'``, ``'model'`` and
    ``'device'`` (in that order) to the given values.
    """
    return dict(src=src, dst=dst, model=model, device=device)


# Resolve the parameters once, using the defaults
parameters = parse_cell_parameters()

In [None]:
# Nothing to do when the output CSV already exists: stop the run here
dst = parameters['dst']
if os.path.isfile(dst):
    exit()
# A bare file name has an empty dirname, which os.makedirs rejects, so only
# create the output directory when there is one
dst_dir = os.path.dirname(dst)
if dst_dir:
    os.makedirs(dst_dir, exist_ok=True)

# glob returns matches in arbitrary order; sort them so the rows of the
# resulting feature table come out in a reproducible order
files = sorted(glob.glob(os.path.join(parameters['src'], '*.wav')))

In [None]:
# Write an empty vocabulary to vocab.json and build a CTC tokenizer from it
vocab_dict = {}
with open('vocab.json', 'w') as handle:
    handle.write(json.dumps(vocab_dict))
tokenizer = Wav2Vec2CTCTokenizer('./vocab.json')
# Store the tokenizer files under ./tokenizer
tokenizer.save_pretrained('./tokenizer')

In [None]:
# Try to load the feature extractor that belongs to the checkpoint; when that
# raises OSError, fall back to a manually configured extractor
try:
    extractor = Wav2Vec2FeatureExtractor.from_pretrained(parameters['model'])
except OSError:
    fallback_config = dict(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    extractor = Wav2Vec2FeatureExtractor(**fallback_config)
# Combine the extractor with the tokenizer created in the previous cell
processor = Wav2Vec2Processor(feature_extractor=extractor, tokenizer=tokenizer)
# Load the pretrained model and move it to the configured device
model = Wav2Vec2Model.from_pretrained(parameters['model'])
model = model.to(parameters['device'])
# Put the model into evaluation mode
model.eval()

In [None]:
# Width of one embedding vector. Read it from the loaded model's configuration
# instead of guessing it from the checkpoint name, so that any checkpoint
# (including local paths) yields a matching buffer size.
num_features = model.config.hidden_size
# One row per windowed audio file, filled in by the loop below
embeddings = torch.zeros(len(files), num_features)
for counter, file in tqdm.tqdm(
    enumerate(files),
    total=len(files),
    desc=parameters['model']
):
    # always_2d=True keeps a leading axis even for single-channel files
    # (presumably (channels, samples) -- confirm against the audiofile docs)
    audio, fs = audiofile.read(
        file,
        always_2d=True
    )
    # NOTE(review): presumably pads very short signals up to 4000 samples so
    # the model receives enough input -- confirm against the audtorch docs
    audio = audtorch.transforms.Expand(4000)(audio)
    audio = torch.from_numpy(audio)
    # The processor below is called with sampling_rate=16000
    if fs != 16000:
        audio = torchaudio.transforms.Resample(fs, 16000)(audio)
    # Collapse the first axis by averaging (mono downmix, assuming the first
    # axis holds the channels)
    if len(audio.shape) == 2:
        audio = audio.mean(0)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        # Take the first model output, average it over dimension 1 and drop
        # the leading batch dimension to get one vector per file
        embeddings[counter, :] = model(
            inputs.input_values.to(parameters['device']),
        )[0].cpu().mean(1).squeeze(0)

In [None]:
# Build the feature table: one 'Neuron_<i>' column per embedding dimension,
# preceded by a 'file' column that holds the base name of each audio file
neuron_columns = [f'Neuron_{x}' for x in range(num_features)]
features = pd.DataFrame(embeddings.numpy(), columns=neuron_columns)
features.insert(0, 'file', [os.path.basename(path) for path in files])
# Write the table to the destination CSV without the row index
features.to_csv(parameters['dst'], index=False)