# Part 3: Audio to text classification

## 0. Initialization

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from utils import *
import torch.nn as nn
from tqdm import tqdm
import torch.optim as optim
from model.classifier import Classifier
from torch.utils.data import DataLoader, TensorDataset
from dataset.PhraseDataset import PhraseDataset
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, pipeline

%load_ext autoreload
%autoreload 2

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Root folders for the input data and the generated results. The trailing
# slash is required: later cells build paths by plain string concatenation.
DATA_DIR = './data/'
RESULTS_DIR = './results/'
# Audio recordings live in a sub-folder of the data directory.
RECORDINGS_DIR = f'{DATA_DIR}recordings/'

In [None]:
# Overview table of the recordings; later cells read its `split`, `file_name`,
# `phrase`, `label` and `prompt` columns.
record_df = pd.read_csv(f'{DATA_DIR}overview-of-recordings-label.csv')

## 1. Speech to text

#### 1.1 Load dataset

In [None]:
# Keep only the rows that belong to the test split.
test_df = record_df[record_df["split"] == "test"]

# Path of each test recording: <RECORDINGS_DIR>/test/<file_name>.
test_files = [
    os.path.join(RECORDINGS_DIR, "test", file_name)
    for file_name in test_df["file_name"]
]

# Reference transcripts, used to score the speech-to-text output.
test_phrases = test_df["phrase"].values

# Target labels, used to score the text classifier.
test_labels = test_df["label"].values

#### 1.2 Initialize speech to text pipeline

In [None]:
# Hugging Face automatic-speech-recognition pipeline backed by the
# facebook/wav2vec2-base-960h checkpoint.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

#### 1.3 Predict text phrases

In [None]:
# Transcribe every test recording, one file at a time, with a progress bar.
# `transcribe_wav_to_text` is presumably provided by `utils` (star import) -- TODO confirm.
predicted_test_phrases = [
    transcribe_wav_to_text(wav_path, asr_pipeline)
    for wav_path in tqdm(test_files)
]

#### 1.4 Test conversion

In [None]:
# Compare the reference phrases with the ASR transcripts. The result is
# treated as a fraction and reported below as a percentage.
# NOTE(review): the exact metric is defined by `test_conversion` (not shown here).
test_acc = test_conversion(test_phrases, predicted_test_phrases)
print(f"Test accuracy: {test_acc * 100:.2f}%")

#### 1.5 Save results

In [None]:
# Create the output folder if needed. `exist_ok=True` replaces the separate
# existence check, so the call is safe to re-run and cannot race with another
# process creating the same directory.
os.makedirs(RESULTS_DIR + "speech_to_text", exist_ok=True)

# Persist the speech-to-text score as a NumPy array.
np.save(RESULTS_DIR + "speech_to_text/test_acc.npy", np.array(test_acc))

## 2. Text classification

In [None]:
# Tokenizer and classification model both start from the bert-base-uncased
# checkpoint; the classification head gets one output per distinct prompt.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(record_df.prompt.unique()),
).to(device)

# Load the fine-tuned weights. `map_location=device` remaps the stored tensors
# onto the active device, so a checkpoint saved from a GPU run still loads on
# a CPU-only machine.
state_dict = torch.load(RESULTS_DIR + 'large_bert.pt', map_location=device)

# `strict=False` tolerates missing/unexpected keys. The call is kept as the
# cell's last expression so the notebook displays the key-matching report and
# any mismatch stays visible.
model.load_state_dict(state_dict, strict=False)

#### 2.1 Load dataset

In [None]:
# Tokenization length and evaluation batch size.
max_seq_length = 37
batch_size = 256

# Dataset built from the ASR transcripts (not the reference phrases), paired
# with the ground-truth labels.
test_dataset = PhraseDataset(
    predicted_test_phrases,
    test_labels,
    tokenizer,
    max_seq_length,
    device,
)

# Evaluation keeps the original sample order.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

#### 2.2 Test model

In [None]:
# Evaluate the classifier on the ASR-transcribed test set. This rebinds
# `test_acc`, which previously held the speech-to-text score.
# NOTE(review): `cm` is presumably a confusion matrix -- confirm in `test_transformer`.
(test_acc, cm) = test_transformer(model, test_loader, device)

#### 2.3 Save results

In [None]:
# Create the output folder if needed. `exist_ok=True` replaces the separate
# existence check, so the call is safe to re-run and cannot race with another
# process creating the same directory.
os.makedirs(RESULTS_DIR + "bert_on_asr", exist_ok=True)

# Persist both values returned by the classifier evaluation.
np.save(RESULTS_DIR + "bert_on_asr/test_acc.npy", np.array(test_acc))
np.save(RESULTS_DIR + "bert_on_asr/cm.npy", cm)