## Voice-Driven Disease Classification: A Deep Learning Approach

##### Team 34 : Jules Maglione, Paul Nadal

In [None]:
# %pip install -r requirements.txt

In [None]:
import os
import torch
import librosa
import warnings
import pandas as pd
from utils import *
import seaborn as sns
from math import ceil, sqrt
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from transformers import AutoFeatureExtractor, HubertForSequenceClassification, BertTokenizer, BertForSequenceClassification

# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local helper code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation/runtime warnings raised by the libraries.
warnings.filterwarnings("ignore")

In [None]:
# Root folders used by the notebook; every path keeps its trailing slash so
# file names can be appended directly.
DATA_DIR = './data/'
RESULTS_DIR = './results/'
RECORDINGS_DIR = f'{DATA_DIR}recordings/'

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

## 0. Dataset

> You can download the "Medical Speech, Transcription, and Intent" dataset from this [link](https://www.kaggle.com/datasets/paultimothymooney/medical-speech-transcription-and-intent/download?datasetVersionNumber=1). The dataset is approximately 6GB in size and includes thousands of audio utterances related to common medical symptoms such as "knee pain" or "headache." In total, the dataset comprises over 8 hours of aggregated audio content. Each utterance has been created by individual human contributors, who based their recordings on specific medical symptoms. This extensive collection of audio snippets is suitable for training conversational agents in the medical field.

#### 0.1 Load data

In [None]:
# Read the recordings metadata from the CSV file into a dataframe

record_df = pd.read_csv(DATA_DIR + "overview-of-recordings.csv")

# Rows are reported as recordings and columns as features
num_recordings, num_features = record_df.shape

print("Overview of recordings: ")
print("Number of recordings: ", num_recordings)
print("Number of features: ", num_features)

#### 0.2 Split data

In [None]:
# Add a column to categorize the recordings.
# Each split is a sub-folder of RECORDINGS_DIR listing the audio files it contains.

train_files = set(os.listdir(os.path.join(RECORDINGS_DIR, "train")))
valid_files = set(os.listdir(os.path.join(RECORDINGS_DIR, "validate")))
test_files = set(os.listdir(os.path.join(RECORDINGS_DIR, "test")))

# A recording is "train" or "validate" when its file is in that folder, and
# "test" otherwise.
record_df["split"] = record_df.file_name.apply(lambda x: "train" if x in train_files else ("validate" if x in valid_files else "test"))

# The "test" label is a fallback, so rows whose file is in none of the three
# folders would otherwise go unnoticed: report them explicitly.
files_on_disk = train_files | valid_files | test_files
num_missing = int((~record_df["file_name"].isin(files_on_disk)).sum())
if num_missing:
    print("Warning: recordings labelled 'test' whose file was not found in any split folder: ", num_missing)

# Check the distribution of the recordings
# (bracket access is used because "split" is a generic attribute-like name)

split_counts = record_df["split"].value_counts()
print("Number of training samples: ", split_counts.get("train", 0))
print("Number of validation samples: ", split_counts.get("validate", 0))
print("Number of test samples: ", split_counts.get("test", 0))

In [None]:
# Create a dictionary mapping each prompt to an integer.
# Ids are assigned in the order the prompts are returned by unique().
prompt_to_id = {prompt: i for i, prompt in enumerate(record_df.prompt.unique())}
# Reverse mapping, used to turn a predicted class index back into a prompt
id_to_prompt = {i: prompt for prompt, i in prompt_to_id.items()}
record_df["label"] = record_df.prompt.map(prompt_to_id)

# Display the mapping
# (the loop variable is not called `id`, which would shadow the builtin for
# every cell executed afterwards)
print("Prompt to ID mapping: ")
for prompt, prompt_id in prompt_to_id.items():
    print(prompt, ":", prompt_id)

In [None]:
# Write the dataframe to a new CSV file in DATA_DIR, without the index column
labelled_csv_path = DATA_DIR + "overview-of-recordings-label.csv"
record_df.to_csv(labelled_csv_path, index=False)

#### 0.3 Visualize data

In [None]:
# Create train, validation, and test dataframes
train_df = record_df[record_df['split'] == 'train']
valid_df = record_df[record_df['split'] == 'validate']
test_df = record_df[record_df['split'] == 'test']

# One prompt order shared by every subplot: without it each subplot derives
# the order from its own dataframe, so the same row could show a different
# prompt in each set.
prompt_order = sorted(record_df['prompt'].dropna().unique())


# Create a function to plot the count of each prompt
def create_count_plot(ax, df, title, color, order=None):
    """Draw a horizontal count plot of the 'prompt' column of ``df`` on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        df: Dataframe with a 'prompt' column.
        title: Title of the subplot.
        color: Color of the bars.
        order: Optional list of prompts fixing the order of the bars.
            ``None`` lets seaborn infer the order from ``df``.
    """
    sns.countplot(y='prompt', data=df, color=color, order=order, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Count')
    ax.set_ylabel('Prompt')


# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(15, 10))

# Combine the count plots on the same row, all using the same prompt order
create_count_plot(axes[0], train_df, 'Training Set', 'blue', order=prompt_order)
create_count_plot(axes[1], valid_df, 'Validation Set', 'green', order=prompt_order)
create_count_plot(axes[2], test_df, 'Test Set', 'red', order=prompt_order)

# Adjust layout for better visualization
plt.suptitle('Count of Recordings by Prompt in Each Set', fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# Get unique prompts in the training set
unique_prompts = train_df['prompt'].unique()

# Calculate the number of rows and columns for a near-square subplot grid
# (at least 1x1, so the grid stays valid even when there are no prompts)
num_prompts = len(unique_prompts)
cols = max(1, ceil(sqrt(num_prompts)))
rows = max(1, ceil(num_prompts / cols))

# Create a figure with subplots.
# squeeze=False always returns a 2-D array of axes, even for a 1x1 grid,
# so flatten() below works for any number of prompts.
fig, axs = plt.subplots(rows, cols, figsize=(15, 10), squeeze=False)
grid_axes = axs.flatten()

# Generate and display word clouds for each unique prompt
for ax, prompt in zip(grid_axes, unique_prompts):
    # All training phrases recorded for this prompt, joined into one text
    prompt_phrases = ' '.join(train_df.loc[train_df['prompt'] == prompt, 'phrase'])
    wordcloud = WordCloud(width=800, height=400).generate(prompt_phrases)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(prompt, fontsize=15)

# Remove empty subplots if there are more than needed
for ax in grid_axes[num_prompts:]:
    fig.delaxes(ax)

# Adjust layout for better visualization
fig.suptitle('Word Clouds for Each Prompt in the Training Set', fontsize=20)
plt.tight_layout()
plt.show()

## 1. Audio classification

#### 1.1 Train models

> See [part1.ipynb](./part1.ipynb) for details on this part.

In [None]:
# Uncomment to run the code for part 1
# %run ./part1.ipynb

#### 1.2 Visualize Results

In [None]:
# Result folders of the three audio models
RESULTS_WAVEFORMS = f"{RESULTS_DIR}waveforms/"
RESULTS_WAV2VEC = f"{RESULTS_DIR}wav2vec/"
RESULTS_HUBERT = f"{RESULTS_DIR}hubert/"

# Plot the performance of each model, in the same order as above
for results_path, model_name in (
    (RESULTS_WAVEFORMS, "Waveforms"),
    (RESULTS_WAV2VEC, "Wav2Vec"),
    (RESULTS_HUBERT, "HuBERT"),
):
    plot_model_performance(results_path, model_name)

#### 1.3 Experiment by yourself

In [None]:
# Record a sample audio file.
# The target path is handed to speech_to_wav (imported from utils).
sample_wav_path = DATA_DIR + "sample.wav"
speech_to_wav(sample_wav_path)

In [None]:
# Initialize the feature extractor and model and load the weights
feature_extractor = AutoFeatureExtractor.from_pretrained("superb/hubert-base-superb-ks")
# ignore_mismatched_sizes allows loading the pretrained weights even though
# num_labels differs from the size of the checkpoint's classification head
model = HubertForSequenceClassification.from_pretrained(
    "superb/hubert-base-superb-ks",
    num_labels=len(record_df.prompt.unique()),
    ignore_mismatched_sizes=True,
).to(device)
# map_location remaps the saved tensors onto the current device, so weights
# saved on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): strict=False tolerates missing/unexpected keys; check the
# result displayed by this cell to confirm the weights were actually matched.
model.load_state_dict(torch.load(RESULTS_DIR + "hubert.pt", map_location=device), strict=False)

In [None]:
# Predict the prompt for the sample audio file
# (the audio is resampled to 16 kHz on load)
y, sr = librosa.load(DATA_DIR + "sample.wav", sr=16000)
inputs = feature_extractor(y, sampling_rate=sr, return_tensors="pt").to(device)

# Inference only: no_grad avoids building the autograd graph.
# Every tensor produced by the feature extractor is forwarded as-is, so the
# call does not depend on an attention mask having been returned.
with torch.no_grad():
    logits = model(**inputs).logits
preds = torch.argmax(logits, dim=1)

print("Predicted prompt: ", id_to_prompt[preds.item()])

## 2. Text classification

#### 2.1 Train models

> See [part2.ipynb](./part2.ipynb) for details on this part.

In [None]:
# Uncomment to run the code for part 2
# %run ./part2.ipynb

#### 2.2 Visualize Results

In [None]:
# Result folders of the three text models
RESULTS_EMBEDDINGS = f"{RESULTS_DIR}embeddings/"
RESULTS_BASE_BERT = f"{RESULTS_DIR}base_bert/"
RESULTS_LARGE_BERT = f"{RESULTS_DIR}large_bert/"


# Plot the performance of each model, in the same order as above
for results_path, model_name in (
    (RESULTS_EMBEDDINGS, "Embeddings"),
    (RESULTS_BASE_BERT, "Base BERT"),
    (RESULTS_LARGE_BERT, "Large BERT"),
):
    plot_model_performance(results_path, model_name)

#### 2.3 Experiment by yourself

In [None]:
# Write a sample phrase
# (edit this text to try the text classifier on your own input)
phrase = "My foot hurts"

In [None]:
# Initialize the tokenizer and model and load the weights
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(record_df.prompt.unique()),
).to(device)
# map_location remaps the saved tensors onto the current device, so weights
# saved on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): strict=False tolerates missing/unexpected keys; check the
# result displayed by this cell to confirm the weights were actually matched.
model.load_state_dict(torch.load(RESULTS_DIR + "base_bert.pt", map_location=device), strict=False)

In [None]:
# Predict the prompt for the sample phrase
inputs = tokenizer(phrase, return_tensors="pt")

# Inference only: no_grad avoids building the autograd graph
with torch.no_grad():
    logits = model(inputs.input_ids.to(device), attention_mask=inputs.attention_mask.to(device)).logits
# Index of the highest-scoring class for the single input phrase
preds = torch.argmax(logits, dim=1)

print("Predicted prompt: ", id_to_prompt[preds.item()])

## 3. Audio to text classification

#### 3.1 Train models

> See [part3.ipynb](./part3.ipynb) for details on this part.

In [None]:
# Uncomment to run the code for part 3
# %run ./part3.ipynb

#### 3.2 Visualize Results

In [None]:
# Imported here explicitly: otherwise `np` is only available if the star
# import from utils happens to expose it
import numpy as np

RESULTS_SPEECH_TO_TEXT = RESULTS_DIR + "speech_to_text/"
RESULTS_BERT_ON_ASR = RESULTS_DIR + "bert_on_asr/"

# The saved test accuracy is scaled by 100 to be shown as a percentage
speech_to_text_acc = np.load(RESULTS_SPEECH_TO_TEXT + "test_acc.npy")
print(f'Speech-to-Text Accuracy (word by word): {speech_to_text_acc * 100:.2f}%')

# Plot the performance of each model
plot_model_performance(RESULTS_BERT_ON_ASR, "BERT-on-ASR")

#### 3.3 Experiment by yourself

In [None]:
# Record a sample audio and convert it to text
# (speech_to_text is provided by utils; presumably it records from the
# microphone and returns the transcription — verify in utils)
phrase = speech_to_text()
print("Phrase: ", phrase)

In [None]:
# Initialize the tokenizer and model and load the weights
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(record_df.prompt.unique()),
).to(device)
# map_location remaps the saved tensors onto the current device, so weights
# saved on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): strict=False tolerates missing/unexpected keys; check the
# result displayed by this cell to confirm the weights were actually matched.
model.load_state_dict(torch.load(RESULTS_DIR + "base_bert.pt", map_location=device), strict=False)

In [None]:
# Predict the prompt for the sample phrase
inputs = tokenizer(phrase, return_tensors="pt")

# Inference only: no_grad avoids building the autograd graph
with torch.no_grad():
    logits = model(inputs.input_ids.to(device), attention_mask=inputs.attention_mask.to(device)).logits
# Index of the highest-scoring class for the single input phrase
preds = torch.argmax(logits, dim=1)

print("Predicted prompt: ", id_to_prompt[preds.item()])