## Libraries

In [None]:
import os
from os import path
import re
from tqdm import tqdm
import pickle
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import utils

import tensorflow as tf

from transformers import BertTokenizerFast, BertModel, TFBertModel
import torch

# Seed TensorFlow, NumPy and PyTorch from one shared constant so the value is
# stated once and every framework imported above starts from a fixed state.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Load pre-trained text model

### File Preprocessing

In [None]:
def file_preprocess(transcribed_dir="./diagnosis/train/hubert/transcribed-correction/",
                    drop_inv=False, sep=''):
  """Collect one row per transcript CSV: file name, joined transcript text, label.

  Every sub-folder of ``transcribed_dir`` is treated as a class folder, and every
  file inside it is read as a CSV with ``speaker`` and ``transcript_correction``
  columns.

  Args:
    transcribed_dir: Root directory that holds the class folders.
    drop_inv: When True, rows whose ``speaker`` is ``'INV'`` are left out of the
      text. Defaults to False because the previous implementation computed this
      filter and then discarded it, so every row has always been kept.
    sep: String placed between consecutive utterances. Defaults to ``''``
      (utterances are concatenated directly), which is the historical output.

  Returns:
    DataFrame with columns ``file``, ``text`` and ``label``. Folder ``'ad'`` is
    encoded as 1 and ``'cn'`` as 0; any other folder name is kept unchanged.
  """
  label_codes = {'ad': 1, 'cn': 0}
  records = []

  for folder in sorted(os.listdir(transcribed_dir)):
    folder_path = os.path.join(transcribed_dir, folder)
    # Stray files next to the class folders cannot be listed; skip them.
    if not os.path.isdir(folder_path):
      continue

    for file in sorted(os.listdir(folder_path)):
      df = pd.read_csv(os.path.join(folder_path, file))
      utterances = df[['speaker', 'transcript_correction']]

      if drop_inv:
        # Rows with a missing speaker compare unequal to 'INV' and are kept.
        utterances = utterances[utterances['speaker'] != 'INV']

      # Missing transcripts are skipped rather than joined as NaN.
      parts = utterances['transcript_correction'].dropna()

      records.append({
          'file': file,
          'text': sep.join(parts),
          'label': label_codes.get(folder, folder),
      })

  # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0.
  return pd.DataFrame(records, columns=['file', 'text', 'label'])

In [None]:
# Build the per-file transcript table (columns: file, text, label).
comb_df = file_preprocess()

### Data Preprocessing

In [None]:
def clean(text):
  """Return ``text`` converted to lowercase."""
  return text.lower()

In [None]:
def data_preprocess(train_df):
  """Return a copy of ``train_df`` without the ``file`` column and with lowercased text.

  The frame passed in is the one that gets processed; it is not modified.

  Args:
    train_df: DataFrame with at least ``file`` and ``text`` columns.

  Returns:
    New DataFrame holding the remaining columns, with ``text`` passed through
    ``clean``.
  """
  # drop() returns a new frame, so the assignment below does not touch train_df.
  train = train_df.drop(columns=['file'])
  train['text'] = train['text'].apply(clean)

  return train

In [None]:
# Text training frame: the file column is dropped and the transcripts are lowercased.
train = data_preprocess(comb_df)

### Load BERT model

In [None]:
# Load models
# PyTorch BERT encoder. output_hidden_states=True makes the forward pass also
# return the per-layer hidden states that extract_embeddings() indexes into.
text_model_torch = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Matching WordPiece tokenizer, used as a module-level name by extract_embeddings().
# NOTE(review): truncation/padding/max_length are given at load time here;
# presumably they only take effect when passed on the tokenizer call itself — confirm.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', truncation=True, padding=True, max_length=512)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if yo

### Extract BERT embeddings function

In [None]:
def extract_embeddings(text, max_length=512):
  """Return one fixed-size BERT vector for ``text``.

  The text is encoded with the module-level ``tokenizer`` and run through the
  module-level ``text_model_torch`` in eval mode without gradient tracking.
  The token vectors of the second-to-last hidden layer are then averaged.

  Args:
    text: The string to embed.
    max_length: Maximum number of tokens, special tokens included. Longer
      inputs are truncated instead of being fed to the model as-is; 512 is
      the limit of bert-base-uncased's position embeddings.

  Returns:
    1-D torch tensor: the mean over tokens of the second-to-last hidden layer.
  """
  # The tokenizer adds [CLS]/[SEP] itself and returns input_ids, token_type_ids
  # and attention_mask as named tensors with a leading batch dimension of 1.
  encoded = tokenizer(text, truncation=True, max_length=max_length, return_tensors='pt')

  # Eval mode: the model is used purely as a frozen feature extractor.
  text_model_torch.eval()

  with torch.no_grad():
    # Keyword unpacking hands every tensor to the forward argument of the same name.
    outputs = text_model_torch(**encoded)
    # Index 2 holds the hidden states of all layers (enabled at load time).
    hidden_states = outputs[2]

  # Second-to-last layer, first (only) batch item -> one vector per token.
  token_vecs = hidden_states[-2][0]

  # Average over the token axis to get a single sentence-level vector.
  sentence_embedding = torch.mean(token_vecs, dim=0)

  return sentence_embedding

### Load saved text model

In [None]:
# Load the model
text_model = tf.keras.models.load_model('./text_model2/')

# Freeze the loaded model before it is re-wrapped below.
text_model.trainable=False

# Remove classification layer
# pop() discards the last layer, so text_model.output below is the output of
# the layer that preceded it.
text_model.pop()

# Add normalization layer
# NOTE(review): this BatchNormalization layer is created after the freeze above
# and lives in the new Model, so presumably it remains trainable — confirm.
norm_layer2 = tf.keras.layers.BatchNormalization()(text_model.output)
# Rebind text_model to a functional Model: original input -> normalised features.
text_model = tf.keras.models.Model(text_model.input, norm_layer2)

In [None]:
# View the new text model

text_model.summary()

Model: "model_724"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_196_input (InputLayer  [(None, 768)]            0         
 )                                                               
                                                                 
 dense_196 (Dense)           (None, 32)                24608     
                                                                 
 leaky_re_lu_147 (LeakyReLU)  (None, 32)               0         
                                                                 
 dropout_147 (Dropout)       (None, 32)                0         
                                                                 
 dense_197 (Dense)           (None, 32)                1056      
                                                                 
 leaky_re_lu_148 (LeakyReLU)  (None, 32)               0         
                                                         

## Load pre-trained audio model

### Load features

In [None]:
# Load features
features_df = pd.read_csv('./Audio_Classification/features_df.csv')

### Load saved audio model

In [None]:
# Load model

audio_model = tf.keras.models.load_model('./audio_model2/')

# Freeze the loaded model before it is re-wrapped below.
audio_model.trainable=False

# Remove classification layer
# pop() discards the last layer, so audio_model.output below is the output of
# the layer that preceded it.
audio_model.pop()

# Add norm layer
# NOTE(review): this BatchNormalization layer is created after the freeze above
# and lives in the new Model, so presumably it remains trainable — confirm.
norm_layer = tf.keras.layers.BatchNormalization()(audio_model.output)
# Rebind audio_model to a functional Model: original input -> normalised features.
audio_model = tf.keras.models.Model(audio_model.input, norm_layer)

In [None]:
# View new audio model summary

audio_model.summary()

Model: "model_725"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_132_input (InputLayer  [(None, 88)]             0         
 )                                                               
                                                                 
 dense_132 (Dense)           (None, 32)                2848      
                                                                 
 leaky_re_lu_99 (LeakyReLU)  (None, 32)                0         
                                                                 
 dropout_99 (Dropout)        (None, 32)                0         
                                                                 
 dense_133 (Dense)           (None, 32)                1056      
                                                                 
 leaky_re_lu_100 (LeakyReLU)  (None, 32)               0         
                                                         

## Get audio and text inputs

In [None]:
# Text features
text_df = train

# Audio features
audio_df = features_df

To ensure that the audio and text data loaded for each file line up correctly, some preprocessing is done.

In [None]:
# Combine first
# axis=1 places the two frames side by side; join="inner" keeps only row index
# values present in both. Rows are matched on the index, not on the file name.
# NOTE(review): this assumes features_df rows are in the same order as the
# transcript rows — confirm against how features_df.csv was produced.
combined_df = pd.concat([audio_df, text_df], axis=1, join="inner")

In [None]:
# Split function
def preproc(combined_df):
  """Split the combined frame into features and labels.

  Args:
    combined_df: DataFrame containing a ``file`` column, one or more ``label``
      columns and the feature columns.

  Returns:
    Tuple ``(X, Y)``: ``X`` is ``combined_df`` without the ``file`` and
    ``label`` columns; ``Y`` is ``combined_df['label']`` (a DataFrame rather
    than a Series when the column name occurs more than once).
  """
  # Keyword form: the positional axis argument of drop() is gone in pandas 2.0.
  Y = combined_df['label']
  X = combined_df.drop(columns=['file', 'label'])

  return X, Y

In [None]:
# Split to train/labels
X_train, y_train = preproc(combined_df)

  combined_df = combined_df.drop('file', 1)
  X = combined_df.drop('label', 1)


In [None]:
# Remove duplicates
# columns.duplicated() flags every repeat of an already-seen column name, so the
# negated mask keeps only the first column under each name.
# NOTE(review): this needs y_train to be a DataFrame (e.g. two 'label' columns
# after the concat); a Series has no .columns — confirm.
y_train = y_train.loc[:,~y_train.columns.duplicated()]

In [None]:
# Split into audio/text

# X for audio, X2 for text
# Keyword form of drop(): the positional axis argument is gone in pandas 2.0.
X2_train = X_train['text'] # Text
X_train = X_train.drop(columns=['text']) # Audio

  X_train = X_train.drop(['text'], 1) # Audio


In [None]:
# Normalize audio features

# fit_transform returns a NumPy array, so X_train stops being a DataFrame here.
# NOTE(review): the scaler is fitted on every sample before the K-fold search
# further down, so validation folds contribute to the mean/std — confirm this
# is acceptable for the reported scores.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [None]:
# Extract bert embeddings
# One extract_embeddings() call per transcript; results are collected in order.
X2_train_embedd = []
all_sentences = X2_train

for sent in all_sentences:

  embedd = extract_embeddings(sent)
  # Move to CPU and convert the torch tensor to a NumPy vector.
  embedd = embedd.cpu().detach().numpy()
  X2_train_embedd.append(embedd)

# Normalize text features

# This rebinds `scaler`: the audio scaler fitted earlier is no longer reachable
# under that name. fit_transform turns the list of vectors into a 2-D array.
scaler = StandardScaler()
X2_train_embedd = scaler.fit_transform(X2_train_embedd)

In [None]:
# Convert the labels to a NumPy array before they are handed to kgs.search().
y_train = y_train.to_numpy()

## Bilinear Pooling

### Install dependencies

In [None]:
!pip install --upgrade keras-hypetune

### Libraries

In [None]:
from kerashypetune import KerasGridSearchCV  

from tensorflow.keras import regularizers, Model

from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Dense, Dropout, LeakyReLU, Layer, Input, Reshape, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.regularizers import l2

from sklearn.model_selection import GridSearchCV, KFold, LeaveOneOut
from sklearn.metrics import classification_report, confusion_matrix 
from keras.wrappers.scikit_learn import KerasClassifier

# from keras import backend as K

### Bilinear class functions

In [None]:
class bili_regularizer_l2(regularizers.Regularizer):
    '''
        L2-style penalty on the low-rank factors of the bilinear layer.

        The weight tensor is expected to stack two factors, weights[0] and
        weights[1], each indexed as (output, channel, rank). For every output
        the penalty is trace(W0^T W0 . W1^T W1), which equals the squared
        Frobenius norm of the product W0 W1^T; the per-output values are
        summed and scaled by `strength`.

        strength: multiplier applied to the summed penalty.
    '''
    def __init__(self, strength):
        self.strength = strength

    def __call__(self, weights):
        w0 = weights[0]
        w1 = weights[1]

        # Gram matrices of each factor, one (rank x rank) matrix per output.
        T1 = tf.matmul(tf.transpose(w0, perm=[0,2,1]), w0)
        T2 = tf.matmul(tf.transpose(w1, perm=[0,2,1]), w1)
        # One scalar per output unit.
        z = tf.linalg.trace(tf.matmul(T1, T2))

        return self.strength * tf.reduce_sum(z)

    def get_config(self):
        # Lets Keras rebuild the regularizer when a model using it is serialised.
        return {'strength': float(self.strength)}

In [None]:
''' Credit: https://github.com/konstantinkutzkov/bilinear_pooling/blob/main/bilinear_pooling.ipynb '''

class bilinear_layer(Layer):
    """Low-rank bilinear pooling of two feature tensors.

    Takes ``[X, Y]`` as input and produces ``num_outputs`` values per sample.
    Each output is a bilinear form between X and Y whose weight matrix is
    factorised into two ``(channels, rank)`` factors, plus a bias.

    Args:
        num_outputs: Number of bilinear outputs per sample.
        channels_X: Number of features coming from the first network.
        channels_Y: Number of features coming from the second network. Must
            equal ``channels_X`` because both factors share one weight tensor.
        regularizer: Regularizer applied to the stacked factor tensor.
        d: Dimensionality of the feature maps; inputs are reshaped to
            ``(channels, d*d)``.
        rank: Rank of the low-rank factors.
        seed: Seed for the random-normal initialisation of the factors.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).

    Raises:
        ValueError: If ``channels_X`` and ``channels_Y`` differ.
    """
    def __init__(self, num_outputs, channels_X, channels_Y, regularizer, d, rank, seed=42, **kwargs):
        # Forward name/dtype/trainable so from_config(get_config()) can rebuild the layer.
        super().__init__(**kwargs)
        if channels_X != channels_Y:
            # Both factors live in one (2, outputs, channels_X, rank) tensor, so
            # unequal channel counts could only fail later inside matmul.
            raise ValueError(
                "bilinear_layer requires channels_X == channels_Y, got "
                f"{channels_X} and {channels_Y}")
        self.num_outputs = num_outputs
        self.channels_X = channels_X # the number of features in the first NN
        self.channels_Y = channels_Y # the number of features in the second NN
        self.d = d # the dimensionality of the feature maps
        self.rank = rank # the rank of the low-rank matrices
        self.seed = seed # seed for the factor initialiser
        self.kernel_regularizer = regularizer

    def build(self, input_shape):
        # w[0] and w[1] are the two low-rank factors, one pair per output.
        self.w = self.add_weight(shape=(2, self.num_outputs, self.channels_X, self.rank),
                                 initializer=tf.keras.initializers.RandomNormal(seed=self.seed),
                                 trainable=True,
                                 regularizer=self.kernel_regularizer,
                                 name='bilinear_weight')
        # Bias created through add_weight like the factors, zero-initialised.
        self.b = self.add_weight(shape=(self.num_outputs,),
                                 initializer="zeros",
                                 trainable=True,
                                 name='bilinear_bias')

    def get_config(self):
        # Keys mirror the constructor arguments so the config round-trips
        # through from_config. The regularizer object is stored as given.
        config = super().get_config()
        config.update({
            "num_outputs": self.num_outputs,
            "channels_X": self.channels_X,
            "channels_Y": self.channels_Y,
            "regularizer": self.kernel_regularizer,
            "d": self.d,
            "rank": self.rank,
            "seed": self.seed,
        })
        return config

    def call(self, inputs):
        X, Y = inputs[0], inputs[1]
        # Add a singleton axis so the inputs broadcast against the per-output factors.
        X = tf.reshape(X, (-1, 1, self.channels_X, self.d*self.d))
        Y = tf.reshape(Y,  (-1, 1, self.channels_Y, self.d*self.d))
        # Project X onto the first factor and Y onto the second.
        T1 = tf.matmul(tf.transpose(X, perm=[0,1,3,2]), self.w[0])
        T2 = tf.matmul(tf.transpose(self.w[1], perm=[0,2,1]), Y)

        # Trace of the product gives one scalar per output, averaged over d*d positions.
        z = tf.linalg.trace(tf.matmul(T1, T2))/(self.d*self.d) + self.b

        return z



In [None]:
# Audio features input
# 88 matches the input width shown in the audio model summary above.
audio_input = Input(shape=(88,), name='audio')
audio = audio_model(audio_input)

# Text features input
# 768 matches the input width shown in the text model summary above.
# NOTE(review): the text input is declared float64 while the audio input uses
# the default dtype — presumably to match the scaled embeddings; confirm.
# bilinear_model() reads audio, text, audio_input and text_input as module-level names.
text_input = Input(shape=(768,), name='text', dtype='float64')
text = text_model(text_input)

### Define the bilinear pooling model

In [None]:

# Bilinear layer

def bilinear_model(param):
  """Build and compile the bilinear-pooling fusion classifier.

  Uses the module-level tensors ``audio``/``text`` and inputs
  ``audio_input``/``text_input``.

  Args:
    param: Mapping with keys ``'bili_reg'`` (regularizer strength),
      ``'bili_out_dim'`` (number of bilinear outputs) and ``'learning_rate'``.

  Returns:
    Compiled Keras model with one sigmoid output, binary cross-entropy loss
    and accuracy metric, set to run eagerly.
  """
  # Fuse the two 32-wide branch outputs with a rank-1 bilinear layer.
  fusion = bilinear_layer(
      regularizer=bili_regularizer_l2(strength=param['bili_reg']),
      num_outputs=param['bili_out_dim'],
      channels_X=32,
      channels_Y=32,
      rank=1,
      d=1,
  )
  fused = fusion([audio, text])

  prediction = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(fused)

  net = tf.keras.Model(inputs=[audio_input, text_input], outputs=prediction)
  net.compile(
      loss="binary_crossentropy",
      optimizer=Adam(learning_rate=param['learning_rate']),
      metrics=["accuracy"],
      run_eagerly=True,
  )

  return net

### Train model

In [None]:
# For debugging
# params = {
# 'batch_size':[1], 
# 'epochs':[1],
# 'bili_reg':[0.5],
# 'bili_out_dim':[32],
# 'learning_rate':[5e-2],
# }

# Single-point grid. bili_reg, bili_out_dim and learning_rate are read by
# bilinear_model(); batch_size and epochs are presumably forwarded to fit() by
# the search wrapper — confirm against keras-hypetune.
params = {
'batch_size':[8], 
'epochs':[50],
'bili_reg':[0.5],
'bili_out_dim':[16],
'learning_rate':[1e-4],
}

# The builder function itself is passed, not a built model.
model = bilinear_model

# No random_state is given, so the shuffle depends on NumPy's global RNG state.
cv10 = KFold(n_splits=10, shuffle=True)

# Select on validation accuracy, higher is better.
kgs = KerasGridSearchCV(model, params, monitor='val_accuracy', cv=cv10, greater_is_better=True, tuner_verbose=1)

# Inputs are ordered [audio, text], matching the model's inputs list.
kgs.search([X_train,X2_train_embedd], y_train)

best_model = kgs.folds_best_models

# Per-fold scores from the search. folds_scores is a dict, so .values() has to
# be called; storing the bare attribute would keep the method object instead.
combined_scores = list(kgs.folds_scores.values())
c_scores = list(combined_scores)

# Mean and spread across folds.
mean_arr = np.mean(c_scores,axis=0)
std_arr = np.std(c_scores,axis=0)

print('Mean: ', mean_arr)
print('Std: ', std_arr)


##################
###  Fold 001  ###
##################

1 trials detected for ('batch_size', 'epochs', 'bili_reg', 'bili_out_dim', 'learning_rate')

***** (1/1) *****
Search({'batch_size': 8, 'epochs': 50, 'bili_reg': 0.5, 'bili_out_dim': 16, 'learning_rate': 0.0001})
SCORE: 0.58824 at epoch 8

##################
###  Fold 002  ###
##################

1 trials detected for ('batch_size', 'epochs', 'bili_reg', 'bili_out_dim', 'learning_rate')

***** (1/1) *****
Search({'batch_size': 8, 'epochs': 50, 'bili_reg': 0.5, 'bili_out_dim': 16, 'learning_rate': 0.0001})
SCORE: 0.82353 at epoch 19

##################
###  Fold 003  ###
##################

1 trials detected for ('batch_size', 'epochs', 'bili_reg', 'bili_out_dim', 'learning_rate')

***** (1/1) *****
Search({'batch_size': 8, 'epochs': 50, 'bili_reg': 0.5, 'bili_out_dim': 16, 'learning_rate': 0.0001})
SCORE: 0.88235 at epoch 37

##################
###  Fold 004  ###
##################

1 trials detected for ('batch_size', 'epochs'

## Concatenation

### Define the concatenation model

In [None]:
# Audio features input
# Fresh Input tensors for the concatenation model; these rebind the names that
# were used for the bilinear model earlier.
audio_input = Input(shape=(88,), name='audio')
audio = audio_model(audio_input)

# Text features input
# concat_model() reads audio, text, audio_input and text_input as module-level names.
text_input = Input(shape=(768,), name='text', dtype='float64')
text = text_model(text_input)

In [None]:

def concat_model(param):
  """Build and compile the concatenation fusion classifier.

  Uses the module-level tensors ``audio``/``text`` and inputs
  ``audio_input``/``text_input``.

  Args:
    param: Mapping with key ``'lr'`` (Adam learning rate).

  Returns:
    Compiled Keras model with one sigmoid output, binary cross-entropy loss
    and accuracy metric.
  """
  # Join the two branch outputs along the feature axis.
  merged = tf.keras.layers.Concatenate(axis=1)([audio, text])

  prediction = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(merged)

  net = tf.keras.Model(inputs=[audio_input, text_input], outputs=prediction)
  net.compile(
      loss="binary_crossentropy",
      optimizer=Adam(learning_rate=param['lr']),
      metrics=["accuracy"],
  )

  return net

### Train and tune the model

In [None]:
# Grid search train the model

# Single-point grid. 'lr' is read by concat_model(); batch_size and epochs are
# presumably forwarded to fit() by the search wrapper — confirm.
params = {
'batch_size':[1], 
'epochs':[50],
'lr':[1e-5],
}

# The builder function itself is passed, not a built model.
model = concat_model

# No random_state is given, so the shuffle depends on NumPy's global RNG state.
cvf = KFold(n_splits=10, shuffle=True)

# Rebinds kgs: the bilinear search object is no longer reachable under this name.
kgs = KerasGridSearchCV(model, params, monitor='val_accuracy', cv=cvf, greater_is_better=True, tuner_verbose=1)

# Inputs are ordered [audio, text], matching the model's inputs list.
kgs.search([X_train,X2_train_embedd], y_train)

best_concat_model = kgs.folds_best_models



##################
###  Fold 001  ###
##################

1 trials detected for ('batch_size', 'epochs', 'lr')

***** (1/1) *****
Search({'batch_size': 1, 'epochs': 50, 'lr': 1e-05})
SCORE: 0.82353 at epoch 1

##################
###  Fold 002  ###
##################

1 trials detected for ('batch_size', 'epochs', 'lr')

***** (1/1) *****
Search({'batch_size': 1, 'epochs': 50, 'lr': 1e-05})
SCORE: 1.0 at epoch 6

##################
###  Fold 003  ###
##################

1 trials detected for ('batch_size', 'epochs', 'lr')

***** (1/1) *****
Search({'batch_size': 1, 'epochs': 50, 'lr': 1e-05})
SCORE: 0.94118 at epoch 21

##################
###  Fold 004  ###
##################

1 trials detected for ('batch_size', 'epochs', 'lr')

***** (1/1) *****
Search({'batch_size': 1, 'epochs': 50, 'lr': 1e-05})
SCORE: 1.0 at epoch 1

##################
###  Fold 005  ###
##################

1 trials detected for ('batch_size', 'epochs', 'lr')

***** (1/1) *****
Search({'batch_size': 1, 'epochs': 5

In [None]:
# Per-fold scores from the search. folds_scores is a dict, so .values() has to
# be called; storing the bare attribute would keep the method object instead.
combined_scores = list(kgs.folds_scores.values())
c_scores = list(combined_scores)

# Mean and spread across folds.
mean_arr = np.mean(c_scores,axis=0)
std_arr = np.std(c_scores,axis=0)

print('Mean: ', mean_arr)
print('Std: ', std_arr)

Mean:  [0.755515]
Std:  [0.22212376]
