### 1. ENCODE THE TEXT USING BERT

Python 3.8

De forma local optimiza el uso del GPU

In [31]:
text = "the scene is in the in the kitchen . the mother is wiping dishes and the water is running on the floor . a child is trying to get a boy is trying to get cookies outta a jar and hes about to tip over on a stool . uh the little girl is reacting to his falling . uh it seems to be summer out . the window is open . the curtains are blowing . it must be a gentle breeze . theres grass outside in the garden . uh mothers finished certain of the the dishes . kitchens very tidy . the mother seems to have nothing in the house to eat except cookies in the cookie jar . uh the children look to be almost about the same size . perhaps theyre twins . theyre dressed for summer warm weather . um you want more the mothers in a short sleeve dress . Ill hafta say its warm ."

import tensorflow as tf
from transformers import BertConfig, TFBertModel, BertTokenizer

# Checkpoint shared by the tokenizer and the encoder so both use the same vocabulary.
BERT_CHECKPOINT = "bert-base-uncased"

# Encoder hyper-parameters, spelled out explicitly for reference.
custom_config = BertConfig(
    vocab_size=30522,
    num_attention_heads=12,
    num_hidden_layers=12,
    attention_probs_dropout_prob=0.1,
    hidden_size=768,
    intermediate_size=3072,
    hidden_dropout_prob=0.1,
    # The pretrained checkpoint was trained with GELU feed-forward activations;
    # "relu" would not match the weights loaded below.
    hidden_act="gelu",
    max_position_embeddings=512
)

# Load the pretrained weights. Building the model from the config alone
# (`TFBertModel(custom_config)`) only creates the architecture with randomly
# initialised weights, so the resulting "embeddings" would carry no linguistic
# information.
model = TFBertModel.from_pretrained(BERT_CHECKPOINT, config=custom_config)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Tokenize the single sample transcript; anything beyond 512 tokens is truncated.
inputs = tokenizer(
    text,
    return_tensors="tf",
    padding=True,
    truncation=True,
    max_length=512
)

bert_outputs = model(**inputs)

# Per-token embeddings and the pooled sentence-level output.
bert_last_hidden_state = bert_outputs.last_hidden_state
bert_pooled_output = bert_outputs.pooler_output

print("Last Hidden State Shape:", bert_last_hidden_state.shape)

# shape: number of samples is 1, number of tokens is 171 and size of the embedding vector is 768

Last Hidden State Shape: (1, 171, 768)


### 2.1. CNN TEXT MODEL

In [None]:
class TextCNN(tf.keras.Model):
    """Multi-kernel text CNN applied on top of token embeddings.

    Four parallel convolutions (window heights 5, 10, 15 and 20 tokens, each
    spanning the full embedding width) are globally max-pooled, concatenated
    and fused by a dense layer followed by dropout.

    Args:
        dropout_prob: Dropout rate applied to the fused feature vector.
        embedding_dim: Width of each token embedding; also the kernel width.
        num_filters: Number of filters in each convolution.
        fc_units: Size of the fused output feature vector.
    """

    def __init__(self, dropout_prob=0.5, embedding_dim=768, num_filters=130, fc_units=260):
        super().__init__()

        # Convolution kernels: each one covers `k` consecutive tokens and the
        # whole embedding, so the width axis collapses to 1 after convolution.
        self.conv1 = tf.keras.layers.Conv2D(num_filters, (5, embedding_dim), activation='relu')
        self.conv2 = tf.keras.layers.Conv2D(num_filters, (10, embedding_dim), activation='relu')
        self.conv3 = tf.keras.layers.Conv2D(num_filters, (15, embedding_dim), activation='relu')
        self.conv4 = tf.keras.layers.Conv2D(num_filters, (20, embedding_dim), activation='relu')

        # Max pooling over all remaining positions -> one value per filter.
        self.pool = tf.keras.layers.GlobalMaxPooling2D()

        # Fusion layer
        self.fc = tf.keras.layers.Dense(fc_units, activation='relu')
        self.dropout = tf.keras.layers.Dropout(dropout_prob)

    def call(self, inputs, training=None):
        """Compute the fused CNN feature vector.

        Args:
            inputs: Token embeddings; assumed to be (batch, seq_len, embedding_dim)
                as produced by the BERT last hidden state. With the default
                unpadded convolutions, seq_len must be at least 20.
            training: Forwarded to the dropout layer. ``None`` (default) lets
                Keras decide from the calling context.

        Returns:
            Tensor of shape (batch, fc_units).
        """
        # Add a trailing channel axis so Conv2D sees (batch, seq_len, embedding_dim, 1).
        x = tf.expand_dims(inputs, -1)

        # Convolve with every kernel size and pool each result to (batch, num_filters).
        pooled = [
            self.pool(conv(x))
            for conv in (self.conv1, self.conv2, self.conv3, self.conv4)
        ]

        # Fusion of all the features.
        fused_features = tf.concat(pooled, axis=-1)

        feature_vector = self.fc(fused_features)
        # Pass the flag explicitly so dropout is only active when training is requested.
        feature_vector = self.dropout(feature_vector, training=training)

        return feature_vector

# Instantiate the CNN branch with its default arguments and run it once on the
# sample BERT token embeddings to check the output dimensions.
textcnn_model = TextCNN()
feature_vector_cnn = textcnn_model(bert_last_hidden_state)  
print("Feature Vector Shape:", feature_vector_cnn.shape)

Feature Vector Shape: (1, 260)


### 2.2. LSTM MODEL

In [11]:
lstm_units = 260
dropout_rate = 0.5

lstm_model = tf.keras.Sequential([
    # The time axis is left as None so the model accepts sequences of any
    # length. Pinning it to the sample's token count would make Keras reject
    # (or warn about, depending on the version) inputs padded to another
    # length, such as the 512-token tensors used for training.
    tf.keras.layers.InputLayer(input_shape=(None, bert_last_hidden_state.shape[-1])),
    tf.keras.layers.LSTM(
        units=lstm_units,
        activation='relu',
        return_sequences=False      # keep only the final state -> (batch, lstm_units)
    ),
    tf.keras.layers.Dropout(dropout_rate)
])

# Run once on the sample embeddings to check the output dimensions.
lstm_features = lstm_model(bert_last_hidden_state)
print("Shape del vector de características (LSTM):", lstm_features.shape)

Shape del vector de características (LSTM): (1, 260)


Now we have to concatenate the CNN and LSTM feature vectors.

In [12]:
# Fuse both branches along the feature axis: LSTM features first, CNN features second.
concatenated_vector = tf.concat(
    values=[lstm_features, feature_vector_cnn],
    axis=-1,
)
print("Concatenated Feature Vector Shape:", concatenated_vector.shape)

Concatenated Feature Vector Shape: (1, 520)


Next, the fused features are passed through a fully connected layer, and a softmax classifier performs the classification. The dimension of the output vector must equal the number of categories (2 in this study). Finally, the feature vector $F$ is used for the classification: $y = \mathrm{softmax}(W_c F + b_c)$

In [14]:
num_classes = 2
input_dim = 520

# y = softmax(Wc.F + bc): a single dense softmax layer on top of the fused vector.
classification_model = tf.keras.Sequential()
classification_model.add(tf.keras.layers.InputLayer(input_shape=(input_dim,)))
classification_model.add(
    tf.keras.layers.Dense(units=num_classes, activation='softmax')
)

# Untrained forward pass, only to confirm that the dimensions line up.
y_pred = classification_model(concatenated_vector)

# Probability of belonging to each class.
print("Prediction class:", y_pred.numpy())
print("Shape:", y_pred.shape)

Prediction class: [[0.9726935  0.02730652]]
Shape: (1, 2)


#### Training the model

Example: https://www.tensorflow.org/tutorials/keras/classification?

We have not trained the model, so the last prediction is randomly made.

For the training of the network, the paper mentions the following:

- epoch: 10
- batch size: 16
- learning rate: 1e-5
- dropout: 0.2
- max grad norm: 10
- train:test split ratio: 7:3
- 10 consecutive runs
- performance: accuracy
- loss: crossentropy
- optimizer: AdamW, where W stands for weight decay


In [37]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertConfig

# Checkpoint shared by the tokenizer and the encoder so both use the same vocabulary.
BERT_CHECKPOINT = "bert-base-uncased"

# NOTE(review): absolute, machine-specific path; point this at your own copy of
# the cleaned transcripts before running on another machine.
TRANSCRIPTS_CSV = "C:\\Users\\lclai\\Desktop\\transcripts_cleaned.csv"

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Encoder hyper-parameters, spelled out explicitly for reference.
custom_config = BertConfig(
    vocab_size=30522,
    num_attention_heads=12,
    num_hidden_layers=12,
    attention_probs_dropout_prob=0.1,
    hidden_size=768,
    intermediate_size=3072,
    hidden_dropout_prob=0.1,
    # The pretrained checkpoint was trained with GELU feed-forward activations;
    # "relu" would not match the weights loaded below.
    hidden_act="gelu",
    max_position_embeddings=512
)

# Load the pretrained weights. `TFBertModel(custom_config)` would only build the
# architecture with randomly initialised weights, so every embedding fed to the
# downstream classifier would be noise.
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT, config=custom_config)

# Keep only the target column and the transcript text.
data = pd.read_csv(TRANSCRIPTS_CSV)
data = data[["label", "clean_transcripts"]]

def preprocess_and_get_bert_embeddings(text, tokenizer, model, max_length=512):
    """Tokenize ``text`` and run the result through ``model``.

    Args:
        text: Input passed to the tokenizer as its first positional argument.
        tokenizer: Callable invoked with TensorFlow tensor output, padding to
            ``max_length`` and truncation enabled.
        model: Callable that receives the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``
            and ``pooler_output``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(last_hidden_state, pooler_output)`` taken from the model output.
    """
    tokenizer_options = {
        "return_tensors": "tf",
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
    }
    encoded = tokenizer(text, **tokenizer_options)

    outputs = model(**encoded)
    # Last hidden layer and pooled output.
    return outputs.last_hidden_state, outputs.pooler_output

# NOTE(review): these two lists are never filled in this cell; they are kept
# only so the names stay defined for any later cell that references them.
input_ids = []
attention_masks = []
last_hidden_states = []
pooler_outputs = []

# Dedicated loop names are used so the loop does not overwrite the
# notebook-level `text` sample defined in the first cell.
for transcript in data['clean_transcripts']:
    hidden_state, pooled = preprocess_and_get_bert_embeddings(transcript, tokenizer, bert_model)
    last_hidden_states.append(hidden_state.numpy())
    pooler_outputs.append(pooled.numpy())

# Each per-transcript output keeps its own batch axis of size 1, so the stacked
# tensors are (n_transcripts, 1, max_length, hidden) and (n_transcripts, 1, hidden).
# NOTE(review): the CNN/LSTM branches take (batch, seq_len, hidden); the
# singleton axis presumably has to be squeezed before feeding them - confirm.
last_hidden_states_tensor = tf.convert_to_tensor(last_hidden_states, dtype=tf.float32)
pooler_outputs_tensor = tf.convert_to_tensor(pooler_outputs, dtype=tf.float32)

# Sanity check: exactly one embedding per transcript row.
assert last_hidden_states_tensor.shape[0] == len(data)
assert pooler_outputs_tensor.shape[0] == len(data)

print(f"Last Hidden States Tensor Shape: {last_hidden_states_tensor.shape}")
print(f"Pooler Outputs Tensor Shape: {pooler_outputs_tensor.shape}")


Last Hidden States Tensor Shape: (101, 1, 512, 768)
Pooler Outputs Tensor Shape: (101, 1, 768)


In [52]:
from sklearn.model_selection import train_test_split

# Features are the stacked BERT token embeddings; targets are the "label" column.
X = last_hidden_states_tensor.numpy()
y = data["label"].values

# 70/30 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"x train shape: {X_train.shape}, x test shape: {X_test.shape}")
print(f"y train shape: {y_train.shape}, y test shape: {y_test.shape}")


x train shape: (70, 1, 512, 768), x test shape: (31, 1, 512, 768)
y train shape: (70,), y test shape: (31,)
