## Get drive data

In [1]:
from google.colab import drive
import os

drive.mount('/content/drive/')
save_path = '/content/drive/MyDrive/nlu'
os.makedirs(save_path, exist_ok=True)

Mounted at /content/drive/


## Imports

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, Dropout, Dot, Softmax, Reshape, Permute,
    Concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D,
    Lambda, BatchNormalization, LSTM, Bidirectional
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from transformers import TFAutoModel, AutoTokenizer

from keras.saving import register_keras_serializable

## Load data

In [3]:
train_data = pd.read_csv('/content/drive/MyDrive/nlu/train.csv')
valid_data = pd.read_csv('/content/drive/MyDrive/nlu/dev.csv')

## Calculate Class Weights

In [4]:
def calculate_normalized_class_weights(train_data):
    # Extract labels from training data
    y_train = train_data['label'].values

    # Calculate class distribution
    class_counts = np.bincount(y_train)
    total_samples = len(y_train)
    class_percentages = class_counts / total_samples * 100

    # Print distribution statistics
    for class_idx, (count, percentage) in enumerate(zip(class_counts, class_percentages)):
        print(f"Class {class_idx}: {count} samples ({percentage:.2f}%)")

    # Calculate weights with a small epsilon to avoid division by zero
    epsilon = 1e-6
    raw_class_weights = total_samples / (len(class_counts) * (class_counts + epsilon))

    # Normalize weights
    normalized_class_weights = raw_class_weights / np.sum(raw_class_weights)

    # Format as dictionary
    class_weights = {
        i: normalized_class_weights[i] for i in range(len(normalized_class_weights))
    }

    return class_weights


class_weights = calculate_normalized_class_weights(train_data)

Class 0: 11784 samples (48.23%)
Class 1: 12648 samples (51.77%)


## Hyperparameters

In [5]:
MODEL_NAME = 'microsoft/deberta-v3-base'
MAX_SEQUENCE_LENGTH = 128
HIDDEN_SIZE = 512
NUM_CLASSES = 2
BATCH_SIZE = 32
EPOCHS = 15
DROPOUT_RATE = 0.3

## Get pre-trained tokenizer and model

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
transformer_model = TFAutoModel.from_pretrained(MODEL_NAME, trainable=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



tf_model.h5:   0%|          | 0.00/736M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at microsoft/deberta-v3-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


## Model Architecture

In [7]:
def create_decomposable_attention_model(transformer_model, hidden_dim, dropout_rate, max_seq_length, num_classes=2):

    # Define inputs
    input_ids = Input(shape=(max_seq_length,), dtype='int32', name='input_ids')
    attention_mask = Input(shape=(max_seq_length,), dtype='int32', name='attention_mask')
    token_type_ids = Input(shape=(max_seq_length,), dtype='int32', name='token_type_ids')

    # Convert attention_mask to float32
    @keras.saving.register_keras_serializable()
    def cast_to_float(x):
      return tf.cast(x, tf.float32)

    attention_mask_float = Lambda(cast_to_float)(attention_mask)

    # Method to handle the transformer model call in sequence_output layer
    @keras.saving.register_keras_serializable()
    def get_transformer_embeddings(inputs):
        return transformer_model(
            input_ids=inputs[0],
            attention_mask=inputs[1],
            token_type_ids=inputs[2]
        )[0]

    embedding_dim = transformer_model.config.hidden_size

    # Get transformer embeddings
    sequence_output = Lambda(
        get_transformer_embeddings,
        output_shape=(max_seq_length, embedding_dim)
    )([input_ids, attention_mask, token_type_ids])

    # Split premise and hypothesis with token_type_ids
    @register_keras_serializable()
    def cast_to_float0(x):
      return tf.cast(tf.equal(x, 0), tf.float32)

    @register_keras_serializable()
    def cast_to_float1(x):
      return tf.cast(tf.equal(x, 1), tf.float32)

    premise_mask = Lambda(cast_to_float0)(token_type_ids)
    hypothesis_mask = Lambda(cast_to_float1)(token_type_ids)

    @register_keras_serializable()
    def multiply_pair(x):
        return x[0] * x[1]

    # Include the attention mask
    premise_mask = Lambda(multiply_pair)([premise_mask, attention_mask_float])
    hypothesis_mask = Lambda(multiply_pair)([hypothesis_mask, attention_mask_float])

    @register_keras_serializable()
    def expand_last_dim(x):
        return tf.expand_dims(x, axis=-1)

    # Reshape masks
    premise_mask_expanded = Lambda(expand_last_dim)(premise_mask)
    hypothesis_mask_expanded = Lambda(expand_last_dim)(hypothesis_mask)

    # Extract premise and hypothesis embeddings
    premise_embedded = Lambda(multiply_pair)([sequence_output, premise_mask_expanded])
    hypothesis_embedded = Lambda(multiply_pair)([sequence_output, hypothesis_mask_expanded])

    # Encoding layer
    premise_encoded = Dense(hidden_dim, activation='tanh')(premise_embedded)
    premise_encoded = Dropout(dropout_rate)(premise_encoded)

    hypothesis_encoded = Dense(hidden_dim, activation='tanh')(hypothesis_embedded)
    hypothesis_encoded = Dropout(dropout_rate)(hypothesis_encoded)

    # Attention mechanism
    @register_keras_serializable()
    def compute_attention(inputs):
        p_enc, h_enc = inputs
        return tf.matmul(p_enc, tf.transpose(h_enc, perm=[0, 2, 1]))

    attention_scores = Lambda(compute_attention)([premise_encoded, hypothesis_encoded])

    # Apply softmax to get attention weights
    @register_keras_serializable()
    def apply_softmax_1(x):
        return tf.nn.softmax(x, axis=-1)
    @register_keras_serializable()
    def apply_softmax_2(x):
        return tf.nn.softmax(x, axis=1)

    premise_attention = Lambda(apply_softmax_1)(attention_scores)
    hypothesis_attention = Lambda(apply_softmax_2)(attention_scores)

    # Get attended vectors
    @register_keras_serializable()
    def get_attended_1(inputs):
        att, h_enc = inputs
        return tf.matmul(att, h_enc)
    @register_keras_serializable()
    def get_attended_2(inputs):
        att, p_enc = inputs
        return tf.matmul(tf.transpose(att, perm=[0, 2, 1]), p_enc)

    attended_hypothesis = Lambda(get_attended_1)([premise_attention, hypothesis_encoded])
    attended_premise = Lambda(get_attended_2)([hypothesis_attention, premise_encoded])

    # Combine original and attended vectors
    enhanced_premise = Concatenate()([premise_encoded, attended_hypothesis])
    enhanced_hypothesis = Concatenate()([hypothesis_encoded, attended_premise])

    # Compare step
    compared_premise = Dense(hidden_dim, activation='tanh')(enhanced_premise)
    compared_premise = Dropout(dropout_rate)(compared_premise)

    compared_hypothesis = Dense(hidden_dim, activation='tanh')(enhanced_hypothesis)
    compared_hypothesis = Dropout(dropout_rate)(compared_hypothesis)

    # Aggregate step
    @register_keras_serializable()
    def pooling_with_mask(inputs):
        compared, mask = inputs
        sum_values = tf.reduce_sum(compared * mask, axis=1)
        count = tf.reduce_sum(mask[:,:,0], axis=1, keepdims=True)
        return sum_values / (count + 1e-10)

    aggregated_premise = Lambda(pooling_with_mask)([compared_premise, premise_mask_expanded])
    aggregated_hypothesis = Lambda(pooling_with_mask)([compared_hypothesis, hypothesis_mask_expanded])

    # Combine aggregated vectors
    merged = Concatenate()([aggregated_premise, aggregated_hypothesis])

    # Final classification layers
    dense = Dense(hidden_dim, activation='tanh')(merged)
    dense = Dropout(dropout_rate)(dense)
    output = Dense(num_classes, activation='softmax')(dense)

    # Create model
    model = Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=output)

    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

### Prepare training and validation data

In [8]:
def prepare_deberta_data(data, tokenizer, max_length=MAX_SEQUENCE_LENGTH):
    # Encode premises
    premises = data['premise'].fillna('').astype(str).tolist()
    hypotheses = data['hypothesis'].fillna('').astype(str).tolist()

    # Tokenize inputs
    encoded = tokenizer(
        premises, hypotheses,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='np'
    )

    # Convert labels to one-hot encoding
    labels = to_categorical(data['label'].values, num_classes=2)
    print(labels.shape)

    return encoded, labels

# Prepare data
train_inputs, train_labels = prepare_deberta_data(
    train_data,
    tokenizer
)

valid_inputs, valid_labels = prepare_deberta_data(
    valid_data,
    tokenizer
)

(24432, 2)
(6736, 2)


## Create and train model

In [9]:
# Create model
model = create_decomposable_attention_model(
    transformer_model=transformer_model,
    hidden_dim=HIDDEN_SIZE,
    dropout_rate=DROPOUT_RATE,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_classes=NUM_CLASSES
)
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('decomposable_attentions.keras', save_best_only=True)
]

# Train model
history = model.fit(
    [
        train_inputs['input_ids'],
        train_inputs['attention_mask'],
        train_inputs['token_type_ids'],
    ],
    train_labels,
    validation_data=(
        [
          valid_inputs['input_ids'],
          valid_inputs['attention_mask'],
          valid_inputs['token_type_ids'],
        ],
        valid_labels
    ),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks
)

Epoch 1/15
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 84ms/step - accuracy: 0.5845 - loss: 0.6854 - val_accuracy: 0.7286 - val_loss: 0.5386
Epoch 2/15
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.7246 - loss: 0.5413 - val_accuracy: 0.7933 - val_loss: 0.4467
Epoch 3/15
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.7677 - loss: 0.4818 - val_accuracy: 0.8177 - val_loss: 0.4026
Epoch 4/15
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.8004 - loss: 0.4322 - val_accuracy: 0.8358 - val_loss: 0.3782
Epoch 5/15
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.8137 - loss: 0.4098 - val_accuracy: 0.8431 - val_loss: 0.3595
Epoch 6/15
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.8315 - loss: 0.3823 - val_accuracy: 0.8558 - val_loss: 0.3454
Epoch 7/15
[1m

## Save model

In [10]:
# Save model weights
model.save_weights('/content/drive/MyDrive/nlu/attmodel/finalemodelB.weights.h5')
print("Model weights saved")

Model weights saved
