# Import Libraries 

In [1]:
import os
os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"

import keras_nlp
import keras
import tensorflow as tf

import numpy as np 
import pandas as pd
from tqdm import tqdm
import json

import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.express as px

In [2]:
print("TensorFlow:", tf.__version__)
print("Keras:", keras.__version__)
print("KerasNLP:", keras_nlp.__version__)

TensorFlow: 2.17.1
Keras: 3.5.0
KerasNLP: 0.18.1


In [3]:
class CFG:
    seed = 42  # Random seed
    preset = "deberta_v3_extra_small_en" # Name of pretrained models
    sequence_length = 512  # Input sequence length
    epochs = 3 # Training epochs
    batch_size = 16  # Batch size
    scheduler = 'cosine'  # Learning rate scheduler
    label2name = {0: 'winner_model_a', 1: 'winner_model_b', 2: 'winner_tie'}
    name2label = {v:k for k, v in label2name.items()}
    class_labels = list(label2name.keys())
    class_names = list(label2name.values())

In [4]:
keras.utils.set_random_seed(CFG.seed)

Use mixed precision instead of float32 precision for training and inference to reduce training and inference time.

In [5]:
keras.mixed_precision.set_global_policy("mixed_float16")

In [6]:
BASE_PATH = '/kaggle/input/llm-classification-finetuning'

# 📖 | Meta Data 

The competition dataset comprises user interactions from the ChatBot Arena. In each interaction, a judge presents one or more prompts to two different large language models and then indicates which model provided the more satisfactory response. The training data contains `55,000` rows, with an expected `25,000` rows in the test set.

## Files

### `train.csv`
- `id`: Unique identifier for each row.
- `model_[a/b]`: Model identity, present in train.csv but not in test.csv.
- `prompt`: Input prompt given to both models.
- `response_[a/b]`: Model_[a/b]'s response to the prompt.
- `winner_model_[a/b/tie]`: Binary columns indicating the judge's selection (ground truth target).

### `test.csv`
- `id`: Unique identifier for each row.
- `prompt`: Input prompt given to both models.
- `response_[a/b]`: Model_[a/b]'s response to the prompt.

> Note that each interaction may have multiple prompts and responses, but this notebook will use only **one prompt per interaction**. You can choose to use all prompts and responses. Additionally, prompts and responses in the dataframe are provided as string-formatted lists, so they need to be converted to literal lists using `eval()`.


## Train Data

In [7]:
# Load Train Data
df = pd.read_csv(f'{BASE_PATH}/train.csv') 

# Sample data
df = df.sample(frac=0.1)

# Take the first prompt and its associated response
df["prompt"] = df.prompt.map(lambda x: eval(x)[0])
df["response_a"] = df.response_a.map(lambda x: eval(x.replace("null","''"))[0])
df["response_b"] = df.response_b.map(lambda x: eval(x.replace("null", "''"))[0])

# Label conversion
df["class_name"] = df[["winner_model_a", "winner_model_b" , "winner_tie"]].idxmax(axis=1)
df["class_label"] = df.class_name.map(CFG.name2label)

# Show Sample
#df.head()
df.shape

(5748, 11)

## Test Data

In [8]:
# Load Test Data
test_df = pd.read_csv(f'{BASE_PATH}/test.csv')

# Take the first prompt and response
test_df["prompt"] = test_df.prompt.map(lambda x: eval(x)[0])
test_df["response_a"] = test_df.response_a.map(lambda x: eval(x.replace("null","''"))[0])
test_df["response_b"] = test_df.response_b.map(lambda x: eval(x.replace("null", "''"))[0])

# Show Sample
#test_df.head()
test_df.shape

(3, 4)

## Contextualize Response with Prompt

Contextualize each response with the prompt. For each response, we will provide the model with the same set of prompts combined with their respective response (e.g., `(P + R_A)`, `(P + R_B)`, etc.). This approach is similar to the multiple-choice question task in NLP.


In [9]:
# Define a function to create options based on the prompt and choices
def make_pairs(row):
    row["encode_fail"] = False
    try:
        prompt = row.prompt.encode("utf-8").decode("utf-8")
    except:
        prompt = ""
        row["encode_fail"] = True

    try:
        response_a = row.response_a.encode("utf-8").decode("utf-8")
    except:
        response_a = ""
        row["encode_fail"] = True

    try:
        response_b = row.response_b.encode("utf-8").decode("utf-8")
    except:
        response_b = ""
        row["encode_fail"] = True
        
    row['options'] = [f"Prompt: {prompt}\n\nResponse: {response_a}",  # Response from Model A
                      f"Prompt: {prompt}\n\nResponse: {response_b}"  # Response from Model B
                     ]
    return row


In [10]:
df = df.apply(make_pairs, axis=1)  # Apply the make_pairs function to each row in df
display(df.head(2))  # Display the first 2 rows of df
display(df["options"].iloc[0])

test_df = test_df.apply(make_pairs, axis=1)  # Apply the make_pairs function to each row in df
display(test_df.head(2))  # Display the first 2 rows of df

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,class_name,class_label,encode_fail,options
37379,2785062085,gpt-3.5-turbo-0613,codellama-34b-instruct,what does hello world mean,"""Hello, World!"" is a phrase used in computer p...","""Hello, World!"" is a common phrase used to dis...",1,0,0,winner_model_a,0,False,[Prompt: what does hello world mean\n\nRespons...
644,48259531,mistral-medium,gpt-4-0314,I ran a marathon in 3:12:00 weighting 84kg. Ho...,It's difficult to provide an exact answer to t...,To accurately estimate how much faster you wou...,1,0,0,winner_model_a,0,False,[Prompt: I ran a marathon in 3:12:00 weighting...


['Prompt: what does hello world mean\n\nResponse: "Hello, World!" is a phrase used in computer programming to introduce beginners to a new programming language. It is often the first program that a programmer writes when learning a new language. The program typically displays the words "Hello, World!" on the screen or console. It serves as a basic starting point to verify that the programming environment is set up correctly and to demonstrate the basic syntax of the language.',
 'Prompt: what does hello world mean\n\nResponse: "Hello, World!" is a common phrase used to display a message on a computer screen. It is often used as a simple test to ensure that a computer program or system is working correctly. When a computer program is executed, it will typically print "Hello, World!" to the screen, indicating that the program has been executed successfully.']

Unnamed: 0,id,prompt,response_a,response_b,encode_fail,options
0,136060,"I have three oranges today, I ate an orange ye...",You have two oranges today.,You still have three oranges. Eating an orange...,False,"[Prompt: I have three oranges today, I ate an ..."
1,211333,You are a mediator in a heated political debat...,Thank you for sharing the details of the situa...,Mr Reddy and Ms Blue both have valid points in...,False,[Prompt: You are a mediator in a heated politi...


# Preprocessing - Tokenization

In [12]:
preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
    preset=CFG.preset, # Name of the model
    sequence_length=CFG.sequence_length, # Max sequence length, will be padded if shorter
)

Now, let's examine what the output shape of the preprocessing layer looks like. The output shape of the layer can be represented as $(num\_responses, sequence\_length)$.

In [13]:
outs = preprocessor(df.options.iloc[0])  # Process options for the first row

# Display the shape of each processed output
for k, v in outs.items():
    print(k, ":", v.shape)

token_ids : (2, 512)
padding_mask : (2, 512)


We'll use the `preprocessing_fn` function to transform each text option using the `dataset.map(preprocessing_fn)` method.

In [14]:
def preprocess_fn(text, label=None):
    text = preprocessor(text)  # Preprocess text
    return (text, label) if label is not None else text  # Return processed text and label if available

# DataLoader

In [15]:
def build_dataset(texts, labels=None, batch_size=32,
                  cache=True, shuffle=1024):
    AUTO = tf.data.AUTOTUNE  # AUTOTUNE option
    slices = (texts,) if labels is None else (texts, keras.utils.to_categorical(labels, num_classes=3))  # Create slices
    ds = tf.data.Dataset.from_tensor_slices(slices)  # Create dataset from slices
    ds = ds.cache() if cache else ds  # Cache dataset if enabled
    ds = ds.map(preprocess_fn, num_parallel_calls=AUTO)  # Map preprocessing function
    opt = tf.data.Options()  # Create dataset options
    if shuffle: 
        ds = ds.shuffle(shuffle, seed=CFG.seed)  # Shuffle dataset if enabled
        opt.experimental_deterministic = False
    ds = ds.with_options(opt)  # Set dataset options
    ds = ds.batch(batch_size, drop_remainder=False)  # Batch dataset
    ds = ds.prefetch(AUTO)  # Prefetch next batch
    return ds  # Return the built dataset

# Model Checkpointing

Create a callback that will save the best checkpoint of the model during training.

In [18]:
ckpt_cb = keras.callbacks.ModelCheckpoint(f'best_model.weights.h5',
                                          monitor='val_log_loss',
                                          save_best_only=True,
                                          save_weights_only=True,
                                          mode='min')  # Get Model checkpoint callback

# Metric


In [19]:
log_loss = keras.metrics.CategoricalCrossentropy(name="log_loss")

# Modeling

In [20]:
# Define input layers
inputs = {
    "token_ids": keras.Input(shape=(2, None), dtype=tf.int32, name="token_ids"),
    "padding_mask": keras.Input(shape=(2, None), dtype=tf.int32, name="padding_mask"),
}
# Create a DebertaV3Classifier backbone
backbone = keras_nlp.models.DebertaV3Backbone.from_preset(
    CFG.preset,
)

# Compute embeddings for first response: (P + R_A) using backbone
response_a = {k: v[:, 0, :] for k, v in inputs.items()}
embed_a = backbone(response_a)

# Compute embeddings for second response: (P + R_B), using the same backbone
response_b = {k: v[:, 1, :] for k, v in inputs.items()}
embed_b = backbone(response_b)

# Compute final output
embeds = keras.layers.Concatenate(axis=-1)([embed_a, embed_b])
embeds = keras.layers.GlobalAveragePooling1D()(embeds)
outputs = keras.layers.Dense(3, activation="softmax", name="classifier")(embeds)
model = keras.Model(inputs, outputs)

# Compile the model with optimizer, loss, and metrics
model.compile(
    optimizer=keras.optimizers.Adam(9e-6),
    loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.02),
    metrics=[
        log_loss,
        keras.metrics.CategoricalAccuracy(name="accuracy"),
    ],
)

### Model Summary

In [21]:
model.summary()

# Training

In [None]:
# 2-fold cross validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=2, shuffle=True, random_state=CFG.seed)

for train, valid in kf.split(df):
    # Train
    train_df = df.iloc[train]
    train_texts = train_df.options.tolist()  # Extract training texts
    train_labels = train_df.class_label.tolist()  # Extract training labels
    train_ds = build_dataset(train_texts, train_labels,
                             batch_size=CFG.batch_size,
                             shuffle=True)
    
    # Valid
    valid_df = df.iloc[valid]
    valid_texts = valid_df.options.tolist()  # Extract validation texts
    valid_labels = valid_df.class_label.tolist()  # Extract validation labels
    valid_ds = build_dataset(valid_texts, valid_labels,
                             batch_size=CFG.batch_size,
                             shuffle=False)    

    # Start training the model
    history = model.fit(
        train_ds,
        epochs=CFG.epochs,
        validation_data=valid_ds,
        callbacks=ckpt_cb
    )

Epoch 1/3
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 2s/step - accuracy: 0.3716 - log_loss: 1.1814 - loss: 1.1831 - val_accuracy: 0.4315 - val_log_loss: 1.0686 - val_loss: 1.0706
Epoch 2/3
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 1s/step - accuracy: 0.4403 - log_loss: 1.0595 - loss: 1.0617 - val_accuracy: 0.4489 - val_log_loss: 1.0592 - val_loss: 1.0614
Epoch 3/3
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 1s/step - accuracy: 0.4699 - log_loss: 1.0325 - loss: 1.0353 - val_accuracy: 0.4499 - val_log_loss: 1.0590 - val_loss: 1.0614
Epoch 1/3
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 1s/step - accuracy: 0.4597 - log_loss: 1.0595 - loss: 1.0619 - val_accuracy: 0.5188 - val_log_loss: 0.9900 - val_loss: 0.9939
Epoch 2/3
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 1s/step - accuracy: 0.4939 - log_loss: 1.0237 - loss: 1.0270 - val_accuracy: 0.5181 - val_log_loss: 0.9926

## Load Best Model

After training, load the weight with best result to get the best performance.

In [24]:
model.load_weights('/kaggle/working/best_model.weights.h5')

# Prediction

In [25]:
# Build test dataset
test_texts = test_df.options.tolist()
test_ds = build_dataset(test_texts,
                         batch_size=min(len(test_df), CFG.batch_size),
                         shuffle=False)

In [26]:
# Make predictions using the trained model on test data
test_preds = model.predict(test_ds, verbose=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step


# Submission

Following code will prepare the submission file.

In [27]:
sub_df = test_df[["id"]].copy()
sub_df[CFG.class_names] = test_preds.tolist()
sub_df.to_csv("submission.csv", index=False)
sub_df.head()

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.230835,0.268066,0.500977
1,211333,0.495605,0.197388,0.306641
2,1233961,0.238281,0.405029,0.356445
