In [1]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

# Choose the compute device: use the GPU when CUDA is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [2]:
# Load the cleaned training pairs (user utterance, intent, similarity score).
df = pd.read_csv(filepath_or_buffer='./cleaned_train_data.csv')

In [3]:
# Preview the first five rows to sanity-check the columns.
df.head(n=5)

Unnamed: 0,user_input,user_intent,score
0,How do I link my existing health policy?,Add Health Policy,1.0
1,I want to purchase a new health plan,Buy Health Insurance,1.0
2,Add my corporate health plan to my account,Add Corporate Health Policy,1.0
3,Can I buy Elevate coverage online?,Buy Elevate,1.0
4,How to register my retail health policy?,Add Retail Health Policy,1.0


In [4]:
# Coerce scores to numbers; errors="coerce" turns anything unparseable into NaN.
df["score"] = pd.to_numeric(df["score"], errors="coerce")

# A row with a missing text or a NaN score cannot be trained on: a NaN label
# would propagate into the loss, and a missing text is a float NaN rather
# than a string. Drop such rows up front and report how many were removed
# instead of letting them slip silently into the training set.
required_columns = ["user_input", "user_intent", "score"]
train_df = df.dropna(subset=required_columns)
dropped_rows = len(df) - len(train_df)
if dropped_rows:
    print(f"Dropped {dropped_rows} rows with a missing text or non-numeric score")

# Prepare training examples: one (user_input, user_intent) pair per row,
# labelled with its similarity score. Iterating the columns with zip avoids
# building a Series per row the way iterrows() does.
train_examples = [
    InputExample(texts=[user_input, user_intent], label=float(score))
    for user_input, user_intent, score in zip(
        train_df["user_input"], train_df["user_intent"], train_df["score"]
    )
]

In [5]:
# Start from the pre-trained checkpoint named below; downloaded files are
# cached in ./cache.
model_name = "sentence-transformers/msmarco-distilbert-base-tas-b"
model = SentenceTransformer(model_name_or_path=model_name, cache_folder="./cache")

# Move the model to the selected device. Kept as the cell's last expression
# so the module summary is displayed.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [6]:
# Batch the training examples: 16 pairs per batch, reshuffled on every pass.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=16,
    shuffle=True,
)

In [7]:
# Training objective: CosineSimilarityLoss wrapped around the model being
# fine-tuned (commonly used for similarity tasks with a numeric target score).
train_loss = losses.CosineSimilarityLoss(model=model)

In [8]:
# Place the loss module (and the model it wraps) on the selected device.
train_loss.to(device=device)

CosineSimilarityLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  )
  (loss_fct): MSELoss()
  (cos_score_transformation): Identity()
)

In [9]:
from sentence_transformers import InputExample, evaluation

# Hand-written validation set, kept as (user utterance, intent) pairs.
# Positive examples (correct pairs) are scored 1.0.
matching_pairs = [
    ("I want to add a policy", "Add Policy"),
    ("I need to buy car insurance", "Buy Car Insurance"),
    ("Show me my travel policies", "View Travel Policies"),
    ("I want to read some blogs", "Blogs"),
    ("I need to see my profile", "Profile"),
]

# Negative examples (mismatched pairs) are scored 0.0.
mismatched_pairs = [
    ("I want to add a policy", "Buy Car Insurance"),
    ("I need health insurance", "Buy Car Insurance"),
    ("I want to read blogs", "Add Policy"),
    ("Check my internet speed", "Face Scan"),
    ("How do I track my health?", "Driving Score"),
]

# Positives first, then negatives, each in the order listed above.
val_examples = [
    InputExample(texts=[utterance, intent], label=target)
    for pairs, target in ((matching_pairs, 1.0), (mismatched_pairs, 0.0))
    for utterance, intent in pairs
]

# Evaluator to evaluate the model during training
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples,
    name="val-eval",
)

print("Validation examples created:", len(val_examples))

Validation examples created: 10


In [10]:
import numpy as np

def contains_nan(example):
    """Return True when at least one entry of ``example.texts`` is a float NaN."""
    for text in example.texts:
        # Only floats are tested with np.isnan; strings and other types are
        # filtered out by the isinstance check first.
        if isinstance(text, float) and np.isnan(text):
            return True
    return False

# Scan the validation examples and report every one that holds a NaN text.
for idx, example in enumerate(val_examples):
    if not contains_nan(example):
        continue
    print(f"NaN detected in validation example at index {idx}: {example}")

In [11]:
# Scan the training examples and report every one that holds a NaN text.
# The message names the training set so a hit here is not mistaken for a
# problem in the validation examples.
for idx, example in enumerate(train_examples):
    if contains_nan(example):
        print(f"NaN detected in training example at index {idx}: {example}")

In [12]:
# Fine-tuning configuration: where the model is written and how many
# passes over the training data to run.
model_dir_parts = ("..", "models", "iltc_finetuned_model")
output_path = os.path.join(*model_dir_parts)
num_epochs = 3

In [13]:
# Fine-tune with evaluation: the evaluator scores the validation pairs and
# the best-scoring model is written to output_path.
# NOTE(review): the log below shows evaluations at steps 86/172/258 only,
# so evaluation_steps=1000 is presumably never reached mid-epoch - confirm
# whether a smaller value was intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=100,
    output_path=output_path,
    save_best_model=True,
)

                                                                     

Step,Training Loss,Validation Loss,Val-eval Pearson Cosine,Val-eval Spearman Cosine
86,No log,No log,0.967285,0.870388
172,No log,No log,0.955181,0.870388
258,No log,No log,0.934118,0.870388


In [14]:
# NOTE(review): this is a second fit() call on the same `model` object that
# the previous cell already trained, so it continues from those weights for
# another `num_epochs` epochs rather than starting from the pre-trained
# checkpoint. No evaluator is passed here, and `output_path` is the same
# directory the previous call used with save_best_model=True - presumably
# the best checkpoint saved there gets replaced. Confirm whether this cell
# is a leftover experiment or an intentional extra training run.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=100,
    output_path=output_path,
)

Step,Training Loss


In [15]:
# Write the in-memory model (its weights after the last fit() call) to
# output_path and report the location.
# NOTE(review): output_path is the directory an earlier fit() call used with
# save_best_model=True; saving the final weights here presumably overwrites
# that best checkpoint - confirm this is the model you want to keep.
model.save(output_path)
print("Model fine-tuned and saved at:", output_path)

Model fine-tuned and saved at: ..\models\iltc_finetuned_model
