#### **Step 1** - Importing Required modules to train our model

In [7]:
import random
from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
model = SentenceTransformer('intfloat/e5-base-v2')
print("model imported successfully")

model imported successfully


#### **Step 2** Loading The dataset

In [2]:
import json
# Load the dataset
with open("dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)


for entry in data[:2]:  
    print(f"Name: {entry['name']}")
    print(f"Category: {entry['category']}")
    print(f"Help: {entry['help'][:200]}...") 
    print("-" * 50)

Name: Unzip collection
Category: Collection Operations
Help: Synopsis
This tool takes a paired collection and "unzips" it into two simple dataset collections (lists of datasets).

Description
1. **Functionality**
   - Given a paired collection of forward and re...
--------------------------------------------------
Name: Zip collections
Category: Collection Operations
Help: Synopsis
This tool takes two collections and creates a paired collection from them.

Description
1. **Functionality**
   - If you have one collection containing only forward reads and another containi...
--------------------------------------------------


#### **Step 3** Extract The data

In [4]:
# Extract structured text data
structured_data = [
    {"text": f"{entry['name']} - {entry['description']} - {entry['help']}", "category": entry["category"]}
    for entry in data
]

# Creating training examples with category-based pairing
train_examples = []
num_entries = len(structured_data)
print(f"Number of entries: {num_entries}")

Number of entries: 85


#### **Step 4** Creating Training Examples

In [8]:
# Create Positive Pairs (Same Category)
for i in range(num_entries):
    for j in range(i + 1, num_entries):
        if structured_data[i]["category"] == structured_data[j]["category"]:
            train_examples.append(InputExample(texts=[structured_data[i]["text"], structured_data[j]["text"]], label=1.0))

# Create Negative Pairs (Different Categories)
negative_pairs = []
for i in range(num_entries):
    for j in range(i + 1, num_entries):
        if structured_data[i]["category"] != structured_data[j]["category"]:
            negative_pairs.append((structured_data[i]["text"], structured_data[j]["text"]))

# Sample Negative Pairs to balance dataset
negative_pairs_sampled = random.sample(negative_pairs, len(train_examples))  # Match positive pair count

for pair in negative_pairs_sampled:
    train_examples.append(InputExample(texts=[pair[0], pair[1]], label=0.0))

# Print sample pairs for verification
print("Sample Training Pairs:")
for ex in train_examples[:5]:
    print(ex.texts, "Label:", ex.label)


Sample Training Pairs:
['Unzip collection -  - Synopsis\nThis tool takes a paired collection and "unzips" it into two simple dataset collections (lists of datasets).\n\nDescription\n1. **Functionality**\n   - Given a paired collection of forward and reverse reads, this tool separates them into two distinct collections.\n   - The first output collection contains all forward reads, and the second output collection contains all reverse reads.\n\n2. **Use Case**\n   - Useful for processing paired-end sequencing data.\n   - Enables downstream analysis by handling forward and reverse reads separately.\n\nThis tool simplifies paired dataset management, allowing for more flexible analysis workflows in Galaxy.', 'Zip collections -  - Synopsis\nThis tool takes two collections and creates a paired collection from them.\n\nDescription\n1. **Functionality**\n   - If you have one collection containing only forward reads and another containing only reverse reads, this tool will combine them into a pa

#### **Step 5** Define Data loader and Define loss function

In [9]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
import datasets

# Define a DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

# Define the loss function for contrastive learning
train_loss = losses.CosineSimilarityLoss(model=model)

#### **Step 6** Fine Tune SBERT Model With Dataset

In [10]:
# Fine-tune the E5 model
#model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=20, warmup_steps=100)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
# Save the fine-tuned model
version = 3
model.save(f"fine_tuned_E5_for_galaxy_v{version}")

                                                                     

Step,Training Loss


In [11]:
from sentence_transformers import SentenceTransformer
fine_tuned_model = SentenceTransformer(f"fine_tuned_E5_for_galaxy_v{version}")
sentences = ["This tool helps in dataset filtering.", "Merging collections is useful."]
embeddings = fine_tuned_model.encode(sentences)
print(embeddings)

[[-0.02281008 -0.03698322 -0.0344712  ... -0.00136929  0.00334988
   0.02803659]
 [ 0.01571156 -0.04223171 -0.0090887  ... -0.04540608 -0.00240785
   0.01285199]]
