#### **Step 1** - Importing Required modules to train our model

In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-base-v2')
print("model imported successfully")

  from .autonotebook import tqdm as notebook_tqdm


model imported successfully


#### **Step 2** Loading The dataset

In [2]:
import json
# Load the dataset
with open("dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)


for entry in data[:2]:  
    print(f"Name: {entry['name']}")
    print(f"Category: {entry['category']}")
    print(f"Help: {entry['help'][:200]}...") 
    print("-" * 50)

Name: Unzip collection
Category: Collection Operations
Help: Synopsis
This tool takes a paired collection and "unzips" it into two simple dataset collections (lists of datasets).

Description
1. **Functionality**
   - Given a paired collection of forward and re...
--------------------------------------------------
Name: Zip collections
Category: Collection Operations
Help: Synopsis
This tool takes two collections and creates a paired collection from them.

Description
1. **Functionality**
   - If you have one collection containing only forward reads and another containi...
--------------------------------------------------


#### **Step 3** Extract The data

In [3]:
# Extract text data
sentences = [f"{entry['name']} - {entry['description']} - {entry['help']}" for entry in data]

# Print sample
print(sentences[:3])  # Check first 3 processed sentences

['Unzip collection -  - Synopsis\nThis tool takes a paired collection and "unzips" it into two simple dataset collections (lists of datasets).\n\nDescription\n1. **Functionality**\n   - Given a paired collection of forward and reverse reads, this tool separates them into two distinct collections.\n   - The first output collection contains all forward reads, and the second output collection contains all reverse reads.\n\n2. **Use Case**\n   - Useful for processing paired-end sequencing data.\n   - Enables downstream analysis by handling forward and reverse reads separately.\n\nThis tool simplifies paired dataset management, allowing for more flexible analysis workflows in Galaxy.', 'Zip collections -  - Synopsis\nThis tool takes two collections and creates a paired collection from them.\n\nDescription\n1. **Functionality**\n   - If you have one collection containing only forward reads and another containing only reverse reads, this tool will combine them into a paired collection.\n   - 

#### **Step 4** Creating Training Examples

In [4]:
from sentence_transformers import InputExample

# Creating training examples (assuming similarity-based fine-tuning)
train_examples = []
for i in range(len(sentences) - 1):
    train_examples.append(InputExample(texts=[sentences[i], sentences[i+1]], label=1.0))  # assuming similarity for now

#### **Step 5** Define Data loader and Define loss function

In [5]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
import datasets

# Define a DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

# Define the loss function for contrastive learning
train_loss = losses.CosineSimilarityLoss(model=model)

#### **Step 6** Fine Tune SBERT Model With Dataset

In [None]:
# Fine-tune the E5 model
#model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=20, warmup_steps=100)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
# Save the fine-tuned model
version = 1
model.save(f"fine_tuned_E5_for_galaxy_v{version}")

                                                                     

Step,Training Loss


In [7]:
from sentence_transformers import SentenceTransformer
fine_tuned_model = SentenceTransformer(f"fine_tuned_E5_for_galaxy_v{version}")
sentences = ["This tool helps in dataset filtering.", "Merging collections is useful."]
embeddings = fine_tuned_model.encode(sentences)
print(embeddings)

[[-0.01452434 -0.02709516 -0.03902506 ... -0.01207174  0.01802179
   0.03497765]
 [ 0.00367054 -0.04538389 -0.02575246 ... -0.04581482  0.02639549
   0.01231393]]
