<a href="https://colab.research.google.com/github/krishnathawan/SATYA/blob/main/Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install faiss-gpu
!pip install sentence-transformers
!pip install pandas
!pip install numpy
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install sentence-transformers
!pip install datasets

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-ftl7b841
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-ftl7b841
  Resolved https://github.com/huggingface/transformers.git to commit 15bd3e61f8d3680ca472c9314ad07584d20f7b81
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Found existing installation: torch 2.5.1+cu118
Uninstalling torch-2.5.1+cu118:
  Successfully uninstalled torch-2.5.1+cu118
Found existing installation: torchvision 0.20.1+cu118
Uninstalling torchvision-0.20.1+cu118:
  Successfully uninstalled torchvision-0.20.1+cu118
Found existing installation: torchaudio 2.5.1+cu118
Uninstalling torchaudio-2.5.1+cu118:
  Successfully uninstalled torchaudio-2.5.1+cu118
Looking in indexes: https://



In [1]:
import torch
from torch import nn, optim
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers import InputExample
import pandas as pd
import numpy as np
import faiss
from torch.utils.data import DataLoader
from datasets import Dataset as HFDataset
from scipy.spatial.distance import cosine
import shutil

In [4]:
# --- Verse corpora -----------------------------------------------------------
# Bhagavad Gita verses (CSV expected under /content).
bhagwat_gita_df = pd.read_csv('/content/Bhagwad_Gita_Verses_English.csv')

# Patanjali Yoga Sutra verses (CSV expected under /content).
yoga_sutras_df = pd.read_csv('/content/Patanjali_Yoga_Sutras_Verses_English.csv')

print(f"Bhagavad Gita DataFrame shape: {bhagwat_gita_df.shape}")
print(f"Patanjali Yoga Sutras DataFrame shape: {yoga_sutras_df.shape}")

# Column names 'Sanskrit ' and 'Translation ' carry a trailing space; they are
# used here exactly as written.
sanskrit_shlokas = [*bhagwat_gita_df['Sanskrit '], *yoga_sutras_df['Sanskrit ']]
bg_translations = bhagwat_gita_df['Swami Adidevananda'].to_list()
pys_translations = yoga_sutras_df['Translation '].to_list()

print(f"Number of Sanskrit Shlokas: {len(sanskrit_shlokas)}")
print(f"Number of Bhagavad Gita Translations: {len(bg_translations)}")
print(f"Number of Patanjali Yoga Sutras Translations: {len(pys_translations)}")

# English translations, Gita first and Yoga Sutras second.
texts = [*bg_translations, *pys_translations]

# --- Question sets -----------------------------------------------------------
# Question CSVs, concatenated in the same Gita-then-Yoga-Sutras order as `texts`.
gita_que_df = pd.read_csv('/content/Bhagwad_Gita_Verses_English_Questions.csv')
pys_que_df = pd.read_csv('/content/Patanjali_Yoga_Sutras_Verses_English_Questions.csv')

que_df = pd.concat((gita_que_df, pys_que_df), ignore_index=True)
queries = que_df['question'].to_list()

# --- (query, text) pairs -----------------------------------------------------
# Pairing is positional: queries[i] goes with texts[i]. The pairs are also
# written to disk as a two-column CSV.
data = dict(query=queries, text=texts)
df = pd.DataFrame(data)
df.to_csv('fine_tune_dataset.csv', index=False)

# One InputExample per (query, text) pair, served in shuffled batches of 16.
train_examples = [InputExample(texts=[q, t]) for q, t in zip(queries, texts)]
train_dataloader = DataLoader(dataset=train_examples, batch_size=16, shuffle=True)

Bhagavad Gita DataFrame shape: (700, 9)
Patanjali Yoga Sutras DataFrame shape: (195, 5)
Number of Sanskrit Shlokas: 895
Number of Bhagavad Gita Translations: 700
Number of Patanjali Yoga Sutras Translations: 195


In [None]:
# Hugging Face Dataset holding the same (query, text) pairs. It is not passed to
# the fit() call below, which trains from `train_dataloader`.
train_data = HFDataset.from_dict({
    'query': queries,
    'text': texts
})

# Base embedding model to fine-tune, with a ranking loss bound to it.
model = SentenceTransformer('nomic-ai/modernbert-embed-base')
loss = losses.MultipleNegativesRankingLoss(model)

num_epochs = 3
# Warm up over 10% of the total number of training steps
# (batches per epoch * number of epochs).
warmup_steps = int(0.1 * len(train_dataloader) * num_epochs)
model.fit(
    train_objectives=[(train_dataloader, loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True
)

# Save the fine-tuned model, then zip that same directory.
# make_archive(base_name, format, root_dir) appends '.zip' to base_name and
# archives the contents of root_dir, so root_dir must be the directory that
# model.save() just wrote. Paths are relative to the working directory so the
# cell does not depend on a particular drive layout.
model_dir = 'new_fine_tuned_'
model.save(model_dir)
shutil.make_archive('new_model_saved_', 'zip', model_dir)
