In [1]:
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Paths for models and index
# All three are relative paths, so they resolve against the kernel's working directory.
FAISS_INDEX_FILE = "faiss_index.bin"
MAHABHARATA_DATA = "mahabharata_enriched_dataset.json"
LLAMA_MODEL_DIR = "fine_tuned_guru_llama"

# Initialize embedder for query embedding
# NOTE(review): presumably the same model that produced the vectors stored in
# the FAISS index (index construction is not shown here) — confirm.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Load FAISS index
index = faiss.read_index(FAISS_INDEX_FILE)

# Load Mahabharata passages
import json
with open(MAHABHARATA_DATA, "r", encoding="utf-8") as f:
    mahabharata_data = json.load(f)
# One passage per record: "instruction" and "response" joined by a single space
# (a missing key contributes an empty string). retrieve_context indexes this
# list with the row ids FAISS returns, so its order is assumed to match the
# order in which the index was built — TODO confirm.
passages = [record.get("instruction", "") + " " + record.get("response", "") for record in mahabharata_data]

# Load fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(LLAMA_MODEL_DIR)
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
model.to("cuda" if torch.cuda.is_available() else "cpu")

def retrieve_context(query, k=3):
    """Retrieve up to ``k`` relevant passages for ``query`` using FAISS.

    The query is embedded with the module-level ``embedder``, L2-normalised
    in place, and searched against the module-level ``index``.

    Args:
        query: The user's question as a string.
        k: Maximum number of passages to return.

    Returns:
        A list of passage strings in the order FAISS ranked them. The list
        may be shorter than ``k`` when the index cannot supply ``k`` hits.

    Raises:
        IndexError: If FAISS returns a non-negative row id that is outside
            ``passages`` (the index and the passage list are out of sync).
    """
    query_emb = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_emb)
    _distances, indices = index.search(query_emb, k)

    retrieved = []
    for raw_id in indices[0]:
        row = int(raw_id)
        if row < 0:
            # FAISS pads missing results with -1; indexing passages[-1] would
            # silently return the *last* passage as if it were a real hit.
            continue
        retrieved.append(passages[row])
    return retrieved

def generate_response(query):
    """Generate a response in a saintly style using retrieved context.

    Builds a prompt from the passages returned by ``retrieve_context``,
    samples a continuation from the module-level ``model`` and returns only
    the newly generated text.

    Args:
        query: The user's question as a string.

    Returns:
        The generated answer with surrounding whitespace stripped. The prompt
        itself is never included.
    """
    max_prompt_tokens = 1024
    context = list(retrieve_context(query))

    def build_prompt(selected):
        # Same layout as before: header, numbered teachings, then the query.
        text = "Based on the following Mahabharata teachings:\n"
        for i, passage in enumerate(selected):
            text += f"[Teachings {i+1}]: {passage}\n"
        return text + f"\nUser Query: {query}\nSaintly Response:"

    prompt = build_prompt(context)
    # Truncation removes tokens from the end of the prompt, which is where the
    # user query and the "Saintly Response:" cue live. Drop the lowest-ranked
    # passages first so that tail survives whenever possible.
    while context and len(tokenizer(prompt)["input_ids"]) > max_prompt_tokens:
        context.pop()
        prompt = build_prompt(context)

    # Truncation stays on as a last-resort guard (e.g. an extremely long query).
    input_tokens = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=max_prompt_tokens
    ).to(model.device)
    prompt_length = input_tokens["input_ids"].shape[1]

    outputs = model.generate(
        **input_tokens,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens; slicing them off is
    # more reliable than searching the decoded text for "Saintly Response:",
    # which fails if that marker was truncated or appears in the query.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return response.strip()

if __name__ == "__main__":
    # Interactive loop: read a query, answer it, repeat until the user types
    # "exit"/"quit" or input ends.
    while True:
        try:
            query = input("Enter your query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the cell was interrupted: stop cleanly.
            break
        # Compare on the stripped text so "exit " or " Quit" also end the loop.
        if query.lower() in ("exit", "quit"):
            break
        if not query:
            # Nothing to answer; skip the expensive generation call.
            continue
        response = generate_response(query)
        print("\nChatbot Response:")
        print(response)


: 

In [1]:
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Paths for models and index
# Relative paths: resolved against the kernel's working directory.
FAISS_INDEX_FILE = "faiss_index.bin"
MAHABHARATA_DATA = "mahabharata_enriched_dataset.json"
LLAMA_MODEL_DIR = "fine_tuned_guru_llama"

# Initialize embedder for query embedding
# NOTE(review): assumed to be the model used when the index vectors were
# created (not shown in this notebook) — confirm.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Load FAISS index
index = faiss.read_index(FAISS_INDEX_FILE)
# Load Mahabharata passages
import json
with open(MAHABHARATA_DATA, "r", encoding="utf-8") as f:
    mahabharata_data = json.load(f)
# Each passage is the record's "instruction" and "response" joined by a space;
# retrieve_context looks passages up by the row ids FAISS returns, so list
# order is assumed to mirror index order — TODO confirm.
passages = [record.get("instruction", "") + " " + record.get("response", "") for record in mahabharata_data]

# Load fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(LLAMA_MODEL_DIR)
# Use CUDA when available; otherwise the model stays on the CPU.
model.to("cuda" if torch.cuda.is_available() else "cpu")

def retrieve_context(query, k=3):
    """Retrieve up to ``k`` relevant passages for ``query`` using FAISS.

    Args:
        query: The user's question as a string.
        k: Maximum number of passages to return.

    Returns:
        Passage strings in FAISS ranking order; fewer than ``k`` when the
        index cannot supply ``k`` hits.

    Raises:
        IndexError: If FAISS returns a non-negative row id that does not
            exist in ``passages``.
    """
    query_emb = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_emb)  # normalises the query vector in place
    _distances, indices = index.search(query_emb, k)

    retrieved = []
    for raw_id in indices[0]:
        row = int(raw_id)
        if row < 0:
            # -1 is FAISS's "no result" filler; passages[-1] would wrongly
            # return the final passage instead of nothing.
            continue
        retrieved.append(passages[row])
    return retrieved

def generate_response(query):
    """Answer ``query`` in the saintly style, grounded on retrieved passages.

    The passages returned by ``retrieve_context`` are numbered and placed in
    front of the query. The returned string is the full decoded sequence,
    i.e. the prompt followed by the model's continuation.
    """
    retrieved_passages = retrieve_context(query)

    # Assemble the prompt: header, one line per passage, then the query cue.
    context_lines = [
        f"[Context {number}]: {text}\n"
        for number, text in enumerate(retrieved_passages, start=1)
    ]
    prompt = (
        "Based on the following context passages from the Mahabharata:\n"
        + "".join(context_lines)
        + f"\nUser Query: {query}\nSaintly Response:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    sampling_options = dict(
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated = model.generate(**inputs, **sampling_options)

    # The prompt is intentionally left in the decoded text.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

if __name__ == "__main__":
    # Interactive loop: keep answering queries until the user types
    # "exit"/"quit" or input ends.
    while True:
        try:
            query = input("Enter your query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell ends the session quietly.
            break
        # Stripped comparison so trailing whitespace does not defeat "exit".
        if query.lower() in ("exit", "quit"):
            break
        if not query:
            # Blank line: ask again instead of generating for an empty query.
            continue
        response = generate_response(query)
        print("Chatbot response:")
        print(response)

: 

In [5]:
from huggingface_hub import notebook_login
# Renders the interactive Hugging Face Hub login widget in the notebook.
# NOTE(review): presumably required to download the meta-llama checkpoint used
# by the training cell — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import torch
import jsonlines
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTTrainer
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import BitsAndBytesConfig

# Function to load JSONL dataset
def load_jsonl_dataset(file_path):
    """
    Read prompt/completion records from a JSONL file into a Hugging Face Dataset.

    Each record becomes one row with a single "text" field holding the
    prompt and the completion separated by one space.

    Args:
        file_path (str): Path to the JSONL file

    Returns:
        Dataset: Hugging Face Dataset with a "text" column

    Raises:
        KeyError: If a record lacks "prompt" or "completion"
    """
    with jsonlines.open(file_path) as reader:
        rows = [
            {"text": f"{record['prompt']} {record['completion']}"}
            for record in reader
        ]
    return Dataset.from_list(rows)

# Load dataset
# The file is read into a Dataset with one "text" column (prompt + completion).
jsonl_file_path = "processed_pairs.jsonl"  # Update with your file path
dataset = load_jsonl_dataset(jsonl_file_path)

# Quantization configuration for efficient memory usage
# 4-bit NF4 weights with double quantization; compute runs in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-3B"
)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): with pad == EOS, a collator that masks pad positions may also
# mask genuine EOS tokens in the labels — confirm against the collator in use.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'  # Ensure padding is on the right side

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of rows for ``Dataset.map(..., batched=True)``.

    Args:
        examples: A batch mapping with a "text" entry holding a list of strings.

    Returns:
        The tokenizer's output for the batch, truncated to 2048 tokens per
        example and left unpadded.
    """
    # No padding and no return_tensors here: padding inside map() pads every
    # example to the longest one in the map batch and stores those pad (=EOS)
    # tokens in the dataset. Leaving rows at their natural length lets the
    # trainer's collator pad each training batch dynamically instead.
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=2048,
    )

# Tokenize the dataset
# batched=True passes tokenize_function a batch of rows at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Load Model with Quantization
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    quantization_config=quantization_config,
    device_map="auto"
)

# Prepare model for k-bit training
# Must run before the LoRA adapters are attached below.
model = prepare_model_for_kbit_training(model)

# LoRA Configuration
# Adapters are attached to the attention projection layers only.
lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
# From here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

# Training Arguments
# Per-device batch of 1 with 4 accumulation steps, i.e. 4 examples per
# optimizer step on each device.
# NOTE(review): 45 epochs is a lot for a small dataset — confirm this is
# intentional and not overfitting.
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=45,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    optim="adamw_torch",
)

# Trainer Setup
# NOTE(review): the tokenizer configured above is not passed to the trainer;
# presumably SFTTrainer resolves one itself — confirm it uses the same pad
# settings.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
)

# Start Fine-Tuning
trainer.train()

# Save the Fine-Tuned Model
# NOTE(review): `model` is the PEFT wrapper here, so this presumably writes the
# adapter weights rather than a merged full model — confirm that whatever loads
# "fine_tuned_guru_llama" for inference handles that.
model.save_pretrained("fine_tuned_guru_llama")
tokenizer.save_pretrained("fine_tuned_guru_llama")

# Print dataset info
print(f"Dataset size: {len(dataset)} samples")

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Truncating train dataset:   0%|          | 0/299 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,2.9968
2,2.2918
3,2.8276
4,2.8584
5,2.1935
6,2.305
7,2.232
8,2.6355
9,2.4852
10,2.7079


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Dataset size: 299 samples


In [2]:
import accelerate
print(accelerate.__version__)

0.34.0


In [1]:
import re
import jsonlines
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

In [2]:
# Pick the compute device once; later cells read the module-level `device`.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device: " + device)

Using device: cuda


In [3]:
# Load the facebook/bart-large tokenizer and seq2seq model, and move the model
# to the selected device. generate_prompt reads these module-level
# `tokenizer` and `model` names.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)

In [4]:
def clean_text(text):
    """Strip ``[Music]`` markers and normalise whitespace.

    Every literal ``[Music]`` tag (exact case) is deleted, every run of
    whitespace becomes a single space, and leading/trailing whitespace is
    removed.
    """
    without_tags = text.replace('[Music]', '')
    # split() with no argument breaks on whitespace runs and discards empty
    # edges, so re-joining with one space collapses and trims in a single step.
    return ' '.join(without_tags.split())

# Function to generate a prompt using BART
def generate_prompt(text, max_length=50):
    """Produce a prompt string for a monologue with the loaded BART model.

    Args:
        text: The monologue text; it is prefixed with "generate prompt: " and
            truncated to 512 tokens before generation.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    source_text = "generate prompt: " + text
    encoded = tokenizer.encode(
        source_text, return_tensors="pt", max_length=512, truncation=True
    )
    encoded = encoded.to(device)
    # Beam search with 4 beams, stopping early once the beams are finished.
    candidates = model.generate(
        encoded, max_length=max_length, num_beams=4, early_stopping=True
    )
    return tokenizer.decode(candidates[0], skip_special_tokens=True)

# Function to extract prompt-completion pairs from a transcript
def extract_pairs_from_transcript(transcript):
    """Extract prompt/completion pairs from one transcript.

    A transcript with explicit turns ("Questioner:" followed by "Sadhguru:"
    or "Saint:") yields one pair per question/answer turn. Anything else is
    treated as a monologue: the cleaned transcript becomes the completion and
    a prompt is generated for it with ``generate_prompt``.

    Args:
        transcript: Raw transcript text.

    Returns:
        A non-empty list of ``{"prompt": ..., "completion": ...}`` dicts.
    """
    pairs = []

    # A turn is "Questioner: <question> <speaker>: <answer>", where the answer
    # runs until the next "Questioner:" or the end of the text. Both speaker
    # labels are accepted; matching only "Sadhguru:" would produce no pairs at
    # all for transcripts labelled "Saint:".
    qa_pattern = r'Questioner:(.*?)(?:Sadhguru|Saint):(.*?)(?=Questioner:|$)'

    if "Questioner:" in transcript:
        for question, answer in re.findall(qa_pattern, transcript, re.DOTALL):
            pairs.append({
                "prompt": clean_text(question),
                "completion": clean_text(answer),
            })

    if not pairs:
        # Monologue, or speaker labels present but no complete Q&A turn:
        # fall back to a generated prompt rather than dropping the transcript.
        cleaned_text = clean_text(transcript)
        prompt = generate_prompt(cleaned_text)
        pairs.append({"prompt": prompt, "completion": cleaned_text})

    return pairs

# Function to process all transcripts
def preprocess_transcripts(transcript_files, output_file):
    """Turn transcript files into one JSONL file of prompt/completion pairs.

    Args:
        transcript_files: Iterable of paths to UTF-8 text transcripts.
        output_file: Destination JSONL path; it is written only after every
            transcript has been read and processed.
    """
    collected = []
    for path in transcript_files:
        with open(path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
        collected += extract_pairs_from_transcript(raw_text)

    # Write one JSON object per line.
    with jsonlines.open(output_file, mode='w') as writer:
        for record in collected:
            writer.write(record)
    print(f"Saved {len(collected)} prompt-completion pairs to {output_file}")

In [5]:
# Inputs are the files 1.txt through 100.txt, given as relative paths.
output_file = "spiritual_preachings.jsonl"
transcript_files = [str(number) + ".txt" for number in range(1, 101)]
print(transcript_files)

['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt', '10.txt', '11.txt', '12.txt', '13.txt', '14.txt', '15.txt', '16.txt', '17.txt', '18.txt', '19.txt', '20.txt', '21.txt', '22.txt', '23.txt', '24.txt', '25.txt', '26.txt', '27.txt', '28.txt', '29.txt', '30.txt', '31.txt', '32.txt', '33.txt', '34.txt', '35.txt', '36.txt', '37.txt', '38.txt', '39.txt', '40.txt', '41.txt', '42.txt', '43.txt', '44.txt', '45.txt', '46.txt', '47.txt', '48.txt', '49.txt', '50.txt', '51.txt', '52.txt', '53.txt', '54.txt', '55.txt', '56.txt', '57.txt', '58.txt', '59.txt', '60.txt', '61.txt', '62.txt', '63.txt', '64.txt', '65.txt', '66.txt', '67.txt', '68.txt', '69.txt', '70.txt', '71.txt', '72.txt', '73.txt', '74.txt', '75.txt', '76.txt', '77.txt', '78.txt', '79.txt', '80.txt', '81.txt', '82.txt', '83.txt', '84.txt', '85.txt', '86.txt', '87.txt', '88.txt', '89.txt', '90.txt', '91.txt', '92.txt', '93.txt', '94.txt', '95.txt', '96.txt', '97.txt', '98.txt', '99.txt', '100.txt']


In [6]:
# Run the extraction over every listed transcript and write the pairs to
# `output_file`; a missing transcript file raises from open() and stops the run.
preprocess_transcripts(transcript_files, output_file)

Saved 100 prompt-completion pairs to spiritual_preachings.jsonl


In [8]:
import nltk
# Fetch the "punkt" tokenizer data.
# NOTE(review): no visible cell uses nltk — presumably left over from an
# earlier sentence-splitting approach; confirm before keeping.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import jsonlines
from transformers import AutoTokenizer
import sys

def process_pairs(input_file, output_file, model_name):
    """
    Split over-long completions in a JSONL file of prompt-completion pairs.

    Each completion is tokenized with the ``model_name`` tokenizer. Pairs whose
    completion fits within the token budget are copied unchanged; longer ones
    are cut into consecutive chunks of at most that many tokens, each written
    as its own pair. The first chunk keeps the original prompt and later
    chunks get the suffix " (Part N)" with N starting at 2.

    Args:
        input_file (str): Path to input JSONL file
        output_file (str): Path to output JSONL file
        model_name (str): Tokenizer model name

    Raises:
        Exception: Any failure outside per-pair processing (loading the
            tokenizer, opening or writing the files) is reported and then
            re-raised unchanged so the caller can decide what to do.
    """
    try:
        # Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Token budget per completion: the tokenizer's limit capped at 1024,
        # minus 64 tokens of headroom.
        max_length = min(tokenizer.model_max_length, 1024) - 64
        print(f"Using maximum token length: {max_length}")

        new_pairs = []
        error_count = 0
        total_count = 0

        # Read the original pairs
        with jsonlines.open(input_file, mode='r') as reader:
            for pair in reader:
                total_count += 1
                prompt = pair.get('prompt', '')
                completion = pair.get('completion', '')

                try:
                    # Encode the completion to check its length
                    completion_tokens = tokenizer.encode(completion, add_special_tokens=False)

                    # If completion is too long, split it
                    if len(completion_tokens) > max_length:
                        chunks = []
                        for i in range(0, len(completion_tokens), max_length):
                            chunk_tokens = completion_tokens[i:i+max_length]
                            chunk = tokenizer.decode(chunk_tokens)

                            # First chunk keeps the bare prompt; later ones are
                            # labelled "(Part 2)", "(Part 3)", ...
                            chunk_prompt = f"{prompt} (Part {len(chunks)+1})" if chunks else prompt

                            chunks.append({
                                "prompt": chunk_prompt,
                                "completion": chunk
                            })

                        # Added only once the whole pair was split successfully,
                        # so a failing pair contributes nothing.
                        new_pairs.extend(chunks)
                    else:
                        # Keep the original pair if within token limit
                        new_pairs.append(pair)

                except Exception as chunk_error:
                    # Best effort per pair: report, count, and move on.
                    print(f"Error processing pair: {chunk_error}")
                    error_count += 1

        # Save the processed pairs
        with jsonlines.open(output_file, mode='w') as writer:
            for new_pair in new_pairs:
                writer.write(new_pair)

        print(f"Total pairs processed: {total_count}")
        print(f"Pairs in output: {len(new_pairs)}")
        print(f"Errors encountered: {error_count}")
        print(f"Processed pairs saved to {output_file}")

    except Exception as e:
        print(f"Critical error: {e}")
        print(f"Error type: {type(e)}")
        import traceback
        traceback.print_exc()
        # Re-raise rather than exiting the interpreter: an exit here raises
        # SystemExit, which callers using `except Exception` cannot catch, and
        # inside a notebook it terminates the running cell.
        raise

# Example usage
if __name__ == "__main__":
    input_file = "spiritual_preachings.jsonl"  # Replace with your input file
    output_file = "processed_pairs.jsonl"  # Replace with your output file

    # Candidate tokenizers, tried in this order.
    model_names = [
        "gpt2",
        "bert-base-uncased",
        "facebook/opt-350m",
        "distilbert-base-uncased"
    ]

    # Stop at the first tokenizer for which process_pairs returns normally.
    # The next candidate is tried only when process_pairs raises an Exception
    # subclass.
    for model_name in model_names:
        print(f"\nTrying with model: {model_name}")
        try:
            process_pairs(input_file, output_file, model_name)
        except Exception as e:
            print(f"Failed with {model_name}: {e}")
        else:
            break


Trying with model: gpt2


Token indices sequence length is longer than the specified maximum sequence length for this model (1464 > 1024). Running this sequence through the model will result in indexing errors


Using maximum token length: 960
Total pairs processed: 100
Pairs in output: 299
Errors encountered: 0
Processed pairs saved to processed_pairs.jsonl
