In [None]:
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaTokenizer, LlamaForCausalLM

# Ensure CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model
tokenizer = LlamaTokenizer.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base')
model = LlamaForCausalLM.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base', torch_dtype=torch.bfloat16).to(device)

model.eval()  # Set the model to evaluation mode


In [19]:

# Example dataset
corpus = [
    {"prompt": "What is the 2nd element in the periodic table?",
     "response": "The 2nd element in the periodic table is Helium, with the symbol He and atomic number 2. Calcium is the 20th element.",
     "category": "fact-check"},
    {"prompt": "How many centimeters are in an inch?",
     "response": "1 inch is equal to 2.54 centimeters.",
     "category": "self-consistency"},
    {"prompt": "Which element is the most reactive?",
     "response": "The most reactive element is Fluorine, with the symbol F and atomic number 9.",
     "category": "fact-check"},
    {"prompt": "How many centimeters are in an inch?",
     "response": "1 inch is equal to 2.54 centimeters.",
     "category": "self-consistency"},
    {"prompt": "Are mangoes blue?",
     "response": "No, mangoes are not blue. Mangoes are typically yellow, green, or red when ripe, depending on the variety.",
     "category": "fact-check"},
    {"prompt": "Are mangoes blue?",
     "response": "No, mangoes are not blue. They are usually yellow, green, or red when ripe, depending on the variety.",
     "category": "self-consistency"},
    {"prompt": "Are oranges green?",
     "response": "Oranges are typically orange in color. However, they can sometimes appear green due to the presence of chlorophyll in their skin, especially in tropical climates where they don't ripen fully before harvesting.",
     "category": "self-consistency"},
    {"prompt": "Are oranges green?",
     "response": "Oranges are typically orange in color. However, they can sometimes appear green due to the presence of chlorophyll in their skin, especially in tropical climates where they don't ripen fully before harvesting.",
     "category": "self-consistency"},
    {
        "prompt":"kya aam neela hota hai?",
        "response":"Nahi, aam neela nahi hota.",
        "category'": "self-consistency"
        
    }
]

# Extract responses for TF-IDF
documents = [entry['response'] for entry in corpus]

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

def retrieve_document(query):
    """
    Retrieve the most relevant document based on a query.
    
    Args:
        query (str): Input query to retrieve a relevant document for.
    
    Returns:
        dict: Retrieved document.
    """
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, X)
    most_similar_idx = similarities.argmax()
    return corpus[most_similar_idx]

def generate_text(prompt):
    """
    Generate a text completion for a given prompt using OpenHathi model.
    
    Args:
        prompt (str): Input prompt to generate a text completion for.
    
    Returns:
        str: Generated text.
    """
    inputs = tokenizer(prompt, return_tensors='pt').to(device)  # Move inputs to the device
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            max_length=300,  # Maximum length of generated text
            num_return_sequences=1
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example query


In [23]:
query = "which one is most reactive element?"
retrieved_document = retrieve_document(query)
generated_response = generate_text(f"Answer the query :{query} using the information{retrieved_document} in a comprehensive way in hindi")

# print(f"Query: {query}")
# print(f"Retrieved Document: {retrieved_document['response']}")
print(f"Generated Response: {generated_response}")
# print(f"Category: {retrieved_document['category']}")


Generated Response: Answer the query :which one is most reactive element? using the information{'prompt': 'Which element is the most reactive?', 'response': 'The most reactive element is Fluorine, with the symbol F and atomic number 9.', 'category': 'fact-check'} in a comprehensive way in hindi.
---
चरण 1: प्रश्न को समझें
The query asks which element is the most reactive.

चरण 2: जानकारी का विश्लेषण करें
The information provided is:

1. 'प्रॉम्प्ट': 'कौन सा तत्व सबसे अधिक प्रतिक्रियाशील है?'
2. 'response': 'The most reactive element is Fluorine, with the symbol F and atomic number 9.'
3. 'श्रेणी': 'तथ्य-जाँच'

Step 3: Combine the information
प्रदान की गई जानकारी के आधार पर, सबसे प्रतिक्रियाशील तत्व फ्लोरिन है, जिसका प्रतीक एफ और परमाणु संख्या 9 है।

Step 4: Provide the answer in Hindi
प्रदान की गई जानकारी के आधार पर, सबसे प्रतिक्रियाशील तत्व फ्लोरिन है, जिसका प्रतीक एफ और परमाणु संख्या 9 है।

Step 5: Provide the answer in Hindi
प्रदान की गई जानकारी के आधार पर, सबसे प्रतिक्रियाशील तत्व 