In [1]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import json
import os

# Assuming your data is structured as a list of passages in each JSON file
texts = []
for filename in os.listdir("/home/etien/Documents/EPFLcourses/MA3/Meditron/Guidelines/split_guidelines/cdc_diseases.jsonl"):
    if filename.endswith(".json"):
        with open(os.path.join("/home/etien/Documents/EPFLcourses/MA3/Meditron/Guidelines/split_guidelines/cdc_diseases.jsonl", filename), 'r') as f:
            data = json.load(f)
            texts.extend(data['title'])

# Initialize DPR encoder and tokenizer
tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-multiset-base')
model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-multiset-base')


# Encode texts
encoded_texts = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
embeddings = model(encoded_texts['input_ids']).pooler_output

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import faiss

# Index the embeddings for fast retrieval
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings.cpu().detach().numpy())

In [5]:
from transformers import RagTokenizer, RagTokenForGeneration

# 1. Given a question, use your FAISS index to retrieve the relevant document IDs:
def get_relevant_doc_ids(question, tokenizer, model, index, k=5):
    # Encode question
    encoded_question = tokenizer(question, return_tensors='pt')
    question_embedding = model(encoded_question['input_ids']).pooler_output
    _, doc_ids = index.search(question_embedding.cpu().detach().numpy(), k)
    return doc_ids[0]

# Dummy function to simulate the retrieval
question = "What are the symptoms of cholera?"
relevant_doc_ids = get_relevant_doc_ids(question, tokenizer, model, index)

# 2. Extend the RagRetriever to use your custom retriever:
# ... (This can be complex and might require adapting the existing RagRetriever class)

# 3. Initialize RAG with your custom retriever
rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
rag_model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq") # But with your custom retriever

# 4. Generate answer with RAG
input_ids = rag_tokenizer.encode(question, return_tensors="pt")
generated_ids = rag_model.generate(input_ids=input_ids, context_input_ids=relevant_doc_ids) # This step is illustrative and the actual call might differ depending on the setup
answer = rag_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(answer)

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]



Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.


Downloading pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-token-nq were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'RagTokenizer' object has no attribute 'encode'

In [None]:
question = "Your question here"
input_dict = rag_tokenizer.prepare_seq2seq_batch(question, return_tensors="pt")
outputs = model.generate(**input_dict)
answer = rag_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)