In [None]:
!!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

In [None]:
# Install required libraries
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

# Import libraries
import PyPDF2
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")

def generate_embeddings(chunks):
    inputs = tokenizer(chunks, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

embeddings = generate_embeddings(chunks)

# Build FAISS index
embeddings_np = embeddings.numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# Query RAG system
def query_rag_system(query, top_k=3):
    query_embedding = generate_embeddings([query]).numpy()
    distances, indices = index.search(query_embedding, top_k)
    relevant_chunks = [chunks[idx] for idx in indices[0]]
    input_text = " ".join(relevant_chunks) + " " + query
    inputs = mistral_tokenizer(input_text, return_tensors='pt')
    outputs = mistral_model.generate(**inputs, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
mistral_model.to(device)
embeddings = embeddings.to(device)

In [None]:
!huggingface-cli login

In [None]:
# Install required libraries
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

# Import libraries
import PyPDF2
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")

def generate_embeddings(chunks):
    inputs = tokenizer(chunks, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

embeddings = generate_embeddings(chunks)

# Build FAISS index
embeddings_np = embeddings.numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding
    query_embedding = generate_embeddings([query]).numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Generate response using Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)
    outputs = mistral_model.generate(**inputs, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
mistral_model.to(device)
embeddings = embeddings.to(device)

In [None]:
# Install required libraries
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

# Import libraries
import PyPDF2
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")

def generate_embeddings(chunks):
    inputs = tokenizer(chunks, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

embeddings = generate_embeddings(chunks)

# Build FAISS index
embeddings_np = embeddings.numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding
    query_embedding = generate_embeddings([query]).numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Generate response using Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)
    outputs = mistral_model.generate(**inputs, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
mistral_model.to(device)
embeddings = embeddings.to(device)

In [None]:
# Install required libraries
!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")

def generate_embeddings(chunks):
    inputs = tokenizer(chunks, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

embeddings = generate_embeddings(chunks)

# Build FAISS index
embeddings_np = embeddings.numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding
    query_embedding = generate_embeddings([query]).numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
mistral_model.to(device)
embeddings = embeddings.to(device)

In [None]:
# Install required libraries
!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")

def generate_embeddings(chunks):
    inputs = tokenizer(chunks, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

embeddings = generate_embeddings(chunks)

# Build FAISS index
embeddings_np = embeddings.numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding
    query_embedding = generate_embeddings([query]).numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids.to(mistral_model.device), max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
mistral_model.to(device)
embeddings = embeddings.to(device)

In [None]:
# Install required libraries
!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")

def generate_embeddings(chunks):
    # Correct way to tokenize and encode for instructor-large
    instruction = "Represent the document for retrieval: "
    inputs = tokenizer([instruction + chunk for chunk in chunks], return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Instructor-large returns a tuple of (last_hidden_state, all_hidden_states)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

embeddings = generate_embeddings(chunks)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([instruction + query]).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries
!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for instructor-large
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Instructor-large returns a tuple of (last_hidden_state, all_hidden_states)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for instructor-large
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Instructor-large returns a tuple of (last_hidden_state, all_hidden_states)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large") # Changed to AutoModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for instructor-large
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Instructor-large returns a tuple of (last_hidden_state, all_hidden_states)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for instructor-large
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.encoder(**inputs) # Get only the encoder output
    # Instructor-large returns a tuple of (last_hidden_state, all_hidden_states)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3")
mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.3", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
!huggingface-cli login


In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using hkunlp/instructor-large
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for instructor-large
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.encoder(**inputs) # Get only the encoder output
    # Instructor-large returns a tuple of (last_hidden_state, all_hidden_states)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.encoder(inputs.input_ids, attention_mask=inputs.attention_mask)  # Pass inputs as a positional argument
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.encoder(inputs.input_ids, attention_mask=inputs.attention_mask)  # Pass inputs as a positional argument
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k) #Corrected here

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.encoder(inputs.input_ids, attention_mask=inputs.attention_mask)  # Pass inputs as a positional argument
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k) # This line is now correct

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.encoder(inputs.input_ids, attention_mask=inputs.attention_mask)  # Pass inputs as a positional argument
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0][:]] #Fixed here

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.encoder(inputs.input_ids, attention_mask=inputs.attention_mask)  # Pass inputs as a positional argument
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]] # Fixed here

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
       # The model expects the inputs as a dictionary, not positional arguments
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]] # Fixed here

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(mistral_model.device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
       # The model expects the inputs as a dictionary, not positional arguments
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True).to(mistral_model.device) #Moved tokenizer to the device, removed max_length

    # Generate response using Mistral, using max_new_tokens instead of max_length
    outputs = mistral_model.generate(inputs.input_ids, max_new_tokens=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
       # The model expects the inputs as a dictionary, not positional arguments
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks, ensuring indices are within bounds
    relevant_chunks = [chunks[idx] for idx in indices[0] if idx < len(chunks)]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True).to(mistral_model.device) #Moved tokenizer to the device, removed max_length

    # Generate response using Mistral, using max_new_tokens instead of max_length
    outputs = mistral_model.generate(inputs.input_ids, max_new_tokens=52)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# mistral_model.to(device) #Already in device_map="auto"
# embeddings = embeddings.to(device) # Done earlier and not needed

In [None]:
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def generate_embeddings(texts, instruction):
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# --- Main Execution ---

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'
text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model and set pad token
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_tokenizer.pad_token = mistral_tokenizer.eos_token
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

def query_rag_system(query, top_k=3):
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            idx = indices[i, j]
            if 0 <= idx < len(chunks):
                relevant_chunks.append(chunks[idx])

    input_text = " ".join(relevant_chunks) + " " + query
    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(mistral_model.device)
    outputs = mistral_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

In [None]:
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def generate_embeddings(texts, instruction):
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# --- Main Execution ---

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'
text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model and set pad token
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_tokenizer.pad_token = mistral_tokenizer.eos_token
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

def query_rag_system(query, top_k=3):
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            idx = indices[i, j]
            if 0 <= idx < len(chunks):
                relevant_chunks.append(chunks[idx])

    input_text = " ".join(relevant_chunks) + " " + query
    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(mistral_model.device)
    outputs = mistral_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=5)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

شغال ع المعلج

In [None]:
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# --- Functions ---

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def generate_embeddings(texts, instruction):
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# --- Main Execution ---

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'
text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model and set pad token if needed
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
if mistral_tokenizer.pad_token_id is None:
    mistral_tokenizer.pad_token = mistral_tokenizer.eos_token

mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

def query_rag_system(query, top_k=3):
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            idx = indices[i, j]
            if 0 <= idx < len(chunks):
                relevant_chunks.append(chunks[idx])

    # Limit context size by truncating individual chunks
    max_chunk_length = 256
    relevant_chunks = [chunk[:max_chunk_length] for chunk in relevant_chunks]

    input_text = " ".join(relevant_chunks) + " " + query
    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(mistral_model.device)
    outputs = mistral_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=256)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

شغال احسن

In [None]:
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# --- Functions ---

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def generate_embeddings(texts, instruction):
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# --- Main Execution ---

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'
text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model and set pad token if needed
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
if mistral_tokenizer.pad_token_id is None:
    mistral_tokenizer.pad_token = mistral_tokenizer.eos_token

mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.bfloat16, device_map="auto")

def query_rag_system(query, top_k=3):
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            idx = indices[i, j]
            if 0 <= idx < len(chunks):
                relevant_chunks.append(chunks[idx])

    # Limit context size by truncating individual chunks
    max_chunk_length = 256
    relevant_chunks = [chunk[:max_chunk_length] for chunk in relevant_chunks]

    input_text = " ".join(relevant_chunks) + " " + query
    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(mistral_model.device)
    outputs = mistral_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=64, do_sample=True)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

شغال

In [None]:
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import torch
import faiss
import numpy as np

# --- Functions ---

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def generate_embeddings(texts, instruction):
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# --- Main Execution ---

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'
text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
mistral_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16, device_map="auto")

def query_rag_system(query, top_k=3):
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            idx = indices[i, j]
            if 0 <= idx < len(chunks):
                relevant_chunks.append(chunks[idx])

    # Combine relevant chunks with the query and instruction
    input_text = f"""Context: { " ".join(relevant_chunks) }

    Question: {query}

    Answer:"""

    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(mistral_model.device)
    outputs = mistral_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=512, do_sample=True)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

شغال

In [None]:
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import torch
import faiss
import numpy as np

# --- Functions ---

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def generate_embeddings(texts, instruction):
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# --- Main Execution ---

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'
text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
mistral_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16, device_map="auto")

def query_rag_system(query, top_k=3):
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            idx = indices[i, j]
            if 0 <= idx < len(chunks):
                relevant_chunks.append(chunks[idx])

    # --- Debugging Print Statements ---
    print("Relevant Chunks:")
    for chunk in relevant_chunks:
        print(chunk)
    print("\nInput Text:")
    # --- End Debugging Print Statements ---

    # Combine relevant chunks with the query and instruction
    input_text = f"""Context: { " ".join(relevant_chunks) }

    Question: {query}

    Answer:"""
    print(input_text)

    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(mistral_model.device)
    outputs = mistral_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=512, do_sample=True)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import torch
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="google/flan-t5-base", trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto", max_new_tokens=5, do_sample=True)
pipe(messages)

شغال سريع جدا ع المعالج

In [None]:

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))


In [None]:

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

input_text = "Who is Napoleon Bonaparte?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

input_text = "Who is python?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate text with specified parameters
outputs = model.generate(input_ids, max_new_tokens=512, do_sample=True)

# Decode and print the generated text
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("openai-community/gpt2")
model = T5ForConditionalGeneration.from_pretrained("openai-community/gpt2")

input_text = "Who is python?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate text with specified parameters
outputs = model.generate(input_ids, max_new_tokens=512, do_sample=True)

# Decode and print the generated text
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

شغال

In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)


In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Who is Napoleon Bonaparte?", max_length=30, num_return_sequences=5max_new_tokens=512, do_sample=True)


In [None]:
max_new_tokens=512, do_sample=True)

شغال افضل

In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", max_length=128, num_return_sequences=1, do_sample=True)


In [None]:
truncation=True

شغال ولاالجابة سيئة

In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(444)
generator("Who is Napoleon Bonaparte?", max_length=128, num_return_sequences=1, do_sample=True, truncation=True)


In [None]:
!pip install datasets

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", trust_remote_code=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


In [None]:
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_name="legacy",  # اسم الفهرس الافتراضي
    trust_remote_code=True
)


In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="legacy", trust_remote_code=True, use_dummy_dataset=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


https://huggingface.co/MustEr/rager_legacy/blob/main/psgs_w100.tsv.pkl

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


In [None]:
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_path="/path/to/psgs_w100.tsv.pkl",
    trust_remote_code=True
)


In [None]:
from transformers import RagTokenizer, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])


In [None]:
from transformers import RagTokenizer, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])


https://github.com/chaimol/bio_code/blob/master/practice/tsv2pkl.py

https://www.programmersought.com/article/12217435517/

In [None]:
import cPickle as pickle
f = open('path')
data = pickle.load(f)
print (data)   #show file

In [None]:
import pickle as pkl

In [None]:
!pip install cPickle

In [None]:
import pickle as pkl
f = open('/content/psgs_w100.tsv')
data = pickle.load(f)
print (data)   #show file

In [None]:
import pickle as pkl  # Use pickle instead of cPickle
f = open('/content/psgs_w100.tsv')
data = pickle.load(f)
print (data)   #show file

In [None]:
import pickle as pkl  # Use pickle instead of cPickle

# Open the file in binary read mode ('rb')
f = open('/content/psgs_w100.tsv', 'rb')

data = pickle.load(f)
print (data)   #show file

ayhgشغال

In [None]:
import pandas as pd

# Assuming your TSV file has a header row
df = pd.read_csv('/content/psgs_w100.tsv', sep='\t')

print(df)  # Display the DataFrame

In [None]:
from transformers.models.rag.retrieval_rag import RagRetriever

# تحديد المسار الخاص بملف .tsv
passages_path = "/content/psgs_w100.tsv"

# إنشاء الفهرسة
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_name="exact",
    passages_path=passages_path,
    use_dummy_dataset=True
)

# حفظ الفهرسة في ملف .pkl
retriever.save_index("path/to/psgs_w100.tsv.pkl")


In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

# تحميل البيانات المصغرة
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_path="/content/psgs_w100_index.pkl",
    passages_path="/content/psgs_w100.tsv"
)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# إدخال السؤال
input_dict = tokenizer.prepare_seq2seq_batch("Who developed the theory of relativity?", return_tensors="pt")
generated = model.generate(input_ids=input_dict["input_ids"])

# عرض النتيجة
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])


In [None]:
import pandas as pd

# قراءة ملف البيانات النصية
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", names=["id", "text", "title"])

# عرض أول 5 أسطر من البيانات
print(passages.head())


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# تحميل نموذج لتوليد الـ embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# توليد embeddings للنصوص
embeddings = model.encode(passages["text"].tolist(), convert_to_numpy=True)

# حفظ embeddings كملف .pkl
import faiss
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# حفظ الفهرس
faiss.write_index(index, "/content/psgs_w100_index.pkl")


In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

# تحميل البيانات المصغرة
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_path="/content/psgs_w100_index.pkl",
    passages_path="/content/psgs_w100.tsv"
)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# إدخال السؤال
input_dict = tokenizer.prepare_seq2seq_batch("Who developed the theory of relativity?", return_tensors="pt")
generated = model.generate(input_ids=input_dict["input_ids"])

# عرض النتيجة
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# تحميل نموذج لتوليد embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# توليد embeddings لجميع النصوص
embeddings = model.encode(passages["text"].tolist(), convert_to_numpy=True)

# إنشاء الفهرس
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# حفظ الفهرس
faiss.write_index(index, "/content/psgs_w100_index.pkl")
print(f"تم إنشاء الفهرس بنجاح بعدد عناصر: {index.ntotal}")


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# تحميل النصوص من ملف psgs_w100.tsv
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", names=["id", "text", "title"])

# تحميل نموذج لتوليد embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

# توليد embeddings للنصوص
embeddings = model.encode(passages["text"].tolist(), convert_to_numpy=True)

# إنشاء الفهرس باستخدام FAISS
dimension = embeddings.shape[1]  # أبعاد كل embedding
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)  # إضافة embeddings إلى الفهرس

# حفظ الفهرس
faiss.write_index(index, "/content/psgs_w100_index.pkl")
print(f"تم إنشاء الفهرس بنجاح بعدد عناصر: {index.ntotal}")


In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

# تحميل البيانات
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_path="/content/psgs_w100_index.pkl",
    passages_path="/content/psgs_w100.tsv"
)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# إدخال السؤال
input_dict = tokenizer.prepare_seq2seq_batch("Who developed the theory of relativity?", return_tensors="pt")
generated = model.generate(input_ids=input_dict["input_ids"])

# عرض النتيجة
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])


In [None]:
import pandas as pd

# قراءة ملف النصوص الكامل
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", names=["id", "text", "title"])

# اختيار مجموعة بيانات صغيرة (10 نصوص على سبيل المثال)
small_passages = passages.sample(n=10, random_state=42)  # اختار 10 نصوص عشوائية

# حفظ المجموعة الصغيرة في ملف جديد
small_passages.to_csv("/content/small_psgs_w100.tsv", sep="\t", index=False, header=False)

print(f"تم حفظ مجموعة البيانات الصغيرة بحجم: {len(small_passages)}")


In [None]:
# قراءة ملف النصوص
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", names=["id", "text", "title"])

# تحقق من حجم مجموعة البيانات
print(f"حجم مجموعة البيانات: {len(passes)}")


In [None]:
import pandas as pd

# قراءة ملف النصوص الكامل
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", names=["id", "text", "title"])

# تحقق من حجم مجموعة البيانات
print(f"حجم مجموعة البيانات: {len(passes)}")


In [None]:
import pandas as pd

# قراءة ملف النصوص الكامل
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", names=["id", "text", "title"])

# تحقق من حجم مجموعة البيانات
print(f"حجم مجموعة البيانات: {len(passes)}")  # الخطأ في الاسم


In [None]:
import pandas as pd

# قراءة ملف النصوص الكامل
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", names=["id", "text", "title"])

# تحقق من حجم مجموعة البيانات
print(f"حجم مجموعة البيانات: {len(passes)}")  # هنا الخطأ

# تأكد من طباعة حجم البيانات بشكل صحيح
print(f"حجم مجموعة البيانات: {len(passes)}")


In [None]:
import pandas as pd

# قراءة ملف النصوص الكامل
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", names=["id", "text", "title"])

# تحقق من حجم مجموعة البيانات
print(f"حجم مجموعة البيانات: {len(passages)}")  # Corrected variable name to 'passages'

# تأكد من طباعة حجم البيانات بشكل صحيح
print(f"حجم مجموعة البيانات: {len(passages)}")  # Corrected variable name to 'passages'


In [None]:
# طباعة أول 5 صفوف من البيانات
print(passages.head())


In [None]:
# قراءة ملف النصوص مع تجاهل الصف الأول الذي يحتوي على رؤوس الأعمدة
passages = pd.read_csv("/content/psgs_w100.tsv", sep="\t", header=1, names=["id", "text", "title"])

# طباعة أول 5 صفوف من البيانات بعد التصحيح
print(passages.head())


In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

# تحميل البيانات المصغرة
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_path="/content/psgs_w100_index.pkl",
    passages_path="/content/psgs_w100.tsv"
)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# إدخال سؤال للنموذج
input_dict = tokenizer.prepare_seq2seq_batch("Who developed the theory of relativity?", return_tensors="pt")
generated = model.generate(input_ids=input_dict["input_ids"])

# عرض النتيجة
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])


In [None]:
import faiss
import numpy as np

# إنشاء بيانات مصغرة (تمثل embeddings)
# هذا المثال يتطلب استبدال البيانات الفعلية بـ embeddings المناسبة
dimension = 768  # حدد الأبعاد المناسبة للبيانات الخاصة بك
num_vectors = 3  # عدد النقاط في الفهرس
data = np.random.random((num_vectors, dimension)).astype('float32')

# إنشاء الفهرس
index = faiss.IndexFlatL2(dimension)  # فهرس يعتمد على المسافة L2
index.add(data)  # إضافة البيانات للفهرس

# حفظ الفهرس في ملف .pkl
faiss.write_index(index, "/content/psgs_w100_index.pkl")


In [None]:
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import torch
import faiss
import numpy as np

# --- Functions ---

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def generate_embeddings(texts, instruction):
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# --- Main Execution ---

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'
text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
mistral_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/rag-token-nq", torch_dtype=torch.bfloat16, device_map="auto")

def query_rag_system(query, top_k=3):
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            idx = indices[i, j]
            if 0 <= idx < len(chunks):
                relevant_chunks.append(chunks[idx])

    # --- Debugging Print Statements ---
    print("Relevant Chunks:")
    for chunk in relevant_chunks:
        print(chunk)
    print("\nInput Text:")
    # --- End Debugging Print Statements ---

    # Combine relevant chunks with the query and instruction
    input_text = f"""Context: { " ".join(relevant_chunks) }

    Question: {query}

    Answer:"""
    print(input_text)

    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(mistral_model.device)
    outputs = mistral_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=512, do_sample=True)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

In [None]:
#!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import torch
import faiss
import numpy as np

# --- Functions ---

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def generate_embeddings(texts, instruction):
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# --- Main Execution ---

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'
text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(text)

# Generate embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
mistral_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16, device_map="auto")

def query_rag_system(query, top_k=3):
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)

    relevant_chunks = []
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            idx = indices[i, j]
            if 0 <= idx < len(chunks):
                relevant_chunks.append(chunks[idx])

    # --- Debugging Print Statements ---
    print("Relevant Chunks:")
    for chunk in relevant_chunks:
        print(chunk)
    print("\nInput Text:")
    # --- End Debugging Print Statements ---

    # Combine relevant chunks with the query and instruction
    input_text = f"""Context: { " ".join(relevant_chunks) }

    Question: {query}

    Answer:"""
    print(input_text)

    inputs = mistral_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(mistral_model.device)
    outputs = mistral_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=512, do_sample=True)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

In [None]:
# Install required libraries


# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cpu") # Force CPU usage
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device) #Moved tokenizer to device
    with torch.no_grad():
        outputs = model.encoder(inputs.input_ids, attention_mask=inputs.attention_mask)  # Pass inputs as a positional argument
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float32) # Removed device_map="auto" and used float32
mistral_model.to(device) #Moved model to the device

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]]

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Removed device selection because we explicitly set it to CPU
# model.to(device) #Moved model to device earlier
# mistral_model.to(device) #Already in device_map="auto" # Now moved to device explicitly
# embeddings = embeddings.to(device) # Now not needed

In [None]:
# Install required libraries
!pip install transformers faiss-cpu sentence-transformers PyPDF2 torch

# Import libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import faiss
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = '/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf'  # Replace with your PDF file path
text = extract_text_from_pdf(pdf_path)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_text_into_chunks(text)

# Generate embeddings using sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cpu") # Force CPU usage
model.to(device)

def generate_embeddings(texts, instruction):
    # Correct way to tokenize and encode for sentence-transformers/all-MiniLM-L6-v2
    inputs = tokenizer([instruction + text for text in texts], return_tensors='pt', padding=True, truncation=True, max_length=512).to(device) #Moved tokenizer to device
    with torch.no_grad():
        outputs = model.encoder(inputs.input_ids, attention_mask=inputs.attention_mask)  # Pass inputs as a positional argument
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for the chunks
instruction = "Represent the document for retrieval: "
embeddings = generate_embeddings(chunks, instruction)

# Build FAISS index
embeddings_np = embeddings.cpu().numpy() #Move embeddings to cpu
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Load Mistral model
mistral_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
mistral_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float32) # Removed device_map="auto" and used float32
mistral_model.to(device) #Moved model to the device

# Query RAG system
def query_rag_system(query, top_k=3):
    # Generate query embedding (with instruction this time)
    instruction = "Represent the query for retrieval: "
    query_embedding = generate_embeddings([query], instruction).cpu().numpy()

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve relevant chunks
    relevant_chunks = [chunks[idx] for idx in indices[0]] # Corrected this line

    # Combine relevant chunks with the query
    input_text = " ".join(relevant_chunks) + " " + query

    # Tokenize input for Mistral
    inputs = mistral_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True).to(device) #Moved tokenizer to the device

    # Generate response using Mistral
    outputs = mistral_model.generate(inputs.input_ids, max_length=512)
    response = mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example query
query = "What is the main topic of the document?"
response = query_rag_system(query)
print(response)

# Ensure models run on GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Removed device selection because we explicitly set it to CPU
# model.to(device) #Moved model to device earlier
# mistral_model.to(device) #Already in device_map="auto" # Now moved to device explicitly
# embeddings = embeddings.to(device) # Now not needed

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


In [None]:
!rm -rf /root/.cache

https://huggingface.co/datasets/facebook/wiki_dpr/tree/main/data/psgs_w100

شغال رائع

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, trust_remote_code=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


In [None]:
!rm -rf /path/to/your/folder/*

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, trust_remote_code=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
# Remove use_dummy_dataset=True
# or change to a dataset that has dummy dataset support
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact")
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
# Remove index_name="exact"
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq")
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")

# Use the dummy dataset
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact")

model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration, RagConfig

# Load the configuration file
config = RagConfig.from_pretrained("facebook/rag-token-nq")

# Change the dataset and dataset split
config.dataset = "CoTracker3_Kubric"
config.dataset_split = "train"

# Initialize the tokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")

# Initialize the retriever with the modified configuration
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", config=config, index_name="compressed")

# Initialize the model
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# Generate and print the response
input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")
generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration, RagConfig

# Load the configuration file
config = RagConfig.from_pretrained("facebook/rag-token-nq")

# Change the dataset and dataset split
config.dataset = "CoTracker3_Kubric"
config.dataset_split = "train"

# Initialize the tokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")

# Initialize the retriever with the modified configuration and enable the dummy dataset
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", config=config, index_name="exact", use_dummy_dataset=True)

# Initialize the model
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# Generate and print the response
input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")
generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration, RagConfig

# Load the configuration file
config = RagConfig.from_pretrained("facebook/rag-token-nq")

# Change the dataset and dataset split
config.dataset = "CoTracker3_Kubric"
config.dataset_split = "train"

# Initialize the tokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")

# Initialize the retriever with the original config
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)

# Replace the retriever's config with the modified config
retriever.config = config

# Initialize the model
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# Generate and print the response
input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")
generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration, RagConfig
import faiss

# ... [Load your data and generate embeddings as before] ...

# Assuming you have `embeddings` as a PyTorch tensor
embeddings_np = embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]

# Create a Faiss index (e.g., IndexFlatL2)
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Customize retrieval paths
index_path = "path/to/your/index.faiss"  # Save index to this path
passages_path = "path/to/your/passages.pkl" # Save passages to this path

# Save index and passages
faiss.write_index(index, index_path)
# ... Save your passages to passages_path using pickle or a similar method ...

# Initialize RagRetriever using custom dataset and index
config = RagConfig.from_pretrained("facebook/rag-token-nq")
config.index_path = index_path
config.passages_path = passages_path

retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    config=config,
    index_name="custom",  # Use a custom index name
    use_dummy_dataset=False  # Disable dummy dataset
)

# Initialize the model
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# ... [Rest of the code as before] ...

In [None]:
import pandas as pd
import pickle

# تحميل البيانات من ملف CSV
data = pd.read_csv("your_data_file.csv")

# حفظ البيانات كملف tsv.pkl
with open("your_data_file.tsv.pkl", "wb") as f:
    pickle.dump(data, f)

In [None]:
awacke1/data.csv

In [None]:
import pandas as pd
import pickle

# تحميل البيانات من ملف CSV
data = pd.read_csv("awacke1/data.csv")

# حفظ البيانات كملف tsv.pkl
with open("your_data_file.tsv.pkl", "wb") as f:
    pickle.dump(data, f)

In [None]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("awacke1/data.csv")

In [None]:
from datasets import load_dataset

# تحديد مسار المجلد المطلوب
data_dir = "/content/a"

# تحميل البيانات إلى المجلد المحدد
ds = load_dataset("awacke1/data.csv", data_dir=data_dir)

شغال تحويل

In [None]:
import pandas as pd
import pickle

# تحميل البيانات من ملف CSV
data = pd.read_csv("/content/a.csv")

# حفظ البيانات كملف tsv.pkl
with open("your_data_file.tsv.pkl", "wb") as f:
    pickle.dump(data, f)

شغال تحويل

In [11]:
import pandas as pd
import pickle

# تحميل البيانات من ملف CSV
data = pd.read_csv("/content/file.tsv")

# حفظ البيانات كملف tsv.pkl
with open("aa.tsv.pkl", "wb") as f:
    pickle.dump(data, f)

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
from datasets import load_dataset

# Initialize tokenizer and model
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact")  # Remove use_dummy_dataset=True
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

# Load and index your custom dataset
dataset = load_dataset('csv', data_files='/content/your_data_file.tsv.pkl')
dataset = dataset.map(lambda example: {'embeddings': retriever.embed_documents([example['text']])[0]})
retriever.add(dataset['train']['embeddings'], dataset['train']['text'], dataset['train']['title'])


input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


In [None]:
/content/your_data_file.tsv.pkl

https://github.com/huggingface/transformers/issues/23884

https://prokfunfind.readthedocs.io/en/latest/outputs.html

In [14]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq")
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

OSError: Can't load 'psgs_w100.tsv.pkl'. Make sure that:

- 'https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/' is a correct remote path to a directory containing a file named psgs_w100.tsv.pkl

- or 'https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/' is the correct path to a directory containing a file named psgs_w100.tsv.pkl.



In [None]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files='my_data.csv')  # بدون with_index

In [15]:
!rm /root/.cache/huggingface/datasets/wiki_dpr/dummy.psgs_w100.nq.no_index-dummy=True,with_index=False

rm: cannot remove '/root/.cache/huggingface/datasets/wiki_dpr/dummy.psgs_w100.nq.no_index-dummy=True,with_index=False': Is a directory


In [18]:


!rm -rf /root/.cache/huggingface/datasets

In [20]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

OSError: Can't load 'psgs_w100.tsv.pkl'. Make sure that:

- 'https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/' is a correct remote path to a directory containing a file named psgs_w100.tsv.pkl

- or 'https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/' is the correct path to a directory containing a file named psgs_w100.tsv.pkl.



In [None]:
/root/.cache/huggingface/hub/datasets--wiki_dpr/snapshots/0ae2454140a2d6864475c83f26e6dc9cd4ab9ce4/wiki_dpr.py

In [None]:
/root/.cache/huggingface/hub/datasets--wiki_dpr/snapshots/0ae2454140a2d6864475c83f26e6dc9cd4ab9ce4/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

In [None]:
/root/.cache/huggingface/hub/datasets--wiki_dpr/snapshots/0ae2454140a2d6864475c83f26e6dc9cd4ab9ce4/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

تحويل ملف CSV إلى Parquet

In [1]:
!pip install pandas pyarrow



In [3]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [4]:
df = pd.read_csv('/content/sample_data/mnist_test.csv')  # استبدل 'your_file.csv' باسم ملفك

In [5]:
table = pa.Table.from_pandas(df)
pq.write_table(table, 'a.parquet')  # استبدل 'your_file.parquet' باسم ملف Parquet الجديد

In [None]:
/root/.cache/huggingface/hub/datasets--wiki_dpr/snapshots/0ae2454140a2d6864475c83f26e6dc9cd4ab9ce4/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

In [10]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", iuse_dummy_dataset=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("What is the content of the data file?", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

OSError: Can't load 'psgs_w100.tsv.pkl'. Make sure that:

- 'https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/' is a correct remote path to a directory containing a file named psgs_w100.tsv.pkl

- or 'https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/' is the correct path to a directory containing a file named psgs_w100.tsv.pkl.



In [7]:
input_dict = tokenizer.prepare_seq2seq_batch("What is the content of the data file?", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])



NameError: name 'model' is not defined

In [13]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
#retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

AttributeError: 'NoneType' object has no attribute 'shape'

In [1]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
# You need to initialize the retriever and pass it to the model
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq")
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt")

generated = model.generate(input_ids=input_dict["input_ids"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# should give michael phelps => sounds reasonable

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoder

OSError: Can't load 'psgs_w100.tsv.pkl'. Make sure that:

- 'https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/' is a correct remote path to a directory containing a file named psgs_w100.tsv.pkl

- or 'https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/' is the correct path to a directory containing a file named psgs_w100.tsv.pkl.



https://huggingface.co/docs/transformers/main/en/model_doc/rag#transformers.RagTokenForGeneration

In [None]:
from transformers import AutoTokenizer, RagRetriever, RagTokenForGeneration
import torch

tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
)
# initialize with RagRetriever to do everything in one forward call
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
input_ids = inputs["input_ids"]
labels = targets["input_ids"]
outputs = model(input_ids=input_ids, labels=labels)

# or use retriever separately
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
# 1. Encode
question_hidden_states = model.question_encoder(input_ids)[0]
# 2. Retrieve
docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
doc_scores = torch.bmm(
    question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
).squeeze(1)
# 3. Forward to generator
outputs = model(
    context_input_ids=docs_dict["context_input_ids"],
    context_attention_mask=docs_dict["context_attention_mask"],
    doc_scores=doc_scores,
    decoder_input_ids=labels,
)

# or directly generate
generated = model.generate(
    context_input_ids=docs_dict["context_input_ids"],
    context_attention_mask=docs_dict["context_attention_mask"],
    doc_scores=doc_scores,
)
generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)

In [2]:
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
# 1. Encode
question_hidden_states = model.question_encoder(input_ids)[0]
# 2. Retrieve
docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
doc_scores = torch.bmm(
    question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
).squeeze(1)

Some weights of the model checkpoint at facebook/rag-token-nq were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'input_ids' is not defined

In [5]:
from transformers import AutoTokenizer, RagRetriever, RagTokenForGeneration
import torch

tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
)
# initialize with RagRetriever to do everything in one forward call
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
input_ids = inputs["input_ids"]
labels = targets["input_ids"]
outputs = model(input_ids=input_ids, labels=labels)

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

(…)_encoder_tokenizer/tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

question_encoder_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)ncoder_tokenizer/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


(…)enerator_tokenizer/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

generator_tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

generator_tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)erator_tokenizer/special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may res

README.md:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

wiki_dpr.py:   0%|          | 0.00/8.63k [00:00<?, ?B/s]

The repository for wiki_dpr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wiki_dpr.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train-00000-of-00001.parquet:   0%|          | 0.00/40.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

  0%|          | 0/10 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-token-nq were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
# 1. Encode
question_hidden_states = model.question_encoder(input_ids)[0]
# 2. Retrieve
docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
doc_scores = torch.bmm(
    question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
).squeeze(1)

Some weights of the model checkpoint at facebook/rag-token-nq were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
!rm -rf /root/.cache

In [7]:
from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration
import torch

tokenizer = AutoTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
)
# initialize with RagRetriever to do everything in one forward call
model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
input_ids = inputs["input_ids"]
labels = targets["input_ids"]
outputs = model(input_ids=input_ids, labels=labels)

# or use retriever separately
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
# 1. Encode
question_hidden_states = model.question_encoder(input_ids)[0]
# 2. Retrieve
docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
doc_scores = torch.bmm(
    question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
).squeeze(1)
# 3. Forward to generator
outputs = model(
    context_input_ids=docs_dict["context_input_ids"],
    context_attention_mask=docs_dict["context_attention_mask"],
    doc_scores=doc_scores,
    decoder_input_ids=labels,
)

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

(…)_encoder_tokenizer/tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

question_encoder_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)ncoder_tokenizer/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


(…)enerator_tokenizer/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

generator_tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

generator_tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)erator_tokenizer/special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may res

pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-sequence-nq were not used when initializing RagSequenceForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagSequenceForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagSequenceForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
from transformers import AutoTokenizer, RagRetriever, TFRagModel
import torch

tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
)
# initialize with RagRetriever to do everything in one forward call
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)

input_dict = tokenizer.prepare_seq2seq_batch(
    "How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf"
)
input_ids = input_dict["input_ids"]
outputs = model(input_ids)

config.json:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

(…)_encoder_tokenizer/tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

question_encoder_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)ncoder_tokenizer/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


(…)enerator_tokenizer/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

generator_tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

generator_tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)erator_tokenizer/special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may res

pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRagModel: ['rag.generator.model.decoder.layers.8.encoder_attn.v_proj.weight', 'rag.generator.model.decoder.layers.11.fc2.bias', 'rag.question_encoder.question_encoder.bert_model.encoder.layer.3.attention.output.dense.weight', 'rag.generator.model.decoder.layers.5.final_layer_norm.weight', 'rag.generator.model.decoder.layers.11.self_attn.q_proj.weight', 'rag.generator.model.encoder.layers.4.fc2.weight', 'rag.generator.model.encoder.layers.1.self_attn_layer_norm.bias', 'rag.generator.model.encoder.layers.1.final_layer_norm.bias', 'rag.generator.model.encoder.layers.8.self_attn.v_proj.weight', 'rag.question_encoder.question_encoder.bert_model.encoder.layer.8.attention.self.query.weight', 'rag.question_encoder.question_encoder.bert_model.encoder.layer.1.attention.self.value.weight', 'rag.generator.model.decoder.layers.1.final_layer_norm.weight', 'rag.generator.model.encoder.layers.7.self_attn_layer_norm.we

In [None]:
from transformers import AutoTokenizer, RagRetriever, TFRagModel
import torch

tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
)
# initialize with RagRetriever to do everything in one forward call
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)

# Instead of prepare_seq2seq_batch, use __call__ and with_target_tokenizer
with tokenizer.as_target_tokenizer():
    input_dict = tokenizer(
        "How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf"
    )

input_ids = input_dict["input_ids"]
outputs = model(input_ids)

In [9]:
from transformers import AutoTokenizer, RagRetriever, TFRagModel
import torch

tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
)
# initialize with RagRetriever to do everything in one forward call
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)

input_dict = tokenizer.prepare_seq2seq_batch(
    "How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf"
)
input_ids = input_dict["input_ids"]
outputs = model(input_ids)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [4]:
from transformers import AutoTokenizer, RagRetriever, TFRagModel
import torch
from datasets import load_dataset
# 1. تحميل البيانات وتقسيمها
#dataset = load_dataset("wiki_dpr", split="train")  # تحميل dataset wiki_dpr
#dataset = dataset.select(range(10000))  # اختيار أول 10000 مثال

dataset = load_dataset("wiki_dpr", "psgs_w100.nq.exact", split="train")  # specify config
dataset = dataset.select(range(10000))  # اختيار أول 10000 مثال

# 2. تهيئة tokenizer و retriever
tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base",
    index_name="custom",  # اختيار index_name
    use_dummy_dataset=False,
    indexed_dataset=dataset,  # تحديد dataset
)

# 3. تهيئة و تدريب نموذج RAG
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# ... (كود لتدريب النموذج على dataset) ...

# 4. طرح سؤال استدلالي
with tokenizer.as_target_tokenizer():
    input_dict = tokenizer(
        "who holds the record in 100m freestyle", return_tensors="tf"
    )

input_ids = input_dict["input_ids"]
outputs = model(input_ids)

# 5. عرض النتيجة
print(tokenizer.decode(outputs.logits, skip_special_tokens=True))

Downloading data:   0%|          | 0/157 [00:00<?, ?files/s]

train-00000-of-00157.parquet:   0%|          | 0.00/545M [00:00<?, ?B/s]

train-00001-of-00157.parquet:   0%|          | 0.00/546M [00:00<?, ?B/s]

train-00002-of-00157.parquet:   0%|          | 0.00/546M [00:00<?, ?B/s]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-d618abf89f4a>", line 8, in <cell line: 0>
    dataset = load_dataset("wiki_dpr", "psgs_w100.nq.exact", split="train")  # specify config
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/datasets/load.py", line 2151, in load_dataset
    builder_instance.download_and_prepare(
  File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 924, in download_and_prepare
    self._download_and_prepare(
  File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 978, in _download_and_prepare
    split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache

TypeError: object of type 'NoneType' has no len()

In [2]:
!pip install datasets




In [None]:
import pandas as pd

# Download the Parquet file
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

# Read the Parquet file into a Pandas DataFrame
df = pd.read_parquet('train-00000-of-00001.parquet')

# Now you can use the DataFrame 'df' in your RAG system
# ... (your code to process the DataFrame and integrate it into your RAG system) ...

In [None]:
import pandas as pd
from transformers import AutoTokenizer, RagRetriever, TFRagModel
import torch
from datasets import Dataset

# Download the Parquet file
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

# Read the Parquet file into a Pandas DataFrame
df = pd.read_parquet('train-00000-of-00001.parquet')

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# 2. تهيئة tokenizer و retriever
tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base",
    index_name="custom",  # اختيار index_name
    use_dummy_dataset=False,
    indexed_dataset=dataset,  # تحديد dataset
)

# 3. تهيئة و تدريب نموذج RAG
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# ... (كود لتدريب النموذج على dataset) ...

# 4. طرح سؤال استدلالي
with tokenizer.as_target_tokenizer():
    input_dict = tokenizer(
        "who holds the record in 100m freestyle", return_tensors="tf"
    )

input_ids = input_dict["input_ids"]
outputs = model(input_ids)

# 5. عرض النتيجة
print(tokenizer.decode(outputs.logits, skip_special_tokens=True))

In [None]:
import pandas as pd
from transformers import AutoTokenizer, RagRetriever, TFRagModel
import torch
from datasets import Dataset
import faiss

# Download the Parquet file
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

# Read the Parquet file into a Pandas DataFrame
df = pd.read_parquet('train-00000-of-00001.parquet')

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Generate embeddings for your dataset using a SentenceTransformer or other embedding model
from sentence_transformers import SentenceTransformer
model_st = SentenceTransformer('all-mpnet-base-v2') # Or any other model
embeddings = model_st.encode(dataset['text'])  # Assuming 'text' column holds the document content
embeddings = np.array(embeddings).astype("float32")

# Add FAISS index to the dataset
dataset.add_faiss_index(column='embeddings', index_name='custom') # Changed index name to 'custom'
dataset.add_faiss_index("embeddings")


# ... (rest of the code remains the same)

شغال

In [1]:
import pandas as pd
from transformers import AutoTokenizer, RagRetriever, TFRagModel
import torch
from datasets import Dataset
import faiss
import numpy as np

# Download the Parquet file
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

# Read the Parquet file into a Pandas DataFrame
df = pd.read_parquet('train-00000-of-00001.parquet')

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Generate embeddings for your dataset using a SentenceTransformer or other embedding model
from sentence_transformers import SentenceTransformer
model_st = SentenceTransformer('all-mpnet-base-v2') # Or any other model
embeddings = model_st.encode(dataset['text'])  # Assuming 'text' column holds the document content
embeddings = np.array(embeddings).astype("float32")

# Add FAISS index to the dataset
dataset.add_faiss_index(column='embeddings', index_name='custom') # Changed index name to 'custom'
dataset.add_faiss_index("embeddings")

# 2. تهيئة tokenizer و retriever
tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base",
    index_name="custom",  # اختيار index_name
    use_dummy_dataset=False,
    indexed_dataset=dataset,  # تحديد dataset
)

# 3. تهيئة و تدريب نموذج RAG
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# ... (كود لتدريب النموذج على dataset) ...

# 4. طرح سؤال استدلالي
with tokenizer.as_target_tokenizer():
    input_dict = tokenizer(
        "who holds the record in 100m freestyle", return_tensors="tf"
    )

input_ids = input_dict
outputs = model(input_ids)

# 5. عرض النتيجة
print(tokenizer.decode(outputs.logits, skip_special_tokens=True))

--2025-01-21 04:43:06--  https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet
Resolving huggingface.co (huggingface.co)... 18.164.174.17, 18.164.174.55, 18.164.174.23, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.17|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/datasets/wiki_dpr/06e56bc71db027bed1c6cf437b8ffd4ed4b30e37082944bac164ad7722110288?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27train-00000-of-00001.parquet%3B+filename%3D%22train-00000-of-00001.parquet%22%3B&Expires=1737438187&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNzQzODE4N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kYXRhc2V0cy93aWtpX2Rwci8wNmU1NmJjNzFkYjAyN2JlZDFjNmNmNDM3YjhmZmQ0ZWQ0YjMwZTM3MDgyOTQ0YmFjMTY0YWQ3NzIyMTEwMjg4P3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=fmOCX24hhO8JehQuUS0W-KDBmbeAv5HE0zCP7Ht%7

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import pandas as pd
from transformers import AutoTokenizer, RagRetriever, TFRagModel, DPRContextEncoder, DPRQuestionEncoder
import torch
from datasets import Dataset
import faiss
import numpy as np


# Download the Parquet file
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

# Read the Parquet file into a Pandas DataFrame
df = pd.read_parquet('train-00000-of-00001.parquet')

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)


# 1. تحميل نماذج DPR
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")


# 2. إنشاء تضمينات باستخدام DPR
def embed_passages(passages):
  input_ids = tokenizer(
      passages["text"],
      truncation=True,
      padding="longest",
      return_tensors="pt",
  )["input_ids"]
  with torch.no_grad():
    embeddings = ctx_encoder(input_ids).pooler_output
  return embeddings.cpu().numpy()

dataset = dataset.map(embed_passages, batched=True, batch_size=16)


# 3. إضافة فهرس FAISS
dataset.add_faiss_index(column="embeddings", index_name="custom")


# 4. تهيئة tokenizer و retriever
tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base",
    index_name="custom",
    use_dummy_dataset=False,
    indexed_dataset=dataset,
)


# ... (بقية الكود يبقى كما هو) ...

# 3. تهيئة و تدريب نموذج RAG
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# ... (كود لتدريب النموذج على dataset) ...

# 4. طرح سؤال استدلالي
with tokenizer.as_target_tokenizer():
    input_dict = tokenizer(
        "who holds the record in 100m freestyle", return_tensors="tf"
    )

input_ids = input_dict
outputs = model(input_ids)

# 5. عرض النتيجة
print(tokenizer.decode(outputs.logits, skip_special_tokens=True))

In [None]:
import pandas as pd
from transformers import AutoTokenizer, RagRetriever, TFRagModel, DPRContextEncoder, DPRQuestionEncoder
import torch
from datasets import Dataset
import faiss
import numpy as np

# Download the Parquet file
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

# Read the Parquet file into a Pandas DataFrame
df = pd.read_parquet('train-00000-of-00001.parquet')

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# 1. تحميل نماذج DPR
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# 2. إنشاء تضمينات باستخدام DPR
tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")  # Use DPR tokenizer
def embed_passages(passages):
  input_ids = tokenizer(
      passages["text"],
      truncation=True,
      padding="longest",
      return_tensors="pt",
  )["input_ids"]
  with torch.no_grad():
    embeddings = ctx_encoder(input_ids).pooler_output
  return {"embeddings": embeddings.cpu().numpy()}

dataset = dataset.map(embed_passages, batched=True, batch_size=16)

# 3. إضافة فهرس FAISS
dataset.add_faiss_index(column="embeddings", index_name="custom")

# 4. تهيئة tokenizer و retriever
tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base",
    index_name="custom",
    use_dummy_dataset=False,
    indexed_dataset=dataset,
)

# 5. تهيئة و تدريب نموذج RAG (Placeholder for training)
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# ... (كود لتدريب النموذج على dataset) ...


# 6. طرح سؤال استدلالي
with tokenizer.as_target_tokenizer():
    input_dict = tokenizer(
        "who holds the record in 100m freestyle", return_tensors="tf"
    )

input_ids = input_dict
outputs = model(input_ids)

# 7. عرض النتيجة
print(tokenizer.decode(outputs.logits, skip_special_tokens=True))

ششششششششششششش

شغال

In [1]:
import pandas as pd
from transformers import AutoTokenizer, RagRetriever, TFRagModel, DPRContextEncoder, DPRQuestionEncoder
import torch
from datasets import Dataset
import faiss
import numpy as np

# Download the Parquet file
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

# Read the Parquet file into a Pandas DataFrame
df = pd.read_parquet('train-00000-of-00001.parquet')

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# 1. تحميل نماذج DPR
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# 2. إنشاء تضمينات باستخدام DPR
tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")  # Use DPR tokenizer
def embed_passages(passages):
  input_ids = tokenizer(
      passages["text"],
      truncation=True,
      padding="longest",
      return_tensors="pt",
  )["input_ids"]
  with torch.no_grad():
    embeddings = ctx_encoder(input_ids).pooler_output
  return {"embeddings": embeddings.cpu().numpy()}

dataset = dataset.map(embed_passages, batched=True, batch_size=16)

# 3. إضافة فهرس FAISS
dataset.add_faiss_index(column="embeddings", index_name="custom")

# 4. تهيئة tokenizer و retriever
tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base",
    index_name="custom",
    use_dummy_dataset=False,
    indexed_dataset=dataset,
)

# 5. تهيئة و تدريب نموذج RAG (Placeholder for training)
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# ... (كود لتدريب النموذج على dataset) ...


# 6. طرح سؤال استدلالي
with tokenizer.as_target_tokenizer():
    input_dict = tokenizer(
        "who holds the record in 100m freestyle", return_tensors="tf"
    )

input_ids = input_dict
outputs = model(input_ids)

# 7. عرض النتيجة
print(tokenizer.decode(outputs.logits, skip_special_tokens=True))

--2025-01-21 05:32:33--  https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet
Resolving huggingface.co (huggingface.co)... 18.164.174.17, 18.164.174.55, 18.164.174.23, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.17|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/datasets/wiki_dpr/06e56bc71db027bed1c6cf437b8ffd4ed4b30e37082944bac164ad7722110288?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27train-00000-of-00001.parquet%3B+filename%3D%22train-00000-of-00001.parquet%22%3B&Expires=1737441153&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNzQ0MTE1M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kYXRhc2V0cy93aWtpX2Rwci8wNmU1NmJjNzFkYjAyN2JlZDFjNmNmNDM3YjhmZmQ0ZWQ0YjMwZTM3MDgyOTQ0YmFjMTY0YWQ3NzIyMTEwMjg4P3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=XHdFxSivtpS5tVTtQ%7EYy7uGV-3lHvYMzYE%7ELk

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 

In [None]:
import pandas as pd
from datasets import Dataset

ششششششششششششششش

In [1]:
import pandas as pd
from transformers import AutoTokenizer, RagRetriever, TFRagModel, DPRContextEncoder, DPRQuestionEncoder
import torch
from datasets import Dataset
import faiss
import numpy as np



!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

df = pd.read_parquet('train-00000-of-00001.parquet')
df = df.head(1000)
dataset = Dataset.from_pandas(df)




# 1. تحميل نماذج DPR
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# 2. إنشاء تضمينات باستخدام DPR
tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")  # Use DPR tokenizer
def embed_passages(passages):
  input_ids = tokenizer(
      passages["text"],
      truncation=True,
      padding="longest",
      return_tensors="pt",
  )["input_ids"]
  with torch.no_grad():
    embeddings = ctx_encoder(input_ids).pooler_output
  return {"embeddings": embeddings.cpu().numpy()}

dataset = dataset.map(embed_passages, batched=True, batch_size=16)

# 3. إضافة فهرس FAISS
dataset.add_faiss_index(column="embeddings", index_name="custom")

# 4. تهيئة tokenizer و retriever
tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-base",
    index_name="custom",
    use_dummy_dataset=False,
    indexed_dataset=dataset,
)

# 5. تهيئة و تدريب نموذج RAG (Placeholder for training)
model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# ... (كود لتدريب النموذج على dataset) ...


# 6. طرح سؤال استدلالي
with tokenizer.as_target_tokenizer():
    input_dict = tokenizer(
        "who holds the record in 100m freestyle", return_tensors="tf"
    )

input_ids = input_dict
outputs = model(input_ids)

# 7. عرض النتيجة
print(tokenizer.decode(outputs.logits, skip_special_tokens=True))







--2025-01-21 05:47:40--  https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet
Resolving huggingface.co (huggingface.co)... 18.164.174.118, 18.164.174.23, 18.164.174.55, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.118|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/datasets/wiki_dpr/06e56bc71db027bed1c6cf437b8ffd4ed4b30e37082944bac164ad7722110288?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27train-00000-of-00001.parquet%3B+filename%3D%22train-00000-of-00001.parquet%22%3B&Expires=1737442060&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNzQ0MjA2MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kYXRhc2V0cy93aWtpX2Rwci8wNmU1NmJjNzFkYjAyN2JlZDFjNmNmNDM3YjhmZmQ0ZWQ0YjMwZTM3MDgyOTQ0YmFjMTY0YWQ3NzIyMTEwMjg4P3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=Q6Uo0wPuvYL6S%7EsgWEQu405JOcT9u3q3sHHZK

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly iden

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

ValueError: Missing faiss index in the dataset. Make sure you called `dataset.add_faiss_index` to compute it or `dataset.load_faiss_index` to load one from the disk.

In [None]:
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

df = pd.read_parquet('train-00000-of-00001.parquet')
df = df.head(1000)
dataset = Dataset.from_pandas(df)







In [None]:
!wget https://huggingface.co/datasets/facebook/wiki_dpr/resolve/main/data/psgs_w100/dummy.nq/train-00000-of-00001.parquet

df = pd.read_parquet('train-00000-of-00001.parquet')
df = df.head(1000)
dataset = Dataset.from_pandas(df)