In [1]:
# Import necessary libraries from transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Specify the path to the local GPT-2 model
model_name = "./gpt2-large"

# Load the tokenizer and model from the local directory
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text-generation pipeline using the loaded model and tokenizer
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Display a message confirming the pipeline setup
print("Pipeline setup complete.")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Pipeline setup complete.


In [2]:
# Import PyMuPDF for PDF processing
import fitz  # PyMuPDF

# Define a function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)  # Open the PDF file
    text = ""
    for page in doc:
        text += page.get_text()  # Extract text from each page
    return text

# Extract text from the PDF document
pdf_path = "demo_data_for_RAG.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

# Display the first 500 characters of the extracted text to verify
print("Extracted PDF Text Preview:\n", pdf_text[:500])


Extracted PDF Text Preview:
 Personal Information
Name: John Michael Smith
Date of Birth: January 15, 1985
Place of Birth: New York City, USA
Nationality: American
Gender: Male
Marital Status: Married
Contact Information:
- Address: 1234 Elm Street, Apt. 5B, Brooklyn, NY, 11215
- Phone: (123) 456-7890
- Email: john.m.smith@example.com
Personal Statement
John Michael Smith is a highly motivated and results-driven professional
with over 15 years of experience in the software development industry.
Known for his exceptional pro


In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize the SentenceTransformer model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create a list of documents including the text from the PDF
documents = [
    pdf_text,
    "The quick brown fox jumps over the lazy dog.",
    "Artificial Intelligence is transforming the world.",
    "LangChain provides a powerful interface for working with language models."
]

# Generate embeddings for each document using the SentenceTransformer model
document_embeddings = embedding_model.encode(documents, convert_to_tensor=True).cpu().detach().numpy()

# Initialize FAISS index
index = faiss.IndexFlatL2(document_embeddings.shape[1])  # Initialize a flat (non-hierarchical) index

# Add the embeddings to the FAISS index
index.add(document_embeddings)

# Display a message confirming the index setup
print("FAISS index created and populated with document embeddings using SentenceTransformer.")


FAISS index created and populated with document embeddings using SentenceTransformer.


In [4]:
# Define a function to retrieve the most relevant context based on a question
def retrieve_context(question):
    # Generate an embedding for the query using the SentenceTransformer model
    query_embedding = embedding_model.encode([question], convert_to_tensor=True).cpu().detach().numpy()
    
    # Search the FAISS index to find the most similar document embedding
    distances, indices = index.search(query_embedding, 1)
    
    # Return the most relevant document (context)
    return documents[indices[0][0]]

# Example usage of the retrieval function
context_example = retrieve_context("Tell me about AI.")
print("Retrieved Context:\n", context_example[:500])


Retrieved Context:
 Artificial Intelligence is transforming the world.


In [5]:
# Import the necessary classes from langchain
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Define a prompt template for generating text based on the retrieved context
template = """
Context: {context}

Question: {question}

Please provide a concise answer based on the context above.
Answer:"""


# Initialize the prompt template
prompt = PromptTemplate(template=template)

# Define a custom LLMChain class to integrate retrieval and generation
class CustomLLMChain:
    def __init__(self, pipeline, prompt):
        self.pipeline = pipeline
        self.prompt = prompt

    def __call__(self, inputs):
        # Retrieve context using the question
        context = retrieve_context(inputs["question"])
        
        # Format the prompt with the retrieved context and question
        prompt_input = {"context": context, "question": inputs["question"]}
        prompt_text = self.prompt.format(**prompt_input)
        
        # Generate the response using the pipeline with max_new_tokens
        output = self.pipeline(prompt_text, max_new_tokens=50)[0]['generated_text']
        return output

# Initialize the custom LLMChain
llm_chain = CustomLLMChain(pipeline=pipe, prompt=prompt)

# Display a message confirming the LLMChain setup
print("Custom LLMChain initialized.")


Custom LLMChain initialized.


In [6]:
# Define a sample query
question = "What is the full name of John?"

# Use the LLMChain to generate an answer based on retrieved context
result = llm_chain({"question": question})

# Print the generated answer
print("Generated Answer:\n", result)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Answer:
 
Context: Personal Information
Name: John Michael Smith
Date of Birth: January 15, 1985
Place of Birth: New York City, USA
Nationality: American
Gender: Male
Marital Status: Married
Contact Information:
- Address: 1234 Elm Street, Apt. 5B, Brooklyn, NY, 11215
- Phone: (123) 456-7890
- Email: john.m.smith@example.com
Personal Statement
John Michael Smith is a highly motivated and results-driven professional
with over 15 years of experience in the software development industry.
Known for his exceptional problem-solving skills and ability to lead teams to
success, John is dedicated to advancing technology and improving user
experiences.
Key Strengths
Strong leadership and team management skills
Expertise in software development and project management
Excellent communication and interpersonal abilities
Proficient in multiple programming languages and technologies
Adaptable and quick to learn new skills
Education and Work Experience
Education
Master of Science in Computer