<a href="https://colab.research.google.com/github/jal9o3/gpt2-rag/blob/main/Webpage_RAG_Query_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT-2 RAG Experiments
This notebook walks through my experiments with applying a retrieval-augmented generation (RAG) approach to the open-source GPT-2 model.

# Scraping Code
This is where we use requests and BeautifulSoup to retrieve the page content.

We then split the extracted text into individual sentences using NLTK.

In [None]:
import requests

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Download the Wikipedia article that serves as the knowledge source.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() stops the run on an HTTP error instead of letting
# an error page be scraped and indexed.
r = requests.get(
    'https://en.wikipedia.org/wiki/Retrieval-augmented_generation',
    timeout=30,
)
r.raise_for_status()
r.text[:500]  # preview only; the full HTML is very long

In [None]:
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(r.text, 'html.parser')
# Uncomment to inspect the parsed document:
#print(soup.prettify())

In [None]:
# Collect every <p> element found in the parsed page.
raw_paragraphs = soup.find_all("p")

In [None]:
# Show the matched <p> elements to check what was scraped.
print(raw_paragraphs)

In [None]:
# Flatten the scraped elements into one text blob, with each paragraph's
# text preceded by a newline.
paragraphs = "".join("\n" + element.text for element in raw_paragraphs)
print(paragraphs)

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

In [None]:
# sent_tokenize() needs the Punkt sentence-tokenizer data. Recent NLTK
# releases (3.9+) load it from the 'punkt_tab' resource, while older releases
# use the pickled 'punkt' resource, so fetch both to work with either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Split the page text into a list of sentences; list positions are used
# later to look sentences up again from vector IDs.
sentences = sent_tokenize(paragraphs)
print(sentences)

# Embedding Code
Here we generate embeddings using a sentence transformer model from Hugging Face.

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-embedding model by its Hugging Face name.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode the scraped sentences into embeddings (presumably one vector per
# sentence; the Pinecone index below is created with dimension=384 to match).
embeddings = model.encode(sentences)
print(embeddings)

In [None]:
# Number of embeddings produced; compared against the stored vector count later.
len(embeddings)

# Pinecone VectorDB Code
Here we upsert the embeddings into a Pinecone index and run a similarity search against it.

In [None]:
from google.colab import userdata
# Read the Pinecone API key from the Colab user data entry named 'PC_API_KEY'
# instead of hardcoding it in the notebook.
PINECONE_API_KEY = userdata.get('PC_API_KEY')

In [None]:
!pip install "pinecone-client[grpc]"

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Create the Pinecone client (gRPC variant), authenticated with PINECONE_API_KEY.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
# Optional reset: uncomment to delete the index and rebuild it from scratch.
# pc.delete_index("webpage-index")

In [None]:
index_name = "webpage-index"

# Create the serverless index only when it does not exist yet, so that
# re-running this cell reuses the existing index.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=384,  # dimensions of the embeddings
        metric="cosine",  # similarity metric used with the embedding model
        spec=serverless_spec,
    )

In [None]:
index = pc.Index(index_name)

# One record per embedding. The ID encodes the embedding's position so a
# query match can later be mapped back to its sentence.
vectors = [
    {"id": f"vec{position}", "values": values}
    for position, values in enumerate(embeddings)
]

index.upsert(vectors, namespace="webpage-1")

In [None]:
# Show the index statistics (includes per-namespace vector counts).
print(index.describe_index_stats())

In [None]:
import time

# Account for upsert delay: upserted vectors are not necessarily visible
# right away, so poll the stats until the namespace holds at least as many
# vectors as were sent, or give up after a timeout. This lets "Run All" reach
# the query cell only once the data is searchable.
expected_count = len(embeddings)
deadline = time.monotonic() + 60  # seconds to wait before giving up

while True:
    index_stats = index.describe_index_stats()
    namespaces = index_stats['namespaces']
    # The namespace may be missing from the stats until its first vectors land.
    if 'webpage-1' in namespaces:
        vector_count = namespaces['webpage-1']['vector_count']
    else:
        vector_count = 0
    if vector_count >= expected_count or time.monotonic() >= deadline:
        break
    time.sleep(1)

# True when the namespace holds exactly the vectors from this run. False means
# either the wait timed out or stale vectors from an earlier run are present.
vector_count == expected_count

In [None]:
# Embed the question with the same sentence-embedding model as the page
# sentences; the single-item batch is unpacked into its only vector.
prompt = "What is Retrieval Augmented Generation?"
(vector,) = model.encode([prompt])

In [None]:
# Inspect the prompt embedding.
print(vector)

In [None]:
# Similarity search: ask the index for the top two matches to the prompt
# embedding within the namespace the sentences were upserted to.
query_results = index.query(
    vector=vector,
    top_k=2,
    include_values=True,
    namespace="webpage-1",
)

#print(query_results)

# List the IDs of the returned matches.
for match in query_results['matches']:
    print(match['id'])

In [None]:
# Recover each match's sentence position from its ID by taking the first
# run of digits in it (temporary approach).
import re
id_nums = [
    int(re.findall(r'\d+', match['id'])[0])
    for match in query_results['matches']
]
print(id_nums)

In [None]:
# Construct background info for GPT-2.
# The page's first sentence always leads the context; each retrieved
# sentence is appended after it.
background = sentences[0] if sentences else ""
for sentence_idx in id_nums:  # not named `id`, which would shadow the builtin
    # Skip index 0 (already included) and any index without a matching
    # sentence, e.g. a stale vector left in the namespace by an earlier run.
    if 0 < sentence_idx < len(sentences):
        background += " " + sentences[sentence_idx]
print(background)

# GPT-2 Code
This is where we use the GPT-2 model from Hugging Face.

In [None]:
!pip install torch

In [None]:
# Final model input: the retrieved context, then the question, wrapped in
# a leading and a trailing newline.
extended_prompt = "\n" + background.strip() + " " + prompt + "\n"
print(extended_prompt)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use half precision only when a GPU is available; on CPU fall back to
# float32, where every operation is supported.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Bound to `gpt2_model` rather than `model` so the SentenceTransformer held in
# `model` stays usable when the embedding/query cells are re-run.
gpt2_model = AutoModelForCausalLM.from_pretrained(
    "gpt2", torch_dtype=dtype, attn_implementation="sdpa"
).to(device)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Keep the full tokenizer output so the attention mask is passed to
# generate() along with the token IDs.
inputs = tokenizer(extended_prompt, return_tensors="pt").to(device)
input_ids = inputs.input_ids

gen_tokens = gpt2_model.generate(
    **inputs,
    do_sample=True,
    temperature=0.9,
    # max_length would count the prompt tokens too, leaving little or no room
    # to generate after a long retrieved context; max_new_tokens budgets only
    # the continuation.
    max_new_tokens=100,
    # GPT-2 has no pad token; reuse EOS to avoid the open-ended-generation warning.
    pad_token_id=tokenizer.eos_token_id,
)
# Decode the single generated sequence (prompt followed by continuation).
gen_text = tokenizer.batch_decode(gen_tokens)[0]

In [None]:
# Show the decoded generation output.
print(gen_text)