In [None]:
!pip install transformers scikit-learn docx2txt datasets nltk lancedb 

In [None]:
!pip install openai==0.28

In [None]:
!pip install wikipedia

In [None]:
!pip install sentence_transformers tf-keras

## RAG from scratch w/ LanceDB

### Data Loading

In [4]:
import wikipedia

def get_wiki_article(title):
    try:
        page = wikipedia.page(title)
        with open('jamaica.txt', 'w') as f:
            f.write(page.content)
        print("Article successfully written article to text")
    except Exception as e:
        print(f"An error occurred: {e}")

get_wiki_article('Jamaica')

Article successfully written to article.txt


In [4]:
with open('jamaica.txt', 'r') as file:
    text_data = file.read()

### Chunking

In [5]:
# Recursive Text Splitter

import nltk

nltk.download("punkt")
from nltk.tokenize import sent_tokenize
import re


def recursive_text_splitter(text, max_chunk_length=1000, overlap=100):
    """
    Helper function for chunking text recursively
    """
    # Initialize result
    result = []

    current_chunk_count = 0
    separator = ["\n", " "]
    _splits = re.split(f"({separator})", text)
    splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]

    for i in range(len(splits)):
        if current_chunk_count != 0:
            chunk = "".join(
                splits[
                    current_chunk_count
                    - overlap : current_chunk_count
                    + max_chunk_length
                ]
            )
        else:
            chunk = "".join(splits[0:max_chunk_length])

        if len(chunk) > 0:
            result.append("".join(chunk))
        current_chunk_count += max_chunk_length

    return result

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
#split text

chunks = recursive_text_splitter(text_data, max_chunk_length=100, overlap=10)
print("Number of Chunks: ", len(chunks))

Number of Chunks:  141


### Embedding

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Choose a pre-trained model (e.g., BERT, RoBERTa, etc.)
# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def embedder(chunk):
    """
    Helper function to embed chunk of text
    """
    # Tokenize the input text
    tokens = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

    # Get the model's output (including embeddings)
    with torch.no_grad():
        model_output = model(**tokens)

    # Extract the embeddings
    embeddings = model_output.last_hidden_state[:, 0, :]
    embed = embeddings[0].numpy()
    return embed


In [5]:
# Embed all the chunks of text
embeds = []
for chunk in chunks:
    embed = embedder(chunks)
    embeds.append(embed)

In [16]:
############ GPU Accelerated Embedding

from sentence_transformers import SentenceTransformer

def embedder(chunks):    
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return model.encode(chunks)

embeds = embedder(chunks)


### Vector Store

In [17]:
# Insert text chunks with their embeddings

import lancedb


def prepare_data(chunks, embeddings):
    """
    Helper function to prepare data to insert in LanceDB
    """
    data = []
    for chunk, embed in zip(chunks, embeddings):
        temp = {}
        temp["text"] = chunk
        temp["vector"] = embed
        data.append(temp)
    return data


def lanceDBConnection(chunks, embeddings):
    """
    LanceDB insertion
    """
    db = lancedb.connect("lance.db")
    data = prepare_data(chunks, embeddings)
    table = db.create_table(
        "scratch",
        data=data,
        mode="overwrite",
    )
    return table


table = lanceDBConnection(chunks, embeds)

### Retrieval & Prompt Preparation

In [32]:
# Retriever
k = 5
question = "Who were the first outsiders to visit jamaica?"

# Embed Question
query_embedding = embedder(question)
# Semantic Search
result = table.search(query_embedding).limit(5).to_list()

In [33]:
context = [r["text"] for r in result]
context

['\nThe first wave of English immigrants arrived to the island 1655 after conquering the Spanish, and they have historically been the dominant group. Prominent descendants from this group include former American Governor of New York David Paterson, Sandals Hotels owner Gordon Butch Stewart, United States Presidential Advisor and "mother" of the Pell Grant Lois Rice, and former United States National Security Advisor and Ambassador to the United Nations Susan Rice. The first Irish immigrants came to Jamaica in the 1600s as war prisoners and later, indentured labour. Their descendants include two of Jamaica\'s National Heroes: Prime Ministers Michael Manley and Alexander Bustamante. Along with',
 ' this group include Canadian billionaire investor Michael Lee-Chin, supermodels Naomi Campbell and Tyson Beckford, reggae producer Leslie Kong, and VP Records founder Vincent "Randy" Chin.\nThere are about 20,000 Jamaicans who have Lebanese and Syrian ancestry. Most were Christian immigrants wh

In [20]:
# Context Prompt

base_prompt = """You are an AI assistant. Your task is to understand the user question, and provide an answer using the provided contexts. Every answer you generate should have citations in this pattern  "Answer [position].", for example: "Earth is round [1][2].," if it's relevant.

Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state, "The provided context does not have the answer."

User question: {}

Contexts:
{}
"""
     

### Answer Generation

In [34]:
import openai

# llm
prompt = f"{base_prompt.format(question, context)}"
response = openai.ChatCompletion.create(
    model="gpt-4-turbo-2024-04-09",
    temperature=0,
    messages=[
        {"role": "system", "content": prompt},
    ],
    api_key="sk-"
)

print(response.choices[0].message.content)

Christopher Columbus was the first European to see Jamaica, claiming the island for Spain after landing there in 1494 on his second voyage to the Americas [3].
