In [None]:
!pip install transformers scikit-learn docx2txt datasets nltk lancedb 

In [2]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting openai
  Downloading openai-1.35.10-py3-none-any.whl.metadata (21 kB)
Downloading openai-1.35.10-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.3/328.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: openai
Successfully installed openai-1.35.10


In [None]:
!pip install wikipedia

In [None]:
!pip install sentence_transformers tf-keras

## RAG from scratch w/ LanceDB

### Data Loading

In [4]:
import wikipedia

def get_wiki_article(title):
    try:
        page = wikipedia.page(title)
        with open('jamaica.txt', 'w') as f:
            f.write(page.content)
        print("Article successfully written article to text")
    except Exception as e:
        print(f"An error occurred: {e}")

get_wiki_article('Jamaica')

Article successfully written to article.txt


In [1]:
with open('jamaica.txt', 'r') as file:
    text_data = file.read()

### Chunking

In [2]:
# Recursive Text Splitter

import nltk

nltk.download("punkt")
from nltk.tokenize import sent_tokenize
import re


def recursive_text_splitter(text, max_chunk_length=1000, overlap=100):
    """
    Helper function for chunking text recursively
    """
    # Initialize result
    result = []

    current_chunk_count = 0
    separator = ["\n", " "]
    _splits = re.split(f"({separator})", text)
    splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]

    for i in range(len(splits)):
        if current_chunk_count != 0:
            chunk = "".join(
                splits[
                    current_chunk_count
                    - overlap : current_chunk_count
                    + max_chunk_length
                ]
            )
        else:
            chunk = "".join(splits[0:max_chunk_length])

        if len(chunk) > 0:
            result.append("".join(chunk))
        current_chunk_count += max_chunk_length

    return result

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#split text

chunks = recursive_text_splitter(text_data, max_chunk_length=100, overlap=10)
print("Number of Chunks: ", len(chunks))

Number of Chunks:  141


### Embedding

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Choose a pre-trained model (e.g., BERT, RoBERTa, etc.)
# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def embedder(chunk):
    """
    Helper function to embed chunk of text
    """
    # Tokenize the input text
    tokens = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

    # Get the model's output (including embeddings)
    with torch.no_grad():
        model_output = model(**tokens)

    # Extract the embeddings
    embeddings = model_output.last_hidden_state[:, 0, :]
    embed = embeddings[0].numpy()
    return embed


In [5]:
# Embed all the chunks of text
embeds = []
for chunk in chunks:
    embed = embedder(chunks)
    embeds.append(embed)

In [4]:
############ GPU Accelerated Embedding

from sentence_transformers import SentenceTransformer

def embedder(chunks):    
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return model.encode(chunks)

embeds = embedder(chunks)


  from tqdm.autonotebook import tqdm, trange
2024-07-04 18:29:21.011236: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-04 18:29:21.037131: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Vector Store

In [5]:
# Insert text chunks with their embeddings

import lancedb


def prepare_data(chunks, embeddings):
    """
    Helper function to prepare data to insert in LanceDB
    """
    data = []
    for chunk, embed in zip(chunks, embeddings):
        temp = {}
        temp["text"] = chunk
        temp["vector"] = embed
        data.append(temp)
    return data


def lanceDBConnection(chunks, embeddings):
    """
    LanceDB insertion
    """
    db = lancedb.connect("lance.db")
    data = prepare_data(chunks, embeddings)
    table = db.create_table(
        "scratch",
        data=data,
        mode="overwrite",
    )
    return table


table = lanceDBConnection(chunks, embeds)

### Retrieval & Prompt Preparation

In [29]:
# Retriever
k = 5
question = "What is the jamaican motto"

# Embed Question
query_embedding = embedder(question)
# Semantic Search
result = table.search(query_embedding).limit(5).to_list()

In [30]:
context = [r["text"] for r in result]
context

[" Disney comedy Cool Runnings, which is loosely based on the true story of Jamaica's first bobsled team trying to make it in the Winter Olympics.\n\n\n=== Cuisine ===\n\nThe island is famous for its Jamaican jerk spice, curries and rice and peas which is integral to Jamaican cuisine. Jamaica is also home to Red Stripe beer and Jamaican Blue Mountain Coffee.\n\n\n=== National symbols ===\n(From the Jamaica Information Service)\n\nNational bird: red-billed streamertail (also called doctor bird) (a hummingbird, Trochilus polytmus)\nNational flower – lignum vitae (Guiacum officinale)\nNational tree: blue mahoe (Hibiscus talipariti elatum)\nNational fruit: ackee (Blighia",
 ' Commonwealth realm, with Charles III as its king, the appointed representative of the Crown is the Governor-General of Jamaica, an office held by Patrick Allen since 2009.\n\n\n== Etymology ==\nThe indigenous people, the Taíno, called the island Xaymaca in their language, meaning the "Land of Wood and Water" or the "L

In [14]:
# Context Prompt

base_prompt = """Your task is to understand the user question, and provide an answer using the provided contexts. Every answer you generate should have citations in this pattern  "Answer [position].", for example: "Earth is round [1][2].," if it's relevant.

Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state, "The provided context does not have the answer."

User question: {}

Contexts:
{}
"""
     

### Answer Generation

In [31]:
from openai import OpenAI

# llm
prompt = f"{base_prompt.format(question, context)}"

client = OpenAI(api_key="sk-")
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': prompt, }
    ],
    model='gpt-4-turbo-2024-04-09',
    temperature=0,
)

print(response.choices[0].message.content)

The Jamaican motto is "Out of Many One People" [5].


In [32]:
# local Ollama 
from openai import OpenAI


prompt = f"{base_prompt.format(question, context)}"

client = OpenAI(
    base_url='http://192.168.8.116:11434/v1/',
    api_key='ollama',
)

chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': prompt, }
    ],
    model='gemma2:9b-instruct-q5_K_M',
    temperature=0,
)
print(chat_completion.choices[0].message.content)

The Jamaican motto is "Out of Many One People". [2] 



In [33]:
print(prompt)

You are an AI assistant. Your task is to understand the user question, and provide an answer using the provided contexts. Every answer you generate should have citations in this pattern  "Answer [position].", for example: "Earth is round [1][2].," if it's relevant.

Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state, "The provided context does not have the answer."

User question: What is the jamaican motto

Contexts:
[" Disney comedy Cool Runnings, which is loosely based on the true story of Jamaica's first bobsled team trying to make it in the Winter Olympics.\n\n\n=== Cuisine ===\n\nThe island is famous for its Jamaican jerk spice, curries and rice and peas which is integral to Jamaican cuisine. Jamaica is also home to Red Stripe beer and Jamaican Blue Mountain Coffee.\n\n\n=== National symbols ===\n(From the Jamaica Information Service)\n\nNational bird: red-billed streamertail (also called doct