In [None]:
!pip install transformers scikit-learn docx2txt datasets nltk lancedb 

In [2]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting openai
  Downloading openai-1.35.10-py3-none-any.whl.metadata (21 kB)
Downloading openai-1.35.10-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.3/328.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: openai
Successfully installed openai-1.35.10


In [None]:
!pip install wikipedia

In [None]:
!pip install sentence_transformers tf-keras

## RAG from scratch w/ LanceDB

### Data Loading

In [4]:
import wikipedia

def get_wiki_article(title):
    try:
        page = wikipedia.page(title)
        with open('jamaica.txt', 'w') as f:
            f.write(page.content)
        print("Article successfully written article to text")
    except Exception as e:
        print(f"An error occurred: {e}")

get_wiki_article('Jamaica')

Article successfully written to article.txt


In [1]:
with open('jamaica.txt', 'r') as file:
    text_data = file.read()

### Chunking

In [2]:
# Recursive Text Splitter

import nltk

nltk.download("punkt")
from nltk.tokenize import sent_tokenize
import re


def recursive_text_splitter(text, max_chunk_length=1000, overlap=100):
    """
    Helper function for chunking text recursively
    """
    # Initialize result
    result = []

    current_chunk_count = 0
    separator = ["\n", " "]
    _splits = re.split(f"({separator})", text)
    splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]

    for i in range(len(splits)):
        if current_chunk_count != 0:
            chunk = "".join(
                splits[
                    current_chunk_count
                    - overlap : current_chunk_count
                    + max_chunk_length
                ]
            )
        else:
            chunk = "".join(splits[0:max_chunk_length])

        if len(chunk) > 0:
            result.append("".join(chunk))
        current_chunk_count += max_chunk_length

    return result

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#split text

chunks = recursive_text_splitter(text_data, max_chunk_length=100, overlap=10)
print("Number of Chunks: ", len(chunks))

Number of Chunks:  141


### Embedding

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Choose a pre-trained model (e.g., BERT, RoBERTa, etc.)
# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def embedder(chunk):
    """
    Helper function to embed chunk of text
    """
    # Tokenize the input text
    tokens = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

    # Get the model's output (including embeddings)
    with torch.no_grad():
        model_output = model(**tokens)

    # Extract the embeddings
    embeddings = model_output.last_hidden_state[:, 0, :]
    embed = embeddings[0].numpy()
    return embed


In [5]:
# Embed all the chunks of text
embeds = []
for chunk in chunks:
    embed = embedder(chunks)
    embeds.append(embed)

In [4]:
############ GPU Accelerated Embedding

from sentence_transformers import SentenceTransformer

def embedder(chunks):    
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return model.encode(chunks)

embeds = embedder(chunks)


  from tqdm.autonotebook import tqdm, trange
2024-07-04 18:29:21.011236: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-04 18:29:21.037131: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Vector Store

In [5]:
# Insert text chunks with their embeddings

import lancedb


def prepare_data(chunks, embeddings):
    """
    Helper function to prepare data to insert in LanceDB
    """
    data = []
    for chunk, embed in zip(chunks, embeddings):
        temp = {}
        temp["text"] = chunk
        temp["vector"] = embed
        data.append(temp)
    return data


def lanceDBConnection(chunks, embeddings):
    """
    LanceDB insertion
    """
    db = lancedb.connect("lance.db")
    data = prepare_data(chunks, embeddings)
    table = db.create_table(
        "scratch",
        data=data,
        mode="overwrite",
    )
    return table


table = lanceDBConnection(chunks, embeds)

### Retrieval & Prompt Preparation

In [58]:
# Retriever
k = 5
question = "what birds are found in jamaica"

# Embed Question
query_embedding = embedder(question)
# Semantic Search
result = table.search(query_embedding).limit(5).to_list()

In [59]:
context = [r["text"] for r in result]
context

[' including the endangered black-Billed parrots and the Jamaican blackbird, both of which are only found in Cockpit Country. It is also the indigenous home to four species of hummingbirds (three of which are found nowhere else in the world): the black-billed streamertail, the Jamaican mango, the Vervain hummingbird, and red-billed streamertails. The red-billed streamertail, known locally as the "doctor bird", is Jamaica\'s National Symbol. Other notable species include the Jamaican tody and the Greater flamingo,\nOne species of freshwater turtle is native to Jamaica, the Jamaican slider. It is found only on Jamaica and on a few islands in the Bahamas.',
 " wild boar and the small Asian mongoose are also common. Jamaica is also home to about 50 species of reptiles, the largest of which is the American crocodile; however, it is only present within the Black River and a few other areas. Lizards such as anoles, iguanas and snakes such as racers and the Jamaican boa (the largest snake on t

In [14]:
# Context Prompt

base_prompt = """Your task is to understand the user question, and provide an answer using the provided contexts. Every answer you generate should have citations in this pattern  "Answer [position].", for example: "Earth is round [1][2].," if it's relevant.

Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state, "The provided context does not have the answer."

User question: {}

Contexts:
{}
"""
     

### Answer Generation

In [60]:
from openai import OpenAI

# llm
prompt = f"{base_prompt.format(question, context)}"

client = OpenAI(api_key="sk-")
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': prompt, }
    ],
    model='gpt-4-turbo-2024-04-09',
    temperature=0,
)

print(response.choices[0].message.content)

Birds found in Jamaica include the endangered black-billed parrots and the Jamaican blackbird, which are both unique to Cockpit Country. Additionally, Jamaica is home to four species of hummingbirds, three of which are endemic: the black-billed streamertail, the Jamaican mango, and the red-billed streamertail, also known as the "doctor bird" and is Jamaica's national bird. Other notable bird species include the Jamaican tody and the Greater flamingo [1][2].


In [62]:
# local Ollama 
from openai import OpenAI


prompt = f"{base_prompt.format(question, context)}"

client = OpenAI(
    base_url='http://192.168.8.116:11434/v1/',
    api_key='ollama',
)

chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': prompt, }
    ],
    model='gemma2:27b-instruct-q5_K_M',
    temperature=0,
)
print(chat_completion.choices[0].message.content)

The provided contexts mention that the **Doctor Bird** is a species of bird found in Jamaica.

It also mentions that the **Jamaican Jerk Chicken** is a popular dish, but it doesn't specify if it's a traditional Jamaican dish or just a general term for "jerk" cuisine. 

Here are some additional details about the contexts:

* **"The Jamaican tody**, **one of the most notable endemic birds of Jamaica",** is mentioned as being found in the region, but not specifically where within the country it's native to.
* The **Lignum Vitae** (Guaiacum officinale) is a tree species, and the text mentions that it's the national bird, but doesn't specify if it's also a popular example of Jamaican flora.

Let me know if you'd like to know more about Jamaica's unique biodiversity!


In [55]:
print(prompt)

You are an AI assistant. Your task is to understand the user question, and provide an answer using the provided contexts. Every answer you generate should have citations in this pattern  "Answer [position].", for example: "Earth is round [1][2].," if it's relevant.

Your answers are correct, high-quality, and written by an domain expert. If the provided context does not contain the answer, simply state, "The provided context does not have the answer."

User question: What is the largest city in jamaica

Contexts:
[" over the previous year.\nMany Jamaicans are hostile towards LGBT and intersex people, and mob attacks against gay people have been reported. Numerous high-profile dancehall and ragga artists have produced songs featuring explicitly homophobic lyrics. This has prompted the formations of LGBT rights organisations such as Stop Murder Music. Male homosexuality is illegal and punishable by imprisonment.\n\n\n=== Major cities ===\n\n\n== Religion ==\n\nChristianity is the largest