In [1]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from transformers import AutoTokenizer
from llama_index.core import set_global_tokenizer
from llama_index.core.node_parser import HTMLNodeParser
from pathlib import Path
from bs4 import BeautifulSoup
import psycopg
from llama_index.core import PromptTemplate

In [2]:
# Load the Hugging Face tokenizer published for Qwen2.5-14B-Instruct; its
# encode() method is registered as llama_index's global tokenizer below.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct")

# Embedding model used by llama_index for every embedding computed in this
# notebook (it is read back from Settings when the index is built).
Settings.embed_model = HuggingFaceEmbedding(
    model_name = "BAAI/bge-base-en-v1.5"
)

# NOTE(review): presumably this makes llama_index's token counting follow the
# Qwen vocabulary instead of its default tokenizer - confirm against the docs.
set_global_tokenizer(tokenizer.encode)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
# Load every HTML page below data_dir into a llama_index Document and collect
# the names of all tags that occur, as an aid for choosing parser tags later.
data_dir = "/notebooks/ttlg/output/"
tags = []
html_docs = []
for ext in ["*.html"]:
    # rglob() yields paths in arbitrary order; sorting keeps the document
    # order (and everything derived from it) reproducible between runs.
    for path in sorted(Path(data_dir).rglob(ext)):
        # Read raw bytes and decode explicitly: the pages are treated as
        # windows-1252, not the platform default encoding.
        html_text = path.read_bytes().decode("windows-1252")
        # Name the parser explicitly so the result does not depend on which
        # optional parsers (lxml, html5lib) are installed, and so bs4 does not
        # warn about guessing one.
        # NOTE(review): HTMLNodeParser is believed to use "html.parser" as
        # well, which would keep this tag survey consistent with it - confirm.
        soup = BeautifulSoup(html_text, "html.parser")
        tags.extend(tag.name for tag in soup.find_all())
        html_docs.append(Document(text=html_text))


In [4]:
# Sanity check: number of HTML documents loaded by the previous cell.
len(html_docs)

143

In [5]:
# Broader tag list kept for reference; the active selection is the line below.
# tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "b", "i", "u", "section", "blockquote", 'pagetitle']
# NOTE: this rebinds `tags`, discarding the list of tag names collected while
# loading the documents.
tags = ["blockquote", 'pagetitle']

# Build nodes from the loaded documents using only the selected tags, then
# report how many nodes were produced.
parser = HTMLNodeParser(tags=tags)
nodes = parser.get_nodes_from_documents(html_docs)
print(len(nodes))

143


In [6]:
def drop(name):
    """Drop the Postgres table ``name`` if it exists.

    Connects to the ``grover`` database on host ``postgres`` and issues a
    ``drop table if exists`` for the given table.

    Args:
        name: Table name, optionally schema-qualified as ``schema.table``.
            Each dot-separated part is quoted as an SQL identifier, so the
            value cannot inject additional SQL.
    """
    # Local import keeps this cell self-contained; psycopg is already a
    # dependency of the notebook.
    from psycopg import sql

    # Compose the statement from a quoted identifier instead of formatting the
    # raw string into the SQL text.
    statement = sql.SQL("drop table if exists {}").format(
        sql.Identifier(*name.split("."))
    )
    with psycopg.connect(
        "host=postgres dbname=grover user=grover password=grover"
    ) as conn:
        with conn.cursor() as cur:
            cur.execute(statement)
            conn.commit()


# Start from a clean slate before re-indexing.
# NOTE(review): presumably PGVectorStore stores table_name="html" in a table
# named "data_html" (it appears to prefix "data_") - confirm the names match.
drop("data_html")

In [7]:
# Postgres/pgvector-backed vector store for the HTML nodes.
# NOTE(review): connection credentials are hard-coded here (and in drop());
# consider reading them from the environment before sharing this notebook.
vector_store = PGVectorStore.from_params(
    database="grover",
    host="postgres",
    password="grover",
    port=5432,
    user="grover",
    table_name="html",
    # Must equal the output dimension of the embedding model in use;
    # presumably 768 for BAAI/bge-base-en-v1.5 - confirm if the model changes.
    embed_dim=768,
    # HNSW index settings passed through to the store; the distance method
    # selects cosine operators.
    hnsw_kwargs={
        "hnsw_m": 14,
        "hnsw_ef_construction": 72,
        "hnsw_ef_search": 52,
        "hnsw_dist_method": "vector_cosine_ops",
    },
)
# Storage context that routes index storage to the vector store above.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [8]:
# Build the index over the parsed nodes using the Postgres-backed storage
# context; embeddings are generated with the model configured on Settings
# (passed explicitly here) and a progress bar is shown.
index = VectorStoreIndex(nodes, storage_context=storage_context, show_progress=True, embed_model=Settings.embed_model)

Generating embeddings:   0%|          | 0/143 [00:00<?, ?it/s]

In [None]:
def completion_to_prompt(completion):
    """Wrap a bare completion request in the ChatML-style prompt layout.

    The result consists of an empty system turn, a user turn containing
    ``completion``, and an opened assistant turn for the model to continue.

    Args:
        completion: The text to place in the user turn.

    Returns:
        The assembled prompt string.
    """
    system_turn = "<|im_start|>system\n<|im_end|>\n"
    user_turn = f"<|im_start|>user\n{completion}<|im_end|>\n"
    assistant_open = "<|im_start|>assistant\n"
    return system_turn + user_turn + assistant_open

def messages_to_prompt(messages):
    """Render a sequence of chat messages as a ChatML-style prompt.

    Each message with role ``system``, ``user`` or ``assistant`` becomes one
    ``<|im_start|>{role}\\n{content}<|im_end|>\\n`` turn, in input order.
    Messages with any other role are skipped. If the rendered prompt does not
    begin with a system turn, an empty but properly terminated system turn is
    prepended, matching the layout produced by ``completion_to_prompt``. The
    prompt always ends with an opened assistant turn for the model to
    complete.

    Args:
        messages: Iterable of objects exposing ``role`` and ``content``.

    Returns:
        The assembled prompt string.
    """
    # Compare against plain strings and format the literal name, never
    # message.role itself, so the tag text does not depend on how the role
    # object is rendered by str().
    known_roles = ("system", "user", "assistant")

    prompt = ""
    for message in messages:
        for role in known_roles:
            if message.role == role:
                prompt += f"<|im_start|>{role}\n{message.content}<|im_end|>\n"
                break

    if not prompt.startswith("<|im_start|>system"):
        # Close the injected system turn; leaving it open would run the
        # system header straight into the next turn's <|im_start|> marker.
        prompt = "<|im_start|>system\n<|im_end|>\n" + prompt

    return prompt + "<|im_start|>assistant\n"

# Local llama.cpp-backed LLM, configured with the ChatML prompt formatters
# defined above.
llm = LlamaCPP(
    # NOTE(review): the "?download=true" suffix ends up in the cached file
    # name (see the download log); dropping it would give a cleaner path but
    # trigger a fresh download.
    model_url="https://huggingface.co/bartowski/Qwen2.5-14B_Uncensored_Instruct-GGUF/resolve/main/Qwen2.5-14B_Uncensored_Instruct-Q6_K.gguf?download=true",
    temperature=0.1,
    max_new_tokens=1024,
    context_window=8192,
    # Sampling options forwarded to generation.
    # NOTE(review): top_k=0 presumably disables top-k filtering in llama.cpp - confirm.
    generate_kwargs={"repeat_penalty": 1.1, "top_k": 0, "top_p": 0.5, "min_p": 0.1},
    model_kwargs={
        # NOTE(review): -1 presumably offloads all layers to the GPU - confirm.
        "n_gpu_layers": -1,
    },
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

# Make this the default LLM for llama_index components created afterwards.
Settings.llm = llm

Downloading url https://huggingface.co/bartowski/Qwen2.5-14B_Uncensored_Instruct-GGUF/resolve/main/Qwen2.5-14B_Uncensored_Instruct-Q6_K.gguf?download=true to path /tmp/llama_index/models/Qwen2.5-14B_Uncensored_Instruct-Q6_K.gguf?download=true
total size (MB): 12124.68


  2%|▏         | 229/11562 [00:27<29:09,  6.48it/s]

In [None]:
# Ask the LLM directly, without going through the index.
# NOTE(review): the prompt refers to "the provided context", but no context is
# supplied on this path - presumably intended as a no-retrieval baseline.
llm.complete('Using the provided context, answer the following query: What controversial subject is often discussed on the Through The Looking Glass (TTLG) forums?')

In [None]:
# Ask the same question through the index's default query engine and print
# the response.
print(
    index.as_query_engine().query(
        'Using the provided context, answer the following query: What controversial subject is often discussed on the Through The Looking Glass (TTLG) forums?'
    )
)