In [25]:
%pip install -qU langchain tiktoken matplotlib seaborn tqdm bs4 requests uuid

Note: you may need to restart the kernel to use updated packages.


In [8]:
import requests
from bs4 import BeautifulSoup
import re
import uuid

In [9]:
# Collect wiki data
# Page to scrape; the same URL is stored as the 'source' of every chunk record.
url = 'https://en.wikipedia.org/wiki/Luke_Skywalker'
# Local file that receives the cleaned page text.
text_path = 'luke_skywalker_test.txt'

In [12]:
# scraper function
def scrape_wiki_page(url, output_file, timeout=30):
    """
    Scrapes a Wikipedia page, strips link markup and reference markers, and saves the text.

    Args:
        url: The URL of the Wikipedia page to scrape.
        output_file: The path to the output text file (written as UTF-8).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The cleaned text: one entry per heading/paragraph, joined with
        newlines. This is the same string that is written to ``output_file``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than "cleaning" and saving an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # A <sup> whose whole text is one bracketed marker: [13], [c], [citation needed], ...
    bracket_marker = re.compile(r'\[[^\[\]]*\]')

    cleaned_blocks = []
    # Only headings and paragraphs carry the article text we want.
    for block in soup.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p']):
        # Reference markers are <sup> tags nested INSIDE the block, so they
        # have to be searched for there; the block itself is never a <sup>.
        # Genuine superscripts (e.g. exponents) do not match and are kept.
        for sup in block.find_all('sup'):
            if bracket_marker.fullmatch(sup.get_text(strip=True)):
                sup.extract()

        # get_text() already keeps link text while dropping the <a> markup,
        # so links need no separate handling.
        block_text = block.get_text().strip()

        # Fallback for numeric markers that were not wrapped in a <sup>.
        cleaned_blocks.append(re.sub(r'\[\d+\]', '', block_text))

    # Empty blocks are kept on purpose: they become blank lines in the output.
    cleaned_text = '\n'.join(cleaned_blocks)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

    return cleaned_text

In [13]:
# scrape wiki page: writes the cleaned text to `text_path` and keeps it in memory for chunking
cleaned_text = scrape_wiki_page(url, text_path)

In [14]:
# Chunking
import tiktoken

# All token counts in this notebook are measured with the cl100k_base encoding.
# NOTE(review): the model queried later is a Llama-3 model, whose tokenizer
# presumably differs from cl100k_base — treat the counts as approximate; confirm.
tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the notebook-level `tokenizer`.

    Args:
        text: The string to measure.

    Returns:
        The number of tokens produced by `tokenizer.encode`.
    """
    # An empty `disallowed_special` is passed through unchanged to the encoder.
    return len(tokenizer.encode(text, disallowed_special=()))

In [15]:
# Total token count of the scraped page (displayed as the cell output)
tiktoken_len(cleaned_text)

6682

In [16]:
# find encoding for model
# encoding = tiktoken.encoding_for_model("gpt-4")  # only works for OpenAI model names

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter whose sizes are measured by `tiktoken_len`, i.e. in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # upper bound per chunk, in units of length_function
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=tiktoken_len,
    separators=['\n\n', '\n', ' ', '']  # blank line, newline, space, then empty string
)

In [18]:
# Split the cleaned page text into chunks; the last expression displays how many were produced.
chunks = text_splitter.split_text(cleaned_text)
len(chunks)

24

In [None]:
# Print every chunk preceded by its token count, with an empty line after each one.
for chunk in chunks:
    print(tiktoken_len(chunk), chunk)
    print()

In [None]:
# One record per chunk. Ids are "<random hex prefix>-<chunk index>", with the
# prefix regenerated on every run of this cell.
uid = uuid.uuid4().hex
data = [
    dict(id=f'{uid}-{i}', text=chunk, source=url)
    for i, chunk in enumerate(chunks)
]
data

In [None]:
# Vector DB
# import faiss

# index = faiss.IndexFlatL2(768)

In [None]:
# Similarity

In [None]:
# Input User Prompt
# NOTE: this string is passed to query_llm() as-is; no retrieved chunks are attached to it yet.
query = "Who is Luke Skywalker?"

In [None]:
# Prepare LLM Input

In [None]:
# LLM API Call
def query_llm(prompt, timeout=60):
    """
    Send `prompt` to the Together completions endpoint and return the generated text.

    The API key is read from the TOGETHER_API_KEY environment variable so that
    no credential is stored in the notebook. Any key that was previously
    committed in this cell should be treated as leaked and rotated.

    Args:
        prompt: The prompt string sent to the model.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the first completion choice, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If TOGETHER_API_KEY is not set.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    import os  # local import keeps this cell self-contained

    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "TOGETHER_API_KEY is not set; export it before calling query_llm()."
        )

    # Named `endpoint` so it does not shadow the notebook-level `url` (the wiki page).
    endpoint = "https://api.together.xyz/v1/completions"

    payload = {
        "model": "meta-llama/Llama-3-8b-chat-hf",
        "prompt": prompt,
        "max_tokens": 4000,
        "stop": ["</s>"]
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    response = requests.post(endpoint, json=payload, headers=headers, timeout=timeout)
    # Surface API errors as exceptions instead of a KeyError on a missing "choices" field.
    response.raise_for_status()

    return response.json()["choices"][0]["text"].strip()

In [None]:
# Output
# Sends the raw user query to the LLM and prints the returned completion text.
response = query_llm(query)
print(response)