In [3]:
from pdfminer.high_level import extract_text
import os

DOCS_DIR = "data/docs"

# Extract the plain text of every PDF found directly inside DOCS_DIR.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of `docs` (and of everything derived from it, such as chunk
# and embedding indices) reproducible across runs and machines.
docs = []
for filename in sorted(os.listdir(DOCS_DIR)):
    if filename.endswith(".pdf"):
        file_path = os.path.join(DOCS_DIR, filename)
        text = extract_text(file_path)
        docs.append(text)

In [4]:
# Display the list of extracted document texts (one string per PDF).
docs

['Scaling Laws for Neural Language Models\n\nJared Kaplan ∗\n\nJohns Hopkins University, OpenAI\n\njaredk@jhu.edu\n\nSam McCandlish∗\n\nOpenAI\n\nsam@openai.com\n\nTom Henighan\n\nTom B. Brown\n\nBenjamin Chess\n\nRewon Child\n\nOpenAI\n\nOpenAI\n\nOpenAI\n\nOpenAI\n\nhenighan@openai.com\n\ntom@openai.com\n\nbchess@openai.com\n\nrewon@openai.com\n\nScott Gray\n\nOpenAI\n\nAlec Radford\n\nOpenAI\n\nJeffrey Wu\n\nOpenAI\n\nDario Amodei\n\nOpenAI\n\nscott@openai.com\n\nalec@openai.com\n\njeffwu@openai.com\n\ndamodei@openai.com\n\nAbstract\n\nWe study empirical scaling laws for language model performance on the cross-entropy loss.\nThe loss scales as a power-law with model size, dataset size, and the amount of compute\nused for training, with some trends spanning more than seven orders of magnitude. Other\narchitectural details such as network width or depth have minimal effects within a wide\nrange. Simple equations govern the dependence of overﬁtting on model/dataset size and the\ndepend

In [5]:
from functools import partial
import nltk
import tiktoken

# Fetch the NLTK "punkt" package that provides the sentence tokenizer loaded below.
nltk.download("punkt")

# Encoding used by `token_size` to measure text length in tokens.
tiktoken_tokenizer = tiktoken.get_encoding("cl100k_base")
# Sentence tokenizer used by `split_sentences`.
# NOTE(review): recent NLTK releases appear to ship Punkt as "punkt_tab" rather
# than this pickle file -- confirm this path still loads on the NLTK version in use.
sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


def split_by_separator(text, sep):
    """Split ``text`` on ``sep`` while keeping the separator attached.

    Every piece except the last one ends with ``sep``; a trailing empty piece
    is dropped. Concatenating the returned pieces therefore reproduces
    ``text`` exactly.

    Args:
        text: The string to split.
        sep: The separator string (must be non-empty, as for ``str.split``).

    Returns:
        A list of substrings of ``text``, in order.
    """
    *leading, tail = text.split(sep)
    pieces = [part + sep for part in leading]
    if tail:
        # Only keep the final fragment when the text does not end with `sep`.
        pieces.append(tail)
    return pieces


def split_sentences(text):
    """Split ``text`` into sentence-sized pieces using ``sentence_tokenizer``.

    Each piece runs from the start offset of one sentence span to the start
    offset of the next one (the last piece runs to the end of ``text``), so
    any characters between two sentences stay attached to the earlier piece.

    Args:
        text: The string to split.

    Returns:
        A list of consecutive substrings of ``text``.
    """
    starts = [begin for begin, _ in sentence_tokenizer.span_tokenize(text)]
    bounds = starts + [len(text)]
    return [text[lo:hi] for lo, hi in zip(bounds, bounds[1:])]


def token_size(text):
    """Return the number of tokens ``tiktoken_tokenizer`` produces for ``text``.

    Args:
        text: The string to measure.

    Returns:
        The token count as an ``int``.
    """
    # By default `encode` raises ValueError when the text contains a special
    # token string such as "<|endoftext|>". Extracted document text is
    # arbitrary, so treat such strings as ordinary text instead of failing.
    return len(tiktoken_tokenizer.encode(text, disallowed_special=()))


class TextSplitter:
    """Split text into chunks that aim to stay within a token budget.

    The text is first broken into small fragments by trying progressively
    finer splitters (blank-line paragraphs, lines, sentences, spaces), and
    the fragments are then greedily merged back into chunks whose size, as
    measured by ``token_size``, does not exceed ``chunk_size``.

    A fragment that is still too large after the finest splitter is kept
    as-is, so an individual chunk can exceed ``chunk_size`` in that case.
    """

    def __init__(self, chunk_size):
        """Store the token budget and build the ordered list of splitters.

        Args:
            chunk_size: Maximum number of tokens targeted per chunk.
        """
        self.chunk_size = chunk_size
        by_paragraph = partial(split_by_separator, sep="\n\n")
        by_line = partial(split_by_separator, sep="\n")
        by_word = partial(split_by_separator, sep=" ")
        # Ordered from coarsest to finest granularity.
        self.splitters = [by_paragraph, by_line, split_sentences, by_word]

    def _split_recursive(self, text, level=0):
        """Break ``text`` into fragments, descending to finer splitters as needed.

        Args:
            text: The string to split.
            level: Index into ``self.splitters`` of the splitter to apply.

        Returns:
            A list of fragments, in document order. A fragment is only larger
            than ``chunk_size`` when every splitter has already been tried.
        """
        fits = token_size(text) <= self.chunk_size
        if fits or level == len(self.splitters):
            return [text]

        pieces = []
        for part in self.splitters[level](text):
            if token_size(part) > self.chunk_size:
                # Still too big: retry this part with the next, finer splitter.
                pieces += self._split_recursive(part, level + 1)
            else:
                pieces.append(part)
        return pieces

    def _merge_splits(self, splits):
        """Greedily concatenate fragments into chunks within the token budget.

        Args:
            splits: Fragments as produced by ``_split_recursive``.

        Returns:
            A list of whitespace-stripped, non-empty chunk strings.
        """
        chunks = []

        def flush(pending):
            # Emit the accumulated text unless it is only whitespace.
            cleaned = pending.strip()
            if cleaned:
                chunks.append(cleaned)

        buffer = ""
        for piece in splits:
            if buffer and token_size(buffer + piece) > self.chunk_size:
                # Adding this piece would overflow: close the current chunk
                # and start a new one with the piece.
                flush(buffer)
                buffer = piece
            else:
                buffer += piece

        flush(buffer)
        return chunks

    def split(self, text):
        """Split ``text`` into a list of chunk strings."""
        return self._merge_splits(self._split_recursive(text))

    def __call__(self, text):
        """Alias for :meth:`split`, so the instance can be used as a function."""
        return self.split(text)

[nltk_data] Downloading package punkt to /Users/huy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Split every document into chunks of at most ~512 tokens and collect all
# chunks in a single flat list (document order is preserved).
chunks = []
text_splitter = TextSplitter(chunk_size=512)
for doc in docs:
    chunks.extend(text_splitter.split(doc))

In [7]:
# Display the flat list of text chunks produced by the splitter.
chunks

['Scaling Laws for Neural Language Models\n\nJared Kaplan ∗\n\nJohns Hopkins University, OpenAI\n\njaredk@jhu.edu\n\nSam McCandlish∗\n\nOpenAI\n\nsam@openai.com\n\nTom Henighan\n\nTom B. Brown\n\nBenjamin Chess\n\nRewon Child\n\nOpenAI\n\nOpenAI\n\nOpenAI\n\nOpenAI\n\nhenighan@openai.com\n\ntom@openai.com\n\nbchess@openai.com\n\nrewon@openai.com\n\nScott Gray\n\nOpenAI\n\nAlec Radford\n\nOpenAI\n\nJeffrey Wu\n\nOpenAI\n\nDario Amodei\n\nOpenAI\n\nscott@openai.com\n\nalec@openai.com\n\njeffwu@openai.com\n\ndamodei@openai.com\n\nAbstract\n\nWe study empirical scaling laws for language model performance on the cross-entropy loss.\nThe loss scales as a power-law with model size, dataset size, and the amount of compute\nused for training, with some trends spanning more than seven orders of magnitude. Other\narchitectural details such as network width or depth have minimal effects within a wide\nrange. Simple equations govern the dependence of overﬁtting on model/dataset size and the\ndepend

In [8]:
from nomic import embed

# Embed every chunk with the nomic-embed-text-v1.5 model using local inference.
# The result is indexed below via its "embeddings" and "usage" entries.
embed_res = embed.text(
    texts=chunks,
    model="nomic-embed-text-v1.5",
    task_type="search_document",
    inference_mode="local",
)

Downloading: 100%|██████████| 274M/274M [00:11<00:00, 23.2MiB/s] 
Verifying: 100%|██████████| 274M/274M [00:00<00:00, 542MiB/s] 


In [9]:
# Report how many embeddings were produced and the total token usage.
print(
    f"\nCreated {len(embed_res['embeddings'])} vector embeddings, "
    f"{embed_res['usage']['total_tokens']} total tokens"
)


Created 53 vector embeddings, 24463 total tokens


In [10]:
import numpy as np


def cosine_similarity(query_vector, vectors):
    """Compute the cosine similarity between one vector and a batch of vectors.

    Args:
        query_vector: 1-D sequence/array of length ``d``.
        vectors: 2-D sequence/array of shape ``(n, d)``; may be empty.

    Returns:
        A 1-D float array of length ``n`` with the cosine similarity of each
        row of ``vectors`` to ``query_vector``. Rows (or a query) with zero
        norm have no defined direction and get a similarity of ``0.0``
        instead of ``nan``. An empty ``vectors`` yields an empty array.
    """
    query_vector = np.asarray(query_vector, dtype=float)
    vectors = np.asarray(vectors, dtype=float)
    if vectors.size == 0:
        # np.array([]) is 1-D, so the row-wise norm below would fail.
        return np.zeros(0, dtype=float)

    dots = np.dot(vectors, query_vector)
    norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vector)
    similarities = np.zeros_like(dots)
    # Only divide where the denominator is non-zero; other entries stay 0.0.
    np.divide(dots, norms, out=similarities, where=norms != 0)
    return similarities

In [None]:
# Sanity check: find which of the remaining chunks is most similar to the
# first chunk. The similarities are computed once and reused for both the
# maximum value and its position.
query_vector = embed_res["embeddings"][0]
vectors = np.array(embed_res["embeddings"][1:])
similarities = cosine_similarity(query_vector, vectors)
# NOTE: `index` is a position within `vectors`, which starts at the second
# embedding, so it corresponds to chunks[index + 1].
value, index = similarities.max(0), similarities.argmax(0)

In [13]:
# Show the highest similarity score and its position within `vectors`.
value, index

(np.float64(0.9048170795401734), np.int64(0))

In [14]:
import json

VECTOR_STORE_FILEPATH = "data/vector_store.json"


class VectorStore:
    """Minimal in-memory vector store that can be persisted as a JSON file.

    Items are dicts that must contain a ``"vector"`` entry (read by
    :meth:`query`); any other entries are carried along unchanged. Items must
    be JSON-serializable for :meth:`save` to succeed.
    """

    def __init__(self):
        # List of item dicts, in insertion order.
        self.store = []

    def add(self, items):
        """Append all ``items`` (an iterable of item dicts) to the store."""
        self.store.extend(items)

    def save(self, file_path=VECTOR_STORE_FILEPATH):
        """Write the whole store to ``file_path`` as JSON, overwriting it."""
        with open(file_path, "w") as f:
            json.dump(self.store, f)

    def load(self, file_path=VECTOR_STORE_FILEPATH):
        """Replace the in-memory store with the JSON content of ``file_path``."""
        with open(file_path, "r") as f:
            self.store = json.load(f)

    def query(self, vector, top_k=10):
        """Return the stored items most similar to ``vector``.

        Args:
            vector: Query vector, compared against each item's ``"vector"``
                using ``cosine_similarity``.
            top_k: Maximum number of items to return.

        Returns:
            Up to ``top_k`` copies of the stored item dicts, each with an
            added ``"score"`` entry, ordered from most to least similar.
            An empty list is returned when the store is empty or when
            ``top_k`` is not positive.
        """
        # `[-0:]` would select *every* index, so a zero (or negative) top_k
        # has to be handled explicitly; an empty store has nothing to rank.
        if top_k <= 0 or not self.store:
            return []

        vectors = [item["vector"] for item in self.store]
        similarities = cosine_similarity(vector, vectors)
        # argsort is ascending: take the last top_k and reverse for best-first.
        top_k_indices = np.argsort(similarities)[-top_k:][::-1]
        return [{**self.store[i], "score": similarities[i]} for i in top_k_indices]

In [15]:
vector_store = VectorStore()

# zip() silently stops at the shorter input, so a stale `embed_res` or
# `chunks` (e.g. after running cells out of order) would otherwise produce a
# truncated or mismatched store without any error.
assert len(embed_res["embeddings"]) == len(chunks), (
    "embeddings and chunks are out of sync; re-run the chunking and embedding cells"
)

# Pair every embedding with the text it was computed from and persist the
# result to the default vector store file.
vectors = [
    {"vector": vector, "text": text}
    for vector, text in zip(embed_res["embeddings"], chunks)
]
vector_store.add(vectors)
vector_store.save()

In [20]:
# Peek at the first stored item (a dict with "vector" and "text" entries).
vector_store.store[0]

{'vector': [0.025192266330122948,
  0.10301493853330612,
  -0.16977433860301971,
  -0.054056257009506226,
  0.0472666397690773,
  0.00872587040066719,
  0.02851969748735428,
  -0.0030272614676505327,
  -0.058842822909355164,
  0.04434778913855553,
  -0.04581887647509575,
  0.008022510446608067,
  0.09275990724563599,
  0.027995722368359566,
  -0.006200325675308704,
  0.023318292573094368,
  -0.0038849851116538048,
  -0.0360867865383625,
  -0.02529829554259777,
  0.006476488430052996,
  -0.04451005905866623,
  -0.012459545396268368,
  0.014795361086726189,
  -0.05941741168498993,
  0.03610574081540108,
  0.05737169459462166,
  -0.024454595521092415,
  -0.026275992393493652,
  -0.04933438077569008,
  -0.002469467930495739,
  0.059806209057569504,
  -0.026146337389945984,
  -0.023411475121974945,
  -0.07631231099367142,
  -0.09153863042593002,
  -0.006486288271844387,
  0.04220665991306305,
  -0.0010079490020871162,
  0.028497548773884773,
  0.04629940539598465,
  0.04328909516334534,
  -

In [23]:
from llama_cpp import Llama

# Load the local GGUF model file used to generate answers.
# NOTE(review): n_ctx presumably sets the context window in tokens -- confirm
# it leaves enough room for the retrieved context plus the generated answer.
llm = Llama(model_path="models/Llama-3.2-1B-Instruct.Q4_K_M.gguf", n_ctx=2048)

llama_load_model_from_file: using device Metal (Apple M1) - 4861 MiB free
llama_model_loader: loaded meta data with 35 key-value pairs and 147 tensors from models/Llama-3.2-1B-Instruct.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Models Meta Llama Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = models-meta-llama-Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 1B
llama_model_loader: - kv   6:     

In [24]:
# The indexed corpus consists of the PDF documents loaded at the top of the
# notebook, so the system prompt describes it as such.
SYSTEM_PROMPT = """You are an assistant that answers user questions about a collection of PDF documents."""

USER_PROMPT = """
Use the following pieces of context to answer the user question.
You must only use the facts from the context to answer.
If the answer cannot be found in the context, say that you don't have enough information to answer the question and provide any relevant facts found in the context.
Don't address \"the context\" explicitly in your answer, answer the question like it's your own knowledge.

Context:
{context}

User Question:
{question}
"""


question = "what is the scalling law?"


# Embed the user's question.
# Dedicated names are used for the query-side results so that the
# document-level `embed_res` and `chunks` built by the indexing cells are not
# overwritten; otherwise re-running those cells would index the wrong data.
query_embed_res = embed.text(
    texts=[question],
    model="nomic-embed-text-v1.5",
    task_type="search_query",
    inference_mode="local",
)
query_vector = query_embed_res["embeddings"][0]

# Find the most relevant chunks in our vector store using semantic search
retrieved_chunks = vector_store.query(query_vector, top_k=3)

# Prepare the context and prompt, and generate an answer with the LLM
context = (
    "\n\n---\n\n".join([chunk["text"] for chunk in retrieved_chunks]) + "\n\n---"
)
user_message = USER_PROMPT.format(context=context, question=question)
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": user_message},
]
chat_completion = llm.create_chat_completion(messages=messages)

llama_perf_context_print:        load time =    9048.58 ms
llama_perf_context_print: prompt eval time =       0.00 ms /  1520 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   238 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =   14882.08 ms /  1758 tokens


In [25]:
# Display the retrieved context string that was inserted into the prompt.
context

'We would like to thank Shan Carter, Paul Christiano, Jack Clark, Ajeya Cotra, Ethan Dyer, Jason Eisner,\nDanny Hernandez, Jacob Hilton, Brice Menard, Chris Olah, and Ilya Sutskever for discussions and for feed-\nback on drafts of this work.\n\n19\n\n\x0cAppendices\n\nA Summary of Power Laws\n\nFor easier reference, we provide a summary below of the key trends described throughout the paper.\n\nParameters Data Compute\n\nBatch Size Equation\n\nN\n\n∞\n\nOptimal\n\n∞\n\nD\n\n∞\n\n∞\n\nEarly Stop\n\nC\n\nFixed\n\nFixed\n\nFixed\n\nL (N ) = (Nc/N )αN\nL (D) = (Dc/D)αD\nL (C) = (Cc/C)αC (naive)\n\nNopt\n\nDopt\n\nCmin\n\nN\n\nN\n\nD\n\n∞\n\nEarly Stop\n\nB (cid:28) Bcrit L (Cmin) = (cid:0)C min\nc\n(cid:0) Nc\nN\n\nL (N, D) =\n\nFixed\n\n(cid:20)\n\n(cid:1)αmin\n\nC\n\n(cid:21)αD\n\n/Cmin\n(cid:1) αN\nαD + Dc\nD\n(cid:16)\n\n(cid:1)αN +\n\n(cid:17)αS\n\nSc\nSmin(S,B)\n\nS steps\n\nB\n\nL (N, S) = (cid:0) Nc\n\nN\n\nTable 4\n\nThe empirical ﬁtted values for these trends are:\n\nPower Law\n\

In [33]:
# Show the message of the first completion choice returned by the LLM.
chat_completion["choices"][0]["message"]

{'role': 'assistant',
 'content': 'The scaling law mentioned in the context is a power-law relationship between the model size, dataset size, and the amount of compute used for training. Specifically, the scaling law is described as:\n\nL(Cmin) = (cid:0)C min\nc\n(cid:0) Nc\nN\n\nL (N, D) =\n\nFixed\n\n(cid:20)\n\n(cid:1)αmin\n\nC\n\n(cid:21)αD\n\n/Cmin\n(cid:1) αN\nαD + Dc\nD\n(cid:16)\n\n(cid:1)αN +\n\n(cid:17)αS\n\nSc\nSmin(S,B)\n\nS steps\n\nB\n\nL (N, S) = (cid:0) Nc\n\nN\n\nThis scaling law indicates that as the model size (N) increases, the dataset size (D) must also increase to maintain a constant loss (L) per step (S). The optimal model size (N*) is achieved when the loss per step (L) is minimized, and the scaling law provides a way to determine this optimal model size based on the model size, dataset size, and compute budget.'}