# Crypto Whitepaper LLM

In [None]:
from qdrant_client import QdrantClient
from src.corpus import *
from src.rag import *
from src.pipeline import *
from src.imaging import *

  from .autonotebook import tqdm as notebook_tqdm


### Step 1: Reading all whitepapers (PDF files) from `data/raw_pdfs/`

Loading all PDF files:

In [None]:
# Load the corpus, passing the top-level "data" directory to load_corpus.
# The recorded output shows one dict per document with the keys
# 'document_class', 'project_id', 'text' and 'source_path'.
docs = load_corpus("data")
docs  # bare trailing expression so the notebook displays the loaded documents

[{'document_class': 'raw_pdfs',
  'project_id': 'aave',
  'text': 'Protocol Whitepaper\nV1.0\n[EMAIL]\nJanuary 2020\nAbstract\nThis document describes the definitions and theory behind the Aave Protocol explaining the different aspects\nof the implementation.\nContents\nIntroduction\n1.1\nBasic Concepts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n1.2\nFormal Definitions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n2\nProtocol Architecture\n2.1\nLending Pool Core . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n2.2\nLending Pool Data Provider . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n2.3\nLending Pool . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n2.4\nLending Pool Configurator . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [None]:
# Quick sanity check of the loaded corpus: report how many documents were
# loaded, then inspect the first one (skipped when nothing was loaded).
print(f"Loaded {len(docs)} documents")
if docs:
    first_doc = docs[0]
    print(first_doc.keys())  # which fields each document carries
    print(len(first_doc["text"]))  # length of the extracted text

Loaded 6 documents
dict_keys(['document_class', 'project_id', 'text', 'source_path'])
27774


### Step 2: Setting up the RAG

In [None]:
# Turn the loaded documents into chunk records for retrieval.
# The recorded output shows each record as a dict with the keys
# 'id' (e.g. 'aave_0'), 'project_id', 'source', 'chunk_index' and 'text'.
chunk_objects = create_chunk_objects(docs=docs)
chunk_objects  # bare trailing expression so the notebook displays the records

[{'id': 'aave_0',
  'project_id': 'aave',
  'source': 'data\\raw_pdfs\\aave.pdf',
  'chunk_index': 0,
  'text': 'Protocol Whitepaper V1.0 [EMAIL] January 2020 Abstract This document describes the definitions and theory behind the Aave Protocol explaining the different aspects of the implementation. Contents Introduction 1.1 Basic Concepts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1.2 Formal Definitions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 2 Protocol Architecture 2.1 Lending Pool Core . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 2.2 Lending Pool Data Provider . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 2.3 Lending Pool . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 2.4 Lending Pool Configurator . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [None]:
# Compute an embedding for the chunk records.
# In the recorded run this produced a float32 array of shape (219, 384),
# matching the 219 chunks uploaded in the final step.
chunk_embeddings = embed_chunks(chunk_objects)
chunk_embeddings  # bare trailing expression so the notebook displays the array

Batches: 100%|██████████| 7/7 [00:25<00:00,  3.60s/it]


array([[-5.01484983e-02, -3.35568301e-02, -5.43546788e-02, ...,
         1.10461907e-02,  3.74264978e-02,  2.32786201e-02],
       [-4.88143452e-02, -6.07051738e-02, -7.75696710e-02, ...,
        -1.07340422e-02,  6.35056663e-03,  9.09574423e-03],
       [-1.42356027e-02, -3.49780694e-02, -6.34338930e-02, ...,
         4.72240262e-02, -3.76252392e-05,  5.34903780e-02],
       ...,
       [-6.57549798e-02,  2.52599455e-02, -6.67224359e-03, ...,
         8.54725689e-02,  5.60708530e-02,  1.34296678e-02],
       [-8.73252079e-02, -2.24423539e-02, -4.96248938e-02, ...,
        -5.14123542e-03,  2.34074332e-02, -1.09556150e-02],
       [-1.05926186e-01, -8.31515156e-03, -9.39507782e-02, ...,
         7.61780515e-02,  4.14076708e-02,  3.81451249e-02]],
      shape=(219, 384), dtype=float32)

In [None]:
# Initialise the Qdrant collection; the helper returns two values, which are
# kept as the client handle and the collection name for the upload step.
# NOTE(review): the embeddings are presumably passed so the collection can be
# configured for their vector size — confirm in init_qdrant_collection.
qdrant_client, COLLECTION_NAME = init_qdrant_collection(chunk_embeddings)

In [None]:
# Upload the chunk records together with their embeddings into the collection.
# qdrant_client and COLLECTION_NAME come from the two-value unpacking of
# init_qdrant_collection in the previous cell; do not rebind qdrant_client to
# that call's raw return value here, as it would hold both values at once.
upload_to_qdrant(qdrant_client, chunk_objects, chunk_embeddings, COLLECTION_NAME)

Uploaded 219 chunks.
