In [39]:
import torch
import numpy as np
import gc
import pandas as pd
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset, load_dataset_builder

model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

foundation_model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    config=bnb_config,
    device_map = {"": 0}
    )


tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

OutOfMemoryError: CUDA out of memory. Tried to allocate 44.00 MiB. GPU 

In [1]:
from src.quantize import quantize, unquantize

ModuleNotFoundError: No module named 'trl'

In [5]:
arxiv_name = 'arxiv_dataset'
arxiv_builder = load_dataset_builder(arxiv_name)
arxiv_ds = load_dataset(name=arxiv_name, split='train', path='/home/smckean/Raw/arxiv_dataset')

  table = cls._concat_blocks(blocks, axis=0)


In [80]:
import torch
from transformers import AutoModel, AutoTokenizer

# use GPU if available, on mac can use MPS
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "intfloat/e5-base-v2"

# initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [81]:
def embed(docs: list[str]) -> list[list[float]]:
    docs = [f"passage: {d}" for d in docs]
    # tokenize
    tokens = tokenizer(
        docs, padding=True, max_length=512, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        # process with model for token-level embeddings
        out = model(**tokens)
        # mask padding tokens
        last_hidden = out.last_hidden_state.masked_fill(
            ~tokens["attention_mask"][..., None].bool(), 0.0
        )
        # create mean pooled embeddings
        doc_embeds = last_hidden.sum(dim=1) / \
            tokens["attention_mask"].sum(dim=1)[..., None]
    return doc_embeds.cpu().numpy()

In [2]:

print_gpu_summary()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

In [97]:
test_ds = arxiv_ds.select(np.random.choice(len(arxiv_ds), 10000))

In [98]:
from tqdm.auto import tqdm
import numpy as np

batch_size = 50
data = test_ds['abstract']

for i in tqdm(range(0, len(data), batch_size)):
    gc.collect()
    torch.cuda.empty_cache()
    i_end = min(len(data), i+batch_size)
    chunk_batch = data[i:i_end]
    # embed current batch
    embed_batch = embed(chunk_batch)
    # add to existing np array if exists (otherwise create)
    if i == 0:
        embeddings = embed_batch.copy()
    else:
        embeddings = np.concatenate([embeddings, embed_batch.copy()])

100%|██████████| 200/200 [09:32<00:00,  2.86s/it]


In [154]:
# make the ChromaDB

import chromadb
from chromadb.config import Settings

client = chromadb.Client(Settings(persist_directory='/home/smckean/db/arxiv'))
collection = client.create_collection(name='abstracts')

UniqueConstraintError: Collection abstracts already exists

In [101]:
ids = [item['id'] for item in test_ds]

In [100]:
collection.add(
    ids = []
)

Collection(name=abstracts)

In [109]:
def suffix_duplicates(lst):
    seen = {}
    result = []
    for item in lst:
        if item in seen:
            seen[item] += 1
            item = f"{item}{chr(ord('a') + seen[item] - 1)}"
        else:
            seen[item] = 1
        result.append(item)
    return result

['cond-mat/0106208',
 '2304.09665',
 '0704.1679',
 '1912.10243',
 '1006.3906',
 '1105.0743',
 '1211.0690',
 '1002.1992',
 '0801.2328',
 '2401.09608',
 '1909.13513',
 '1903.09657',
 '2206.05622',
 'hep-ph/0302193',
 '2002.10318',
 '2206.15276',
 '1106.0893',
 'astro-ph/0110288',
 '2312.08046',
 '2007.00437',
 '2012.13447',
 '2305.18634',
 '2005.01330',
 '1407.5078',
 '2101.02827',
 '1711.10711',
 '1806.08069',
 '2101.07894',
 '2009.03851',
 'quant-ph/0507076',
 '2011.08065',
 '2306.08558',
 '2007.02622',
 'math/0701870',
 'quant-ph/9705018',
 '2401.07624',
 '2402.15582',
 '2007.11187',
 '2002.10629',
 '2011.11418',
 'hep-ph/0404081',
 '2304.12864',
 '1206.6174',
 '1503.07021',
 '0801.4811',
 '2405.08923',
 '2401.16352',
 '2210.10655',
 '0904.0649',
 '1309.0567',
 '2211.11202',
 '1010.6217',
 '2310.07422',
 'astro-ph/0607296',
 '2002.02947',
 '1709.09398',
 '2109.01496',
 '1401.1485',
 '0708.2193',
 '2107.04612',
 '2007.11618',
 'nucl-th/9701036',
 '2402.08910',
 '1910.01387',
 '1808.060

In [119]:
collection.add(
    ids = suffix_duplicates(test_ds['id']),
    documents = test_ds['abstract'],
    embeddings = embeddings.tolist()
)

In [150]:
results = collection.query(
    query_embeddings= embed(["retrieval augmented generation technical writing"]).tolist(),
    n_results=3
)

In [151]:
loaded_db = chromadb.PersistentClient(path='/home/smckean/db/arxiv')

In [153]:
results = loaded_db.get_collection('abstract').query(
    query_embeddings= embed(["retrieval augmented generation technical writing"]).tolist(),
    n_results=3
)

ValueError: Collection abstract does not exist.

In [46]:
for i in tqdm(range(0, test_ds.shape[0], batch_size)):
    pass

100%|██████████| 20/20 [00:00<00:00, 275036.33it/s]


In [24]:
[x['abstract'] for x in arxiv_ds[0:100]]

TypeError: string indices must be integers

In [7]:
# take the arxiv dataset
# create a vector database with it
# find papers most relevant to search query
# return the reference and link
# boom RAG done!s

arxiv_ds[530]['abstract']

'  We discuss the implementation of electric-magnetic duality transformations in\nfour-dimensional gravity linearized around Minkowski or (A)dS4 backgrounds. In\nthe presence of a cosmological constant duality generically modifies the\nHamiltonian, nevertheless the bulk dynamics is unchanged. We pay particular\nattention to the boundary terms generated by the duality transformations and\ndiscuss their implications for holography.\n'

In [14]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

ModuleNotFoundError: No module named 'langchain_text_splitters'

In [20]:
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (

In [18]:
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")



In [2]:
# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are an expert in geothermal energy and always respond factually with as much technical detail as possible",
    },
    {"role": "user", "content": "How many closed loop geothermal projects are there in the world?"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

<|system|>
You are an expert in geothermal energy and always respond factually with as much technical detail as possible</s>
<|user|>
How many closed loop geothermal projects are there in the world?</s>
<|assistant|>
As of 2021, there are approximately 16 closed loop geothermal projects in the world. These projects consist of geothermal energy systems that use the heat from geothermal resources to power a community's heating and cooling needs. The geothermal energy systems are typically used to provide electricity, heating, and cooling to the community. Closed loop geothermal systems are known for their high efficiency, low emissions, and low maintenance requirements.


  table = cls._concat_blocks(blocks, axis=0)


In [6]:
geothermal_ds

KeyError: "Invalid key: 0. Please first select a split. For example: `my_dataset_dictionary['train'][0]`. Available splits: ['train']"