## Extracting features and storing them in a vector store

In this notebook, I pass article texts to the FlagEmbedding model from HuggingFace, and then store each embbeded node in a Postgres vector store database.

In [1]:
import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from dotenv import load_dotenv, find_dotenv
import os
import psycopg2
from llama_index.vector_stores.postgres import PGVectorStore
from pathlib import Path
from csv_reader import CSVReader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
csv_path =  Path("C:/Users/Ugne/Documents/studies/Python/DL-task1/deep-learning-task-2/lrt_article_texts.csv")s

In [3]:
#!set "CMAKE_ARGS=-DLLAMA_OPENBLAS=on"
#!set "FORCE_CMAKE=1"
#!pip install llama-cpp-python --no-cache-dir

## Loading Llama CPP and sentence transformers

In [10]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

## Initialising the vector store

In [5]:
load_dotenv(find_dotenv())
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [7]:
db_name = "vectors_db"
host = "localhost"
password = os.getenv("POSTGRES_PW")
port = "5432"
user = "postgres"

conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="llama2_paper",
    embed_dim=384,  # openai embedding dimension
)

In [10]:
# concat_rows=False results in a separate Document being created for every row
loader = CSVReader(concat_rows=False, encoding="utf-8")
documents = loader.load_data(file=csv_path, extra_info={"source_url":""})

In [11]:
text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

text_chunks = []
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [12]:
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    node.metadata = src_doc.metadata
    nodes.append(node)

In [17]:
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

Batches: 100%|██████████| 1/1 [00:43<00:00, 43.89s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.41it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 62.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.33it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 29.92it/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.34it/s]
Batches: 1

In [20]:
vector_store.add(nodes)

['8d49b6fc-db75-419e-b9b9-68da56c9f20b',
 '47ddf539-4206-4c87-bdfe-cc7b02cb1fbe',
 'a0992d5c-424a-4a3d-89ff-d837b5c69fd8',
 '43b8a11a-5930-47e2-b4bb-90d9b5bfeb6e',
 'f63ae819-a340-47c9-9d82-877f94e86fd2',
 '47ce645b-3a3c-476d-a21c-0b74742e37a6',
 '6ee804d2-10fb-4d5f-97d7-c195f2c077ba',
 'fe9457a7-c59c-456c-a8f9-3ba293bb3b31',
 '25640385-1664-4f0d-bb5d-25cf63a0e39b',
 '38b843ac-6572-4ef4-893b-6626fe52b11b',
 'd1bd7dba-ce31-44a0-807e-c9df41611f17',
 'cc764fec-dbd8-4376-bba1-612930210cd9',
 '3a8cc661-a734-4b95-8d02-100d322d20bb',
 'eff433fd-6fab-4c4f-a74d-935df878d4a9',
 '4fec9443-52c3-4295-881f-60d34f0542cd',
 '474b7dbf-ef35-46c7-9883-c414f064b922',
 'c65b6c5a-a22c-4e60-8c96-e3c1e1ddc67f',
 'db90ba78-599e-427d-ae1a-f434a04170f7',
 '8c6fdf4e-716e-4eda-9c23-9794ad9e7304',
 'b2df5afd-3906-4681-b489-dcb80ffa3fee',
 '276e45c0-c8f2-42e5-aeb1-414c5a99ec21',
 '2894fd03-27c2-4b8f-b5e5-8ff8ce868d6e',
 '1d840508-a228-46ee-8f75-f7cadee05175',
 '8e5815a4-a6ff-48c3-896f-e13c66cd93c7',
 '254b0b54-1834-