In [1]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Initialize the embedding model used by the rest of the notebook.
embed_model = HuggingFaceEmbedding(
    model_name='BAAI/bge-small-en',
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index.llms.ollama import Ollama

# Initialize the LLM served through Ollama.
# request_timeout is presumably in seconds -- confirm against the Ollama integration docs.
llm = Ollama(
    model="llama2",
    request_timeout=300,
)

In [3]:
import psycopg2

import os

from psycopg2 import sql

# Parameters used for the database connections. Each one can be overridden
# through an environment variable so real credentials never need to be saved
# in the notebook; the fallbacks are local-development values only.
db_name = os.getenv("PGVECTOR_DB_NAME", 'vector_store')
host = os.getenv("PGHOST", 'localhost')
user = os.getenv("PGUSER", 'vineet')
password = os.getenv("PGPASSWORD", 'password')
port = os.getenv("PGPORT", '5432')

# Connect to the existing database named 'postgres'; the target database is
# dropped and re-created below, so it cannot be the one we are connected to.
db = psycopg2.connect(
    host=host,
    database="postgres",
    user=user,
    password=password,
    port=port
)

try:
    # autocommit = True, because CREATE DATABASE and DROP DATABASE
    # cannot be executed inside a transaction.
    db.autocommit = True

    # Remove any old database with the same name, then create a fresh one.
    # sql.Identifier quotes the name safely; identifiers cannot be passed as
    # ordinary query parameters and must not be spliced in with an f-string.
    with db.cursor() as c:
        c.execute(
            sql.SQL("DROP DATABASE IF EXISTS {}").format(sql.Identifier(db_name))
        )
        c.execute(
            sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name))
        )
finally:
    # This admin connection is only needed for the DROP/CREATE above.
    db.close()

OperationalError: connection to server at "localhost" (::1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?


In [None]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Create the vector store on top of our normal database using PGVectorStore.
vector_store = PGVectorStore.from_params(
    host=host,
    port=port,
    user=user,
    password=password,
    database=db_name,
    table_name="research paper",
    # Presumably has to match the vector length produced by embed_model
    # (384 for 'BAAI/bge-small-en'?) -- TODO confirm against the model card.
    embed_dim=384,
)

In [None]:
from llama_index.readers.file import PyMuPDFReader

# Load the PDF into the notebook environment.
pdf_path = "./data/CLIP.pdf"
loader = PyMuPDFReader()
documents = loader.load(file_path=pdf_path)


In [None]:
from llama_index.core.node_parser import SentenceSplitter
from transformers import AutoTokenizer

# Split every loaded document into smaller text chunks.

splitter = SentenceSplitter(chunk_size=1024)

chunks = []     # chunk texts, in document order
chunk_idx = []  # chunk_idx[i] is the position in `documents` that chunks[i] came from

# The loop variable is deliberately not named `id`: at notebook top level that
# would overwrite the builtin id() for every cell executed afterwards.
for doc_idx, doc in enumerate(documents):
    cur_chunks = splitter.split_text(doc.text)
    chunks.extend(cur_chunks)
    chunk_idx.extend([doc_idx] * len(cur_chunks))

In [None]:
from llama_index.core.schema import TextNode

# Wrap each chunk in a TextNode that carries its source document's metadata.

nodes = []

# chunks and chunk_idx are parallel lists, so walk them together.
for chunk, doc_pos in zip(chunks, chunk_idx):
    node = TextNode(text=chunk)
    src_doc = documents[doc_pos]
    # Copy the dict: assigning src_doc.metadata directly would make every node
    # from the same document (and the document itself) share one mutable dict.
    node.metadata = dict(src_doc.metadata)
    nodes.append(node)

In [None]:
# Compute an embedding for every node and store it on the node.

for node in nodes:
    # Note the quotes: metadata_mode takes the mode value "all" (the value of
    # MetadataMode.ALL). Unquoted, `all` is Python's builtin function.
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

In [None]:
# Inspect the metadata attached to one node (index 6 looks like an arbitrary pick).
sample_node = nodes[6]
print(sample_node.metadata)

{'total_pages': 48, 'file_path': './data/CLIP.pdf', 'source': '4'}


In [None]:
# Insert every node into the vector database. The value returned by add() is
# left as the cell's last expression so the notebook displays it.
inserted_ids = vector_store.add(nodes)
inserted_ids

['3238681a-c33a-40a7-8c7e-185583610c93',
 '8831820e-0ceb-4b13-be22-780d2573509a',
 '2aa821b4-4552-4fee-952c-cb03977d6c06',
 '0d1b5714-21a9-464c-b7df-9017f4d7d57f',
 '769851e3-5e46-49a3-8144-ebd05157b348',
 'f1d9f14a-a9c1-46e5-b8ea-e56484934870',
 '4b075cf3-9913-4176-a282-4f6757777c01',
 'e95191f3-6b82-45af-a72b-d7f2057ca47a',
 'cebc0f8c-3d6f-418b-a5f7-bb2cb830b2b2',
 'aa3c02f0-7dc3-4a2a-b383-0f202d522bb2',
 '70570ca4-5ab4-4680-9775-4e1443731f3c',
 '1d1907de-9419-4859-b41d-ab0a6016a3e8',
 '840543a2-30e1-4880-982b-46f04a3f7cc6',
 '34a2ed76-5ae1-4652-81e0-d4752c267623',
 '5d0b181d-d640-4ec0-8b3a-18de2c212c4b',
 '91168979-6dca-466f-ae09-65958970fb8b',
 '308684e3-0228-40e2-bc2a-a5953a9f5cd9',
 'ddb91a14-9455-4e29-8953-75296f6917fe',
 '2e5d4cd3-392b-4a08-8a6c-7b5e5db2f6b5',
 '8f62395a-a3c0-47e4-9689-e97b6ac8e8fe',
 'dc97a4d5-42a9-45c5-8b45-172d3a4e8df8',
 '935d8e3f-c6b6-4fe5-a4ad-553f9e42ba37',
 '49e06095-3476-42d9-8828-101991706240',
 '204166af-c580-4053-9cc9-e5a510cce8fd',
 '4ddb1448-b054-

In [None]:
# The question to ask, and its embedding for the vector search.
query_str = "do we use image embedding in CLIP?"
query_embedding = embed_model.get_query_embedding(
    query_str
)

In [None]:
from llama_index.core.vector_stores import VectorStoreQuery

# Build the query object: the vector to match, how many hits to return, and the query mode.
query_obj = VectorStoreQuery(
    mode="default",
    similarity_top_k=2,
    query_embedding=query_embedding,
)

In [None]:
# Run the query object against the vector store.
query_result = vector_store.query(query=query_obj)

In [None]:
# Show the text of the first node returned by the query.
first_node = query_result.nodes[0]
print(first_node.get_text())

Learning Transferable Visual Models From Natural Language Supervision
20
CLIP also does not address the poor data efﬁciency of deep
learning. Instead CLIP compensates by using a source of
supervision that can be scaled to hundreds of millions of
training examples. If every image seen during training of
a CLIP model was presented at a rate of one per second,
it would take 405 years to iterate through the 12.8 billion
images seen over 32 training epochs. Combining CLIP
with self-supervision (Henaff, 2020; Chen et al., 2020c) and
self-training (Lee; Xie et al., 2020) methods is a promising
direction given their demonstrated ability to improve data
efﬁciency over standard supervised learning.
Our methodology has several signiﬁcant limitations. De-
spite our focus on zero-shot transfer, we repeatedly queried
performance on full validation sets to guide the develop-
ment of CLIP. These validation sets often have thousands
of examples, which is unrealistic for true zero-shot sce-
narios. Simi

In [None]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

# Pair each retrieved node with its similarity score in a single list.
# The score stays None when the result carries no similarities.

nodes_with_scores = []
similarities = query_result.similarities

for index, node in enumerate(query_result.nodes):
    score: Optional[float] = None if similarities is None else similarities[index]
    nodes_with_scores.append(NodeWithScore(node=node, score=score))


In [None]:
# Look at the similarity value stored for the first returned node.
index = 0
first_similarity = query_result.similarities[index]
first_similarity

0.8907523611029847

In [None]:
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List
from llama_index.core import QueryBundle

# building a custom retriever by inheriting from the BaseRetriever class

class Retriever(BaseRetriever):
    """Retriever that embeds the query text and searches a PGVectorStore.

    Args:
        vector_store: Vector store that is queried for similar nodes.
        embed_model: Object exposing ``get_query_embedding(str)``; used to
            embed the incoming query string.
        query_mode: Value passed as ``mode`` to ``VectorStoreQuery``.
        similarity_top_k: Value passed as ``similarity_top_k`` to
            ``VectorStoreQuery``.
    """

    def __init__(self,
                 vector_store: PGVectorStore,
                 embed_model: Any,
                 query_mode: str = "default",
                 similarity_top_k: int = 2) -> None:
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Embed the query, search the vector store, and return scored nodes.

        Args:
            query_bundle: Bundle whose ``query_str`` is embedded and searched.

        Returns:
            One ``NodeWithScore`` per node in the store's result, in the order
            the store returned them. ``score`` is ``None`` when the result
            carries no similarities; the list is empty when it has no nodes.
        """
        # Use the instances handed to __init__, not whatever globals named
        # `embed_model` / `vector_store` happen to exist in the notebook.
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_query_obj = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode
        )
        query_result = self._vector_store.query(vector_query_obj)

        similarities = query_result.similarities
        nodes_with_scores = []
        for idx, node in enumerate(query_result.nodes or []):
            # Always bind `score`, so a result without similarities yields
            # None instead of raising UnboundLocalError.
            score = similarities[idx] if similarities is not None else None
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [None]:
# Instantiate the custom retriever on top of the vector store and embedding model.
retriever = Retriever(
    vector_store=vector_store,
    embed_model=embed_model,
    query_mode="default",
    similarity_top_k=2,
)

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Combine the custom retriever and the LLM into a query engine.
query_engine = RetrieverQueryEngine.from_args(
    llm=llm,
    retriever=retriever,
)


In [None]:
# Send the question through the query engine and print the answer.
query_str = "do we use image embedding in CLIP?"
query_response = query_engine.query(query_str)

# print() already applies str() to its argument.
print(query_response)

No, CLIP does not directly use image embeddings. Instead, it relies on text-image pairs as supervision to train a visual model that can carry out arbitrary image classification tasks. The text-image pairs are unfiltered and uncurated, which can result in the model learning social biases present in the training data. To address this limitation, CLIP falls back to fitting linear classifiers on top of its features when transitioning from a zero-shot to a few-shot setting, which results in a drop in performance. Future work is needed to develop methods that combine CLIP's strong zero-shot performance with efficient few-shot learning.
