In [1]:
import csv
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models, conversions
from tqdm.notebook import tqdm

In [2]:
client = QdrantClient(path="vec.db")

In [8]:
reader = csv.DictReader(open("nse.csv"))

In [9]:
print(reader.fieldnames)

['SYMBOL', 'NAME OF COMPANY', 'SERIES', 'DATE OF LISTING', 'PAID UP VALUE', 'MARKET LOT', 'ISIN NUMBER', 'FACE VALUE']


In [10]:
# ndq = pd.DataFrame(reader).to_dict(orient="list")
nse = list(reader)

In [11]:
len(nse)

1972

In [12]:
nse[0]

{'SYMBOL': '20MICRONS',
 'NAME OF COMPANY': '20 Microns Limited',
 'SERIES': 'EQ',
 'DATE OF LISTING': '06-OCT-2008',
 'PAID UP VALUE': '5',
 'MARKET LOT': '1',
 'ISIN NUMBER': 'INE144J01027',
 'FACE VALUE': '5'}

In [13]:
encoder = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")



In [14]:
client.recreate_collection(
    collection_name="nse",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=models.Distance.COSINE,
    ),
)

  client.recreate_collection(


True

In [15]:
client.upload_points(
    collection_name="nse",
    points=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(f'''{doc["SYMBOL"]}: {doc["NAME OF COMPANY"]}''').tolist(), payload=doc
        )
        for idx, doc in tqdm(enumerate(nse), total=len(nse))
    ],
    parallel=4,
)

  0%|          | 0/1972 [00:00<?, ?it/s]

In [30]:
points = client.search(
    collection_name="nse",
    query_vector=encoder.encode("SBI Bank").tolist(),
    # Symbol should not have ^ in it
    # query_filter=models.Filter(
    #     must_not=[
    #         models.FieldCondition(key="Symbol", match=models.MatchValue(value="^")),
    #     ]
    # ),
    limit=3,
)

points

[ScoredPoint(id=1537, version=0, score=0.812858942231983, payload={'SYMBOL': 'SBIN', 'NAME OF COMPANY': 'State Bank of India', 'SERIES': 'EQ', 'DATE OF LISTING': '01-MAR-1995', 'PAID UP VALUE': '1', 'MARKET LOT': '1', 'ISIN NUMBER': 'INE062A01020', 'FACE VALUE': '1'}, vector=None, shard_key=None),
 ScoredPoint(id=1535, version=0, score=0.6722810397755001, payload={'SYMBOL': 'SBICARD', 'NAME OF COMPANY': 'SBI Cards and Payment Services Limited', 'SERIES': 'EQ', 'DATE OF LISTING': '16-MAR-2020', 'PAID UP VALUE': '10', 'MARKET LOT': '1', 'ISIN NUMBER': 'INE018E01016', 'FACE VALUE': '10'}, vector=None, shard_key=None),
 ScoredPoint(id=221, version=0, score=0.6668894581716207, payload={'SYMBOL': 'BANKINDIA', 'NAME OF COMPANY': 'Bank of India', 'SERIES': 'EQ', 'DATE OF LISTING': '30-APR-1997', 'PAID UP VALUE': '10', 'MARKET LOT': '1', 'ISIN NUMBER': 'INE084A01016', 'FACE VALUE': '10'}, vector=None, shard_key=None)]