In [1]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from datasets import load_dataset

# Load environment variables so later cells can read the API keys and the
# Pinecone cloud/region settings through os.getenv.
load_dotenv()

True

## Data load

In [2]:
# Load the training split of the PubMedQA dataset, then convert it to a
# DataFrame. A separate name is used for the raw dataset object so that
# `data` only ever refers to the DataFrame.
pubmed_train = load_dataset("llamafactory/PubMedQA", split='train')
data = pubmed_train.to_pandas()
data.head()

Unnamed: 0,instruction,input,output
0,Answer the question based on the following con...,Question: Is naturopathy as effective as conve...,Naturopathy appears to be an effective alterna...
1,Answer the question based on the following con...,Question: Can randomised trials rely on existi...,Routine data have the potential to support hea...
2,Answer the question based on the following con...,Question: Is laparoscopic radical prostatectom...,The results of our non-randomized study show t...
3,Answer the question based on the following con...,Question: Does bacterial gastroenteritis predi...,Symptoms consistent with IBS and functional di...
4,Answer the question based on the following con...,Question: Is early colonoscopy after admission...,No significant association is apparent between...


In [3]:
MAX_ROWS = 1000    # number of leading rows to index
OUTPUT = "output"  # name of the column whose text is embedded

# Keep the first MAX_ROWS rows and extract the selected column as a plain list.
subset_data = data.iloc[:MAX_ROWS]
chunks = subset_data[OUTPUT].tolist()

## Pinecone Index

In [4]:
from pinecone import Pinecone, ServerlessSpec
# Build the Pinecone client from the API key found in the environment.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

In [5]:
# Create the "pubmed" index only when it does not exist yet. An unconditional
# create_index call fails on an already-existing index, which would break
# re-running the notebook from a fresh kernel.
if "pubmed" not in pc.list_indexes().names():
    pc.create_index(
        name="pubmed",
        # Must equal the vector size produced by the embedding model used
        # below (text-embedding-3-small).
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=os.getenv("PINECONE_CLOUD"),
            region=os.getenv("PINECONE_REGION")
        )
    )

In [6]:
# Configure the OpenAI client (used for embeddings) with the key from the
# environment.
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

# Handle to the "pubmed" Pinecone index; the upsert cell writes through it.
index = pc.Index("pubmed")

def embed(docs: list[str]) -> list[list[float]]:
    """Embed texts with OpenAI's ``text-embedding-3-small`` model.

    Args:
        docs: Texts to embed. A list is expected, but any iterable of strings
            (e.g. a generator) is accepted and materialized first. A single
            string is treated as one document.

    Returns:
        One embedding vector per item of the API response, in the order the
        response lists them (assumed to match the input order -- TODO confirm).
        An empty input returns an empty list without sending a request.
    """
    # Normalize the input to a concrete list so the request body is always a
    # plain, serializable sequence. A bare string must not be split into
    # characters, so it is wrapped instead.
    if isinstance(docs, str):
        docs = [docs]
    else:
        docs = list(docs)

    # Nothing to embed: skip the network round-trip entirely.
    if not docs:
        return []

    res = openai.embeddings.create(
        input=docs,
        model="text-embedding-3-small"
    )
    return [item.embedding for item in res.data]

In [7]:
# upsert data to index
from tqdm.auto import tqdm
batch_size = 100  # texts per embedding request and vectors per upsert call

for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i + batch_size)
    # Slice once and reuse the same batch for metadata and embeddings.
    batch = chunks[i:i_end]
    # Vector ids are the row positions rendered as strings.
    ids = [str(x) for x in range(i, i_end)]
    # Store the source text next to each vector.
    metadatas = [{'text': chunk} for chunk in batch]
    # Pass a real list: embed() is declared to take list[str], not a generator.
    embeds = embed(batch)
    # zip() silently truncates to the shortest input, so fail loudly instead
    # of upserting a partial batch if the embedding count is off.
    assert len(embeds) == len(batch), "embedding count does not match batch size"
    records = list(zip(ids, embeds, metadatas))
    index.upsert(vectors=records)
    

  0%|          | 0/10 [00:00<?, ?it/s]

## Migrate from Pinecone to Qdrant

In [8]:
# Load the `dotenv` IPython extension.
# NOTE(review): this only loads the extension; the shell command in the next
# cell presumably relies on the variables already loaded by load_dotenv() in
# the first cell -- confirm.
%load_ext dotenv

In [9]:
# Export the "pubmed" index from Pinecone with the export_vdf CLI.
# PINECONE_CLOUD / PINECONE_REGION are the same variable names read with
# os.getenv when the index was created; presumably they are expanded from the
# environment here -- confirm. The empty --namespace value selects namespace ''.
!export_vdf pinecone --serverless --cloud $PINECONE_CLOUD --region $PINECONE_REGION --index pubmed --namespace ""

[0m[0mExporting index 'pubmed'[0m                                            [0m[0m
Exporting pubmed:   0%|                                   | 0/1 [00:00<?, ?it/s][0m[0m
[0m[0m                                                                        [0m[A[0m[0m
[0m[A[0mIterating namespace ''[0m                                           [0m
Exporting pubmed:   0%|                                   | 0/1 [00:00<?, ?it/s][0m
[0m[0m                                                                        [0m[A[0m[0m[?25l[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m[0m
[0m[A[0mCollected 1000 IDs using list_points with implicit pagination.[0m   [0m
Exporting pubmed:   0%|                                   | 0/1 [00:02<?, ?it/s][0m
Fetching namespaces:   0%|                                | 0/1 [00:01<?, ?it/s][0m[A[0m[0m

[0m[0m                                                                        [0m[A[A

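### Import data to Qdrant

Run the import from a terminal rather than from a notebook cell: the command prompts for the directory of the vector dataset to import.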

```shell
$ import_vdf qdrant -u $QDRANT_HOST

Enter the directory of vector dataset to be imported: vdf_20240509_145419_88ae5
ImportVDB initialized successfully.
Importing data for index 'pubmed'
/Users/infoslack/Projects/vector-migration/vdf_20240509_145419_88ae5/pubmed/i1.parquet/1.parquet read successfully. len(df)=1000 rows
Extracting vectors: 100%|█████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 6349.32it/s]
Metadata was parsed to JSON
Uploading points in batches of 64 in 5 threads: 100%|██████████████████████████████████████| 1000/1000 [00:03<00:00, 280.44it/s]
Iterating parquet files: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.14s/it]
Index 'pubmed' has 1000 vectors after import
1000 vectors were imported
Importing namespaces: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.55s/it]
Importing indexes: 100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.55s/it]
Data import completed successfully.
Time taken: 5.62 seconds
```