### Create embeddings, then save the result to CSV

In [1]:
import pandas as pd
from qdrant_client import QdrantClient, models
from tqdm import tqdm
from llama_index.embeddings.openai import OpenAIEmbedding
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the keys are read.
load_dotenv()

# API credentials for OpenAI and Qdrant; each is None when its variable is unset.
openai_api = os.environ.get("OPENAI_API_KEY")
qdrant_api = os.environ.get("QDRANT_API_KEY")

In [6]:
# Load the raw scholarships table and preview its first two rows.
scholarships_csv = "data/all_scholarships.csv"
df = pd.read_csv(scholarships_csv)
df.head(n=2)

Unnamed: 0,name,brief_description,value,eligibility,deadline,field_of_study,application_instructions,official_scholarship_website,id,level
0,Rhodes Scholarships at Oxford University for I...,The Rhodes Scholarships are postgraduate award...,A Rhodes Scholarship covers all University and...,Each applicant must fulfil the citizenship and...,July,Rhodes Scholars may study any full-time postgr...,Applications for the next round of Rhodes Scho...,https://www.rhodeshouse.ox.ac.uk/scholarships/...,rL9SVa8gdJY,Missing data
1,University of Sydney International Scholarship,The scholarship provides a yearly scholarship ...,"For 2023 awards, the scholarship is valued at ...",Must be a commencing or enrolled international...,13 September,Master’s by research or PhD Degree offered at ...,Commencing students should select the check bo...,https://sydney.edu.au/scholarships/e/universit...,sFhaLzf9exp,Missing data


In [7]:
# clean
# The description column is renamed to "text" because that is the column embedded later.
column_renames = {"brief_description": "text"}
df.rename(columns=column_renames, inplace=True)

# Replace missing deadlines with an explicit placeholder string.
df["deadline"] = df["deadline"].fillna("Missing data")

#### Embedding

In [8]:
# OpenAI embedding client shared by the batch-embedding helper below.
embedding = OpenAIEmbedding(
    api_key=openai_api,
    model="text-embedding-3-small",
    embed_batch_size=100,
)

In [5]:
# batch embedding function first
import time
def create_batch_embeddings(df, text_column, batch_size=100, max_retries=3, retry_delay=5):
    """
    Create embeddings for a text column of a pandas DataFrame, batch by batch.

    Embeddings are requested through the module-level ``embedding`` client
    (``embedding.get_text_embedding_batch``), so that object must exist before
    this function is called.

    Args:
    df (pandas.DataFrame): The input DataFrame. It is modified in place: an
        'embedding' column is added (or overwritten).
    text_column (str): The name of the column containing text to be embedded.
    batch_size (int): Number of texts to process in each API call. Default is 100.
    max_retries (int): Maximum number of attempts per batch. Default is 3.
    retry_delay (float): Seconds to wait between failed attempts. Default is 5.

    Returns:
    pandas.DataFrame: The same DataFrame object, now carrying the 'embedding' column.

    Raises:
    ValueError: If batch_size is not a positive integer.
    RuntimeError: If a batch still fails after max_retries attempts; the last
        underlying error is attached as the exception's cause.
    """
    if batch_size < 1:
        # A zero step breaks range() and a negative one silently yields no batches.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")

    def get_embeddings(texts):
        # Try the batch up to max_retries times, remembering the last failure.
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                return embedding.get_text_embedding_batch(texts)
            except Exception as e:
                last_error = e
                if attempt < max_retries:
                    print(f"An error occurred: {e}. Retrying...")
                    time.sleep(retry_delay)  # Back off before the next attempt
                else:
                    # No further attempt follows, so do not sleep or claim a retry.
                    print(f"An error occurred: {e}.")
        raise RuntimeError(
            f"Failed to get embeddings after {max_retries} attempts."
        ) from last_error

    texts = df[text_column]
    embeddings = []
    for start in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
        # .iloc keeps the slice positional regardless of the DataFrame's index.
        batch = texts.iloc[start:start + batch_size].tolist()
        embeddings.extend(get_embeddings(batch))

    df['embedding'] = embeddings
    return df

In [6]:
# Create embeddings
# Embed the "text" column using the helper's default batch size and retry settings.
df_with_embeddings = create_batch_embeddings(df, text_column="text")

Processing batches: 100%|██████████| 10/10 [00:11<00:00,  1.15s/it]


In [7]:
# Persist the frame (including the embedding column) so it can be reloaded later
# without calling the embedding API again.
embeddings_csv = "data/scholarships_embeddings.csv"
df_with_embeddings.to_csv(embeddings_csv)

#### Vector Store Init

In [4]:
# Qdrant endpoints. localhost_url is kept for running against a local instance;
# the client below connects to the cloud cluster.
localhost_url = "http://localhost:6333"
# No trailing comma here: a trailing comma would turn this string into a 1-tuple.
cloud_url = "https://354019c4-4335-4286-b11e-5a59db5a8be2.europe-west3-0.gcp.cloud.qdrant.io:6333"

client = QdrantClient(
    url=cloud_url,
    api_key=qdrant_api,
)

# create vector store
def create_collection(collection_name, vector_size=1536):
    """
    Create a Qdrant collection configured for fast bulk ingestion.

    The collection uses cosine distance with vectors stored on disk, binary
    quantization kept in RAM, and indexing disabled (threshold 0) until the
    data has been uploaded.

    Args:
    collection_name (str): Name of the collection to create.
    vector_size (int): Dimension of the stored vectors. Default is 1536, the
        size this notebook was written with.

    Returns:
    str: A confirmation message.
    """
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE,
            on_disk=True,
        ),
        quantization_config=models.BinaryQuantization(
            binary=models.BinaryQuantizationConfig(
                always_ram=True,
            ),
        ),
        optimizers_config=models.OptimizersConfigDiff(
            default_segment_number=5,
            indexing_threshold=0  # disable during ingestion to make it fast and re-enable when done
        )
    )
    return "Vector store created!"

# Name of the Qdrant collection used by every cell below.
collection_name = "funding_store"

# create
creation_status = create_collection(collection_name)
print(creation_status)

Vector store created!


#### Upsert to Vector Store

In [12]:
# prep data for upserting
# `ids` replaces the old name `id`, which shadowed the Python builtin.
# NOTE(review): the ids are collected but not passed to the upload call in this notebook — confirm that is intended.
ids = df_with_embeddings["id"].tolist()
vectors = df_with_embeddings["embedding"].tolist()

# Build the payload from a copy without the id/embedding columns, so the source
# frame is left untouched and this cell can be re-run safely.
payload = (
    df_with_embeddings
    .drop(columns=["id", "embedding"])
    .to_dict(orient="records")
)

In [None]:
# upsert data points
def upsert_data(collection_name, vectors, payload):
    """
    Upload vectors and their payloads to a Qdrant collection.

    Args:
    collection_name (str): Target collection.
    vectors (list): One embedding per record.
    payload (list[dict]): One metadata record per vector, in the same order.

    Returns:
    str: A confirmation message.
    """
    client.upload_collection(collection_name=collection_name, vectors=vectors, payload=payload)
    status = "Data Upserted!"
    return status


upsert_status = upsert_data(collection_name=collection_name, vectors=vectors, payload=payload)
print(upsert_status)

In [21]:
# update index with a threshold after upload. Remember at ingestion, I made indexing_threshold=0
# this will help make search faster
def update_index(collection, indexing_threshold=20000):
    """
    Re-enable indexing on a collection after bulk ingestion.

    Args:
    collection (str): Name of the collection to update.
    indexing_threshold (int): Value for the optimizer's indexing_threshold.
        Default is 20000.

    Returns:
    str: A confirmation message.
    """
    client.update_collection(
        collection_name=collection,
        # `optimizers_config` is the documented keyword; `optimizer_config` is the legacy spelling.
        optimizers_config=models.OptimizersConfigDiff(indexing_threshold=indexing_threshold),
    )
    return "index updated!"

print(update_index(collection_name))

index updated!


### Create and Upsert to Qdrant Cloud

In [18]:
# Reload the saved embeddings from disk.
df_with_embeddings = pd.read_csv("data/scholarships_embeddings.csv")

# Saving with the index produces "Unnamed: N" columns on reload. Drop whichever
# of them exist rather than hard-coding one name, which raises a KeyError when
# it is absent and misses variants such as "Unnamed: 0.1".
unnamed_columns = [
    column for column in df_with_embeddings.columns
    if str(column).startswith("Unnamed")
]
df_with_embeddings = df_with_embeddings.drop(columns=unnamed_columns)
df_with_embeddings.columns

Index(['name', 'text', 'value', 'eligibility', 'deadline', 'field_of_study',
       'application_instructions', 'official_scholarship_website', 'id',
       'level', 'embedding'],
      dtype='object')

In [19]:
# prep data for upserting
# `ids` replaces the old name `id`, which shadowed the Python builtin.
# NOTE(review): the ids are collected but not passed to the upload call in this notebook — confirm that is intended.
ids = df_with_embeddings["id"].tolist()
# After the CSV round trip each embedding is a string; it is parsed in the next cell.
vectors = df_with_embeddings["embedding"].tolist()

# Build the payload from a copy without the id/embedding columns, so the source
# frame is left untouched and this cell can be re-run safely.
payload = (
    df_with_embeddings
    .drop(columns=["id", "embedding"])
    .to_dict(orient="records")
)

In [20]:
# The CSV round trip stored every embedding as its string form, e.g.
# ['[1, 2, 3]', '[4, 5, 6]']; parse each one back into a real list so the
# vectors have a valid list type.
import ast

# ast.literal_eval only evaluates Python literals, so it is safe on these strings.
converted_vectors = list(map(ast.literal_eval, vectors))
converted_vectors[:2]

[[-0.0007102790987119079,
  -0.009759560227394104,
  0.030189571902155876,
  0.050532832741737366,
  0.0062840720638632774,
  0.03192460536956787,
  -0.02073364332318306,
  0.0244422759860754,
  -0.03136071935296059,
  0.0005133934901095927,
  0.018770886585116386,
  -0.043744515627622604,
  0.017404548823833466,
  -0.011277713812887669,
  0.04528435692191124,
  0.0021579470485448837,
  -0.03066670522093773,
  -0.021687909960746765,
  -0.024333836510777473,
  0.09377852827310562,
  0.03652244061231613,
  -0.00432673841714859,
  0.02073364332318306,
  -0.00017002983076963574,
  0.08440934866666794,
  -0.032835498452186584,
  0.025418231263756752,
  -0.005020751152187586,
  -0.00032853794982656837,
  0.005015329457819462,
  0.020083004608750343,
  -0.0070051951333880424,
  0.00923904962837696,
  0.007102790754288435,
  -0.050532832741737366,
  0.009136032313108444,
  0.0335078239440918,
  0.003968887496739626,
  -0.017100917175412178,
  0.030926961451768875,
  -0.02249036356806755,
  -0.

In [21]:
# upsert data points
def upsert_data(collection_name, vectors, payload):
    """
    Upload vectors and their payloads to a Qdrant collection.

    Args:
    collection_name (str): Target collection.
    vectors (list[list[float]]): One embedding per record, already parsed
        into lists of numbers (not their string form).
    payload (list[dict]): One metadata record per vector, in the same order.

    Returns:
    str: A confirmation message.
    """
    client.upload_collection(
        collection_name=collection_name,
        # Use the argument, not a notebook global, so the function uploads what it is given.
        vectors=vectors,
        payload=payload,
    )
    return "Data Upserted!"

# Pass the parsed vectors; `vectors` still holds the raw strings read from the CSV.
print(upsert_data(collection_name, converted_vectors, payload))

Data Upserted!


In [22]:
# update index with a threshold after upload. Remember at ingestion, I made indexing_threshold=0
# this will help make search faster
def update_index(collection, indexing_threshold=20000):
    """
    Re-enable indexing on a collection after bulk ingestion.

    Args:
    collection (str): Name of the collection to update.
    indexing_threshold (int): Value for the optimizer's indexing_threshold.
        Default is 20000.

    Returns:
    str: A confirmation message.
    """
    client.update_collection(
        collection_name=collection,
        # `optimizers_config` is the documented keyword; `optimizer_config` is the legacy spelling.
        optimizers_config=models.OptimizersConfigDiff(indexing_threshold=indexing_threshold),
    )
    return "index updated!"

print(update_index(collection_name))

index updated!
