#### Import Necessary Packages

In [1]:
import os
import time
import pandas as pd
import numpy as np
import openai
from dotenv import load_dotenv
from openai.error import RateLimitError

#### Configure Azure OpenAI (adjust these values in your .env)

In [2]:
# Load key/value pairs from a local .env file (if one exists) into the process
# environment so credentials do not have to be written into this file.
load_dotenv()

# Each setting is read from the environment; the placeholder after the comma is
# used only when the corresponding variable is not set.
openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT", "https://your-resource-name.openai.azure.com/")
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2022-12-01")
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY", "your_api_key")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "your_embedding_deployment")

#### Create Functions for Embedding Data


In [3]:
def get_embeddings_batch(texts, deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT, max_retries=5):
    """
    Get embeddings for a batch of texts with retry mechanism on rate limit errors.

    Args:
        texts: The texts to embed in a single request.
        deployment: Name of the Azure embedding deployment, passed as `engine`.
        max_retries: Number of retries allowed after a rate limit error before
            the error is propagated to the caller.

    Returns:
        A list of float32 NumPy arrays, one per item in the response's 'data'.
        An empty batch returns an empty list without issuing a request.

    Raises:
        RateLimitError: If the request is still rate limited after
            `max_retries` retries.
    """
    # Nothing to embed: skip the request entirely. len() is used instead of
    # truthiness so array-like inputs are handled as well as lists.
    if len(texts) == 0:
        return []

    retries = 0
    while True:
        try:
            response = openai.Embedding.create(
                input=texts,
                engine=deployment  # Use your Azure embedding deployment name
            )
            return [np.array(item['embedding'], dtype=np.float32) for item in response['data']]
        except RateLimitError:
            retries += 1
            if retries > max_retries:
                # Bare raise keeps the original traceback intact.
                raise
            # Exponential backoff: 3s, 6s, 12s, ...
            delay = 3 * (2 ** (retries - 1))
            print(f"Rate limit error encountered (batch), retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
            time.sleep(delay)

In [4]:
def combine_data(row, columns):
    """
    Build one text string from the requested columns of a row.

    Every column found in `row` contributes a "name: value" fragment; columns
    that are not in `row` are skipped. Fragments are joined with " | ".
    """
    fragments = []
    for name in columns:
        if name not in row:
            continue
        fragments.append(f"{name}: {row[name]}")
    return " | ".join(fragments)

#### Pass the Web-Scraped CSV Data to the Embedding Functions

In [5]:
def vectorize_dataset(csv_path='master_agent_data.csv',
                      output_parquet='master_agent_data_with_embeddings.parquet',
                      batch_size=50):
    """
    Compute embeddings for every row of a CSV dataset and return the DataFrame.

    Args:
        csv_path: Path of the CSV file to load.
        output_parquet: Not used by this function; accepted so existing calls
            that pass it keep working. Nothing is written to disk here.
        batch_size: Number of row texts sent per embedding request. Must be
            positive.

    Returns:
        The loaded DataFrame with an added 'embedding' column that holds each
        row's embedding as a plain list.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    # Fail early with a clear message instead of an obscure error from range()
    # or from the column assignment at the end.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load the CSV data
    df = pd.read_csv(csv_path)

    # Define which columns to combine (adjust as needed)
    text_columns = [
        "TEAM", "CONF", "DIVISION", "GP", "PPG", "oPPG", "pDIFF", "PACE", "oEFF", "dEFF", "W", "L", "WIN%", "SEASON"
    ]

    # Create a list of combined text for each row
    texts = [combine_data(row, text_columns) for _, row in df.iterrows()]

    all_embeddings = []
    total = len(texts)
    print(f"Total rows to process: {total}")

    # Process texts in batches to avoid rate limits
    for i in range(0, total, batch_size):
        batch_texts = texts[i: i + batch_size]
        print(f"Processing batch {i} to {i + len(batch_texts)} out of {total}")
        all_embeddings.extend(get_embeddings_batch(batch_texts))

    # Store the computed embeddings as a new column in the DataFrame.
    # Convert each NumPy array to a list so it can be stored in Parquet.
    df['embedding'] = [emb.tolist() for emb in all_embeddings]

    print("Returning vectorized dataset with embeddings.... ")
    return df
    
# Run the embedding pipeline with its default arguments when this file is
# executed as the main program; the result is kept in the module-level
# name `outputdf`.
if __name__ == "__main__":
    outputdf = vectorize_dataset()

Total rows to process: 210
Processing batch 0 to 50 out of 210
Processing batch 50 to 100 out of 210
Processing batch 100 to 150 out of 210
Processing batch 150 to 200 out of 210
Processing batch 200 to 210 out of 210
Returning vectorized dataset with embeddings.... 


#### Save Vectorized Data DF as Parquet File

In [6]:
# Write the DataFrame to a Parquet file, leaving the index out of the output.
# NOTE(review): `outputdf` is only assigned under the `__main__` guard, so this
# line depends on that guard having run first.
outputdf.to_parquet("master_agent_data_with_embeddings.parquet", index=False)