In [1]:
import pandas as pd
import openai
from tqdm import tqdm
import os

In [7]:
# Take the OpenAI API key from the environment so it is never written into the notebook.
# If OPENAI_API_KEY is unset this assigns None.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [6]:
# Load the ISBSG dataset workbook into a DataFrame (path is relative to the notebook's working directory)
df = pd.read_excel(io='assets/isbsg_dataset.xlsx')

In [7]:
# Columns removed from the dataset before any further processing
columns_to_drop = [
    "Data Quality Rating", "UFP rating", "Count Approach", "Relative Size",
    "Value Adjustment Factor", "Normalised Work Effort Level 1", "Summary Work Effort",
    "Normalised PDR (ufp)", "Pre 2002 PDR", "Total project cost", "Cost currency"
]

df = df.drop(columns=columns_to_drop)

# Rows whose "Development Methodologies" mentions waterfall (case-insensitive) get
# "Sprints / iterations" set to 1. Done as a vectorized mask instead of a row-wise
# apply; astype(str) mirrors the original str(...) conversion and na=False keeps
# missing values from matching.
is_waterfall = (
    df['Development Methodologies']
    .astype(str)
    .str.lower()
    .str.contains('waterfall', regex=False, na=False)
)
df.loc[is_waterfall, 'Sprints / iterations'] = 1

# Identify numerical and categorical columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

# Fill missing numerical values with the column mean
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# Fill missing categorical values with the column mode (first mode when there are ties).
# mode() is empty when there are no categorical columns or no non-null values,
# in which case .iloc[0] would raise IndexError, so skip the fill.
categorical_modes = df[categorical_columns].mode()
if not categorical_modes.empty:
    df[categorical_columns] = df[categorical_columns].fillna(categorical_modes.iloc[0])

In [8]:
from helper_functions.project_description import create_project_description

# Call create_project_description once per row (axis='columns' is the same as axis=1)
# and store the result in a new 'project_description' column.
df['project_description'] = df.apply(func=create_project_description, axis='columns')

In [37]:
import time

def get_embeddings(text_list, model="text-embedding-3-large", batch_size=20,
                   max_retries=3, pause=1.0):
    """Generate embeddings for a list of texts, one API request per batch.

    Args:
        text_list: Sequence of input texts.
        model: Name of the embedding model passed to the API.
        batch_size: Number of texts sent per request; must be >= 1.
        max_retries: Attempts made per batch before giving up on it
            (values below 1 are treated as 1).
        pause: Seconds to wait between batches; also the base delay of the
            exponential backoff between retries of a failed batch.

    Returns:
        A list with one entry per input text, in input order. Each entry is
        the ``embedding`` value returned by the API, or ``None`` for every
        text of a batch that still failed after all retries.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError("batch_size must be a positive integer")
    total = len(text_list)
    if total == 0:
        return []

    attempts = max(1, max_retries)
    embeddings = []

    for i in tqdm(range(0, total, batch_size), desc="Generating embeddings"):
        batch = text_list[i : i + batch_size]
        batch_embeddings = None

        for attempt in range(1, attempts + 1):
            try:
                response = openai.embeddings.create(input=batch, model=model)
                batch_embeddings = [item.embedding for item in response.data]
                break
            except Exception as e:
                print(f"Error generating embeddings: {e}")
                if attempt < attempts:
                    # Back off before retrying: pause, 2*pause, 4*pause, ...
                    time.sleep(pause * 2 ** (attempt - 1))

        if batch_embeddings is None:
            # Best effort: keep positions aligned with the input instead of aborting
            batch_embeddings = [None] * len(batch)

        embeddings.extend(batch_embeddings)
        if i + batch_size < total:
            time.sleep(pause)  # Small delay to prevent rate limiting

    return embeddings

# Embed every project description (sent to the API in batches by get_embeddings)
df['embedding'] = get_embeddings(df['project_description'].tolist())

# get_embeddings leaves None for batches that failed; report them here so they
# are not persisted unnoticed.
missing_embeddings = int(df['embedding'].isna().sum())
if missing_embeddings:
    print(f"Warning: {missing_embeddings} of {len(df)} rows have no embedding")

# Save the DataFrame, including the embeddings, so the API calls need not be repeated
df.to_pickle('isbsg_with_embeddings.pkl')


Generating embeddings: 100%|██████████| 339/339 [17:29<00:00,  3.10s/it]


In [3]:
import pandas as pd

# Restore the DataFrame (with its 'embedding' column) from the pickle written earlier.
# Only unpickle files you produced yourself: loading a pickle can execute arbitrary code.
df = pd.read_pickle(filepath_or_buffer='isbsg_with_embeddings.pkl')

In [1]:
import chromadb

# Open the on-disk ChromaDB store kept under ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Fetch the "isbsg_projects" collection, creating it on first use.
# No OpenAI embedding function is attached because the vectors are already computed.
collection = chroma_client.get_or_create_collection("isbsg_projects")


In [5]:
# Store the precomputed embeddings in ChromaDB, one request per batch of rows.
# upsert is used instead of add so this cell can be re-run against the persistent
# collection: ids that already exist are overwritten rather than added again.
chroma_batch_size = 100  # rows per request; kept small to stay under Chroma's batch limit
metadata_columns = [key for key in df.columns if key not in ["project_description", "embedding"]]
skipped_ids = []

for start in range(0, len(df), chroma_batch_size):
    ids, vectors, metadatas, documents = [], [], [], []

    for index, row in df.iloc[start:start + chroma_batch_size].iterrows():
        if row["embedding"] is None:
            # get_embeddings leaves None for failed batches; there is no vector to store
            skipped_ids.append(str(index))
            continue
        ids.append(str(index))  # Use index as unique ID
        vectors.append(row["embedding"])  # Use stored embedding
        metadatas.append({key: row[key] for key in metadata_columns})  # Store metadata
        documents.append(row["project_description"])  # Store project description

    if ids:
        collection.upsert(
            ids=ids,
            embeddings=vectors,
            metadatas=metadatas,
            documents=documents,
        )

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} rows without an embedding: {skipped_ids[:10]}")
