In [16]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import Dataset
from ragas.metrics import context_precision, faithfulness, answer_correctness
from ragas import evaluate
from dotenv import load_dotenv

import chromadb
from openai import OpenAI
import openai

# Let python-dotenv populate the process environment first, then hand the
# OpenAI key to the openai module (the value is None when the variable is unset).
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [17]:
# Open the persisted Chroma store and the collection holding the project records.
chroma_client = chromadb.PersistentClient(path="../chroma_db")
collection = chroma_client.get_collection(name="isbsg_projects")

# Fetch all items
all_projects = collection.get(include=["embeddings", "documents", "metadatas"])

# Column used both as the filter below and as the ground truth later on.
EFFORT_COLUMN = "Normalised Work Effort"

# A record may come back with no metadata at all; treat that as an empty dict
# so the per-key lookups below never hit None.
# NOTE(review): presumably only happens for records stored without metadata -- confirm.
metadatas = [md or {} for md in all_projects["metadatas"]]

# Use the ordered union of keys across *all* records. Taking only the first
# record's keys silently drops any field that record happens to lack
# (including the effort column) and raises IndexError on an empty collection.
metadata_keys = list(dict.fromkeys(key for md in metadatas for key in md))

# Convert to DataFrame: one row per project, one column per metadata key.
projects_all = pd.DataFrame({
    "id": all_projects["ids"],
    "embedding": [np.array(e) for e in all_projects["embeddings"]],
    "project_text": all_projects["documents"],
    **{key: [md.get(key) for md in metadatas] for key in metadata_keys},
})

if EFFORT_COLUMN not in projects_all.columns:
    raise ValueError(
        f"No record in the collection carries a '{EFFORT_COLUMN}' metadata field"
    )

# Drop projects with missing effort
df = projects_all[projects_all[EFFORT_COLUMN].notnull()].reset_index(drop=True)

print(f"Total usable projects: {len(df)}")


Total usable projects: 6762


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the projects for evaluation; the fixed seed keeps the split stable.
retrieval_df, evaluation_df = train_test_split(df, test_size=0.2, random_state=42)


def _with_json_embeddings(frame):
    """Return a copy of ``frame`` whose ``embedding`` column holds JSON list strings.

    The column contains numpy arrays; written to CSV as-is they become their
    ``str()`` form (space separated, and elided with ``...`` for long vectors),
    which cannot be parsed back. A JSON list round-trips via
    ``pd.read_csv(path, converters={"embedding": json.loads})``.
    """
    return frame.assign(
        embedding=frame["embedding"].map(lambda vec: json.dumps(np.asarray(vec).tolist()))
    )


# Save evaluation and retrieval splits (for resuming later)
_with_json_embeddings(retrieval_df).to_csv("retrieval_projects.csv", index=False)
_with_json_embeddings(evaluation_df).to_csv("evaluation_projects.csv", index=False)

print(f"Retrieval  projects: {len(retrieval_df)}")
print(f"Evaluation projects: {len(evaluation_df)}")

Retrieval  projects: 5409
Evaluation projects: 1353


In [19]:
def get_effort_estimation(prompt, model="o1-mini"):
    """Send ``prompt`` to an OpenAI chat model and return the reply text.

    Args:
        prompt: Full prompt text; sent as a single user message.
        model: Name of the chat model to call. Defaults to "o1-mini".

    Returns:
        The first choice's message content with surrounding whitespace removed.

    Raises:
        ValueError: If the response has no choices or the first choice carries
            no text content. Raised explicitly so the caller sees a clear
            message instead of an IndexError/AttributeError.
        Exception: Errors raised by the OpenAI client propagate unchanged.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt},
        ],
    )
    if not response.choices:
        raise ValueError("OpenAI response contained no choices")

    content = response.choices[0].message.content
    if content is None:
        # The content field is optional in the response schema.
        raise ValueError("OpenAI response contained no text content")
    return content.strip()


In [20]:
import re

def remove_actual_effort_line(text):
    """Return ``text`` without the sentence that states the actual work effort.

    Every occurrence of the fixed sentence (plus any whitespace following it)
    is deleted, then leading and trailing whitespace is trimmed from the result.
    """
    effort_sentence = re.compile(
        r"The normalized work effort for the project was [\d\.]+ person-hours\.\s*"
    )
    return effort_sentence.sub('', text).strip()


In [21]:
import time

# Checkpoint files
efforts_file = "estimated_efforts.json"
contexts_file = "contexts.json"

TOP_K = 5             # similar past projects shown to the model
CANDIDATE_POOL = 50   # neighbours fetched before filtering down to the retrieval split
REQUEST_PAUSE_S = 1   # pause between API calls to stay under the rate limit


def _load_checkpoint(path):
    """Return the JSON object stored at ``path``, or an empty dict if the file is absent."""
    if not os.path.exists(path):
        return {}
    with open(path) as f:
        return json.load(f)


def _save_checkpoint(path, payload):
    """Write ``payload`` as JSON to a temp file, then rename it over ``path``.

    The rename keeps an interrupted run from leaving a half-written checkpoint
    that would fail to parse on the next start.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(payload, f)
    os.replace(tmp_path, path)


# Load progress
# NOTE: checkpoints written before retrieval was restricted to the retrieval
# split still hold the old contexts; delete both files to regenerate them.
estimated_efforts = _load_checkpoint(efforts_file)
contexts_dict = _load_checkpoint(contexts_file)

# Reload Chroma collection
collection = chroma_client.get_collection(name="isbsg_projects")

# Only projects from the retrieval split may be shown as "similar past projects".
# The collection also contains every evaluation project, so an unfiltered query
# returns the evaluated project itself (whose document states its actual effort)
# as well as other held-out projects.
retrieval_ids = set(retrieval_df["id"])
n_candidates = min(CANDIDATE_POOL, collection.count())

print("Starting evaluation...")
for idx in tqdm(range(len(evaluation_df))):
    row = evaluation_df.iloc[idx]
    project_id = row["id"]

    # Skip if already done (both the reply and its contexts are needed downstream)
    if project_id in estimated_efforts and project_id in contexts_dict:
        continue

    # Step 1: Embed evaluation project
    # NOTE(review): the stored embedding was presumably computed from the full
    # document, including the actual-effort sentence -- confirm and re-embed
    # the stripped text if so.
    query_text = remove_actual_effort_line(row["project_text"])
    query_embedding = np.asarray(row["embedding"]).tolist()

    # Step 2: Query similar projects, over-fetching so that TOP_K remain after
    # dropping everything outside the retrieval split
    results = collection.query(query_embeddings=[query_embedding], n_results=n_candidates)
    top_docs = [
        doc
        for hit_id, doc in zip(results["ids"][0], results["documents"][0])
        if hit_id in retrieval_ids
    ][:TOP_K]
    if len(top_docs) < TOP_K:
        print(f"Only {len(top_docs)} retrieval-split neighbours found at index {idx}")

    # Step 3: Construct prompt
    prompt = f"""
You are an expert software project estimation assistant. Based on the following information, estimate the effort in person-hours for the new software project.

New Project Description:
{query_text}

Similar Past Project Descriptions:"""
    for i, desc in enumerate(top_docs):
        prompt += f"\nProject {i+1}:\n{desc}"

    prompt += "\nPlease provide the estimated effort in person-hours for the new project. Format your answer exactly like:\nThe effort for that project will be XXX person-hours."

    # Step 4: Estimate effort
    try:
        effort_response = get_effort_estimation(prompt)
    except Exception as e:
        # Best effort: report and move on; the project is retried on the next run
        print(f"API failed at index {idx}: {e}")
        continue

    # Record contexts and reply together so the two checkpoints stay in sync
    contexts_dict[project_id] = top_docs
    estimated_efforts[project_id] = effort_response

    # Save progress (contexts first, so a saved reply always has saved contexts)
    _save_checkpoint(contexts_file, contexts_dict)
    _save_checkpoint(efforts_file, estimated_efforts)

    # Wait to avoid rate limit
    time.sleep(REQUEST_PAUSE_S)

print("Evaluation collection complete.")


Starting evaluation...


100%|██████████| 1353/1353 [00:00<00:00, 10309.67it/s]

Evaluation collection complete.





In [22]:
# Column-oriented container: four parallel lists, one entry per evaluated project.
data_samples = {
    column: [] for column in ('question', 'answer', 'contexts', 'ground_truth')
}

for position in range(len(evaluation_df)):
    row = evaluation_df.iloc[position]
    project_id = row["id"]

    # A project is usable only when both the model reply and its contexts exist.
    if project_id in estimated_efforts and project_id in contexts_dict:
        sample = {
            # Question: the project description with the actual-effort sentence removed
            'question': remove_actual_effort_line(row["project_text"]),
            # Ground truth is phrased in the same sentence format requested from the model
            'ground_truth': f"The effort for that project will be {int(row['Normalised Work Effort'])} person-hours.",
            'contexts': contexts_dict[project_id],
            'answer': estimated_efforts[project_id],
        }
        for column, value in sample.items():
            data_samples[column].append(value)

print(f"Final samples for evaluation: {len(data_samples['question'])}")


Final samples for evaluation: 1353


In [23]:
# Metrics to score: retrieval quality, groundedness of the reply, and agreement
# with the ground-truth sentence.
metrics = [
    context_precision,
    faithfulness,
    answer_correctness,
]

# Wrap the collected columns in a Hugging Face Dataset for RAGAS.
dataset = Dataset.from_dict(data_samples)

print("Evaluating using RAGAS...")
scores = evaluate(dataset, metrics=metrics)

# Persist the per-sample scores.
scores_df = scores.to_pandas()
scores_df.to_csv("evaluation_final_scores.csv", index=False)

# Per-sample table first, then the mean of each metric column.
print("\n Evaluation Scores:")
print(scores_df)

print(f"\nAverage Context Precision: {scores_df['context_precision'].mean():.4f}")
print(f"Average Faithfulness: {scores_df['faithfulness'].mean():.4f}")
print(f"Average Answer Correctness: {scores_df['answer_correctness'].mean():.4f}")

Evaluating using RAGAS...


Evaluating: 100%|██████████| 4059/4059 [32:27<00:00,  2.08it/s]  



 Evaluation Scores:
                                             user_input  \
0     A Government industry project was developed by...   
1     A Communication industry project was developed...   
2     A Insurance industry project was developed by ...   
3     A Government industry project was developed by...   
4     A Communication industry project was developed...   
...                                                 ...   
1348  A Communication industry project was developed...   
1349  A Services industry project was developed by a...   
1350  A Government industry project was developed by...   
1351  A Communication industry project was developed...   
1352  A Medical & Health Care industry project was d...   

                                     retrieved_contexts  \
0     [A Government industry project was developed b...   
1     [A Communication industry project was develope...   
2     [A Insurance industry project was developed by...   
3     [A Government industry proje