In [8]:
import pandas as pd
import tiktoken
from openai import OpenAI
# Prerequisite: the packages imported above (pandas, tiktoken, openai) must be installed in the kernel environment.
# Load the scenarios table from disk; later cells add columns to `df`.
SCENARIOS_CSV = 'swebench_scenarios.csv'
df = pd.read_csv(SCENARIOS_CSV)

In [9]:
# Category labels paired with two short natural-language exemplars each.
# Kept as ordered (label, exemplars) pairs and turned into a dict of lists
# below, so iteration order over the labels matches the order written here.
_LABEL_EXEMPLAR_PAIRS = (
    ("debugging", (
        "Fixing a logic or runtime error in existing code.",
        "Adjusting conditions or variable handling to resolve test failures.",
    )),
    ("api use", (
        "Using or integrating an external API such as requests or boto3.",
        "Calling REST endpoints or SDK client methods.",
    )),
    ("testing", (
        "Writing or modifying unit tests.",
        "Adding assertions to verify correct behavior.",
    )),
    ("security", (
        "Adding authentication or permission checks.",
        "Sanitizing input or handling secrets safely.",
    )),
    ("performance optimization", (
        "Improving speed or memory efficiency.",
        "Implementing caching or vectorized computation.",
    )),
    ("refactoring", (
        "Restructuring or renaming code without changing behavior.",
        "Extracting functions or simplifying logic.",
    )),
)

label_exemplars = {
    name: list(sentences) for name, sentences in _LABEL_EXEMPLAR_PAIRS
}


In [10]:
from openai import OpenAI
import numpy as np

# Shared OpenAI client used by get_embedding below. It is constructed with no
# arguments, so credentials are presumably picked up from the environment
# (e.g. OPENAI_API_KEY) -- confirm for your setup.
client = OpenAI()

def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Parameters
    ----------
    text : str
        Text to embed; sent unchanged as the request's ``input``.
    model : str, optional
        Name of the embedding model. Defaults to "text-embedding-3-large",
        the value this notebook previously hard-coded, so existing
        one-argument calls behave exactly as before.

    Returns
    -------
    The ``embedding`` attribute of the first item in the response's ``data``
    (presumably a list of floats -- confirm against the SDK version in use).

    Notes
    -----
    Every call issues one API request through the module-level ``client``;
    errors raised by the SDK propagate to the caller.
    """
    response = client.embeddings.create(model=model, input=text)
    # A single input string is sent, so only the first result is read.
    return response.data[0].embedding

# Build one vector per label: embed each exemplar sentence, then take the
# element-wise mean of those embeddings.
label_vectors = {}
for label in label_exemplars:
    examples = label_exemplars[label]
    embs = list(map(get_embedding, examples))
    label_vectors[label] = np.asarray(embs).mean(axis=0)


print(label_vectors)


{'debugging': array([ 0.00021056,  0.01060644, -0.01057511, ..., -0.01309392,
       -0.01076047,  0.00256218], shape=(3072,)), 'api use': array([-9.21313046e-03,  3.58155562e-03, -6.37071521e-03, ...,
       -6.57979399e-07, -1.16467951e-02,  1.73384539e-02], shape=(3072,)), 'testing': array([ 0.0109779 ,  0.03433045, -0.01298303, ..., -0.00826629,
       -0.02069822, -0.01241162], shape=(3072,)), 'security': array([-0.01238621, -0.01560326, -0.01158076, ..., -0.01327529,
       -0.01338913, -0.00070311], shape=(3072,)), 'performance optimization': array([-0.02471134,  0.00292533, -0.01504645, ..., -0.01923638,
       -0.00131984, -0.00119789], shape=(3072,)), 'refactoring': array([-0.00261719,  0.01564523, -0.0080479 , ..., -0.01772625,
       -0.01034873, -0.00199171], shape=(3072,))}


In [None]:
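# ALTERNATIVE (disabled): build label_vectors locally with a Hugging Face sentence-transformers model instead of the OpenAI API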

# from sentence_transformers import SentenceTransformer
# import numpy as np

# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# label_vectors = {
#     label: np.mean(model.encode(examples, normalize_embeddings=True), axis=0)
#     for label, examples in label_exemplars.items()
# }


In [11]:
# get embeddings for swe bench input + output

def _text_or_empty(value):
    """Return ``value`` as text, mapping missing values (None/NaN/NA) to ""."""
    if isinstance(value, str):
        return value
    # NaN is truthy, so a plain `value or ""` guard lets it through and the
    # later string concatenation raises TypeError; test for it explicitly.
    if value is None or pd.isna(value):
        return ""
    return str(value)


def combine_text(row, max_chars=3000):
    """Join a row's problem statement and reference output into one string.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["problem_statement"]`` and
        ``row["reference_output"]`` (e.g. a DataFrame row or a dict).
    max_chars : int, optional
        Maximum length of the returned string; the joined text is cut to its
        first ``max_chars`` characters.

    Returns
    -------
    str
        ``problem_statement + "\\n" + reference_output`` truncated to
        ``max_chars`` characters. Missing fields (None, NaN, pd.NA)
        contribute an empty string instead of raising.
    """
    problem = _text_or_empty(row["problem_statement"])
    reference = _text_or_empty(row["reference_output"])
    text = problem + "\n" + reference
    return text[:max_chars]

# Row-wise: join problem statement and reference output into one text field.
combined_text = df.apply(combine_text, axis=1)
df["combined"] = combined_text

# Embed each combined text with the same get_embedding used for the label
# exemplars (one call per row).
df["embedding"] = df["combined"].map(get_embedding)


In [12]:
# Preview the first rows to confirm the `combined` and `embedding` columns were added.
df.head()

Unnamed: 0,scenario_id,name,problem_statement,reference_output,combined,embedding
0,scn_2zmp0KdIY5itVDOEH2O8E,astropy__astropy-14539,`io.fits.FITSDiff` may sometimes report differ...,diff --git a/astropy/io/fits/diff.py b/astropy...,`io.fits.FITSDiff` may sometimes report differ...,"[-0.008452149108052254, -0.012593905441462994,..."
1,scn_2zmp0EqlSNVLcTgHkFZ6F,astropy__astropy-12907,Modeling's `separability_matrix` does not comp...,diff --git a/astropy/modeling/separable.py b/a...,Modeling's `separability_matrix` does not comp...,"[-0.0054789441637694836, -0.017074495553970337..."
2,scn_2zmp0GuMgYGInwVXBDTi4,astropy__astropy-13453,ASCII table output to HTML does not support su...,diff --git a/astropy/io/ascii/html.py b/astrop...,ASCII table output to HTML does not support su...,"[0.002106586005538702, -0.0010695847449824214,..."
3,scn_2zmp0FNKBaWxeyag3pcOo,astropy__astropy-13033,TimeSeries: misleading exception when required...,diff --git a/astropy/timeseries/core.py b/astr...,TimeSeries: misleading exception when required...,"[0.009934665635228157, 0.006429404020309448, 0..."
4,scn_2zmy1cs5o86orY3mw2ibH,astropy__astropy-7606,Unit equality comparison with None raises Type...,diff --git a/astropy/units/core.py b/astropy/u...,Unit equality comparison with None raises Type...,"[-0.00738839665427804, 0.010516422800719738, -..."


In [13]:
from numpy.linalg import norm

def cosine_sim(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        One-dimensional numeric vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned in that case instead of the
        NaN (plus RuntimeWarning) that a 0/0 division would produce.
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

def classify_embedding(emb, multi_threshold=0.85):
    """Score an embedding against every label vector and pick the best label.

    Parameters
    ----------
    emb : array-like
        Embedding to classify; compared with each entry of the module-level
        ``label_vectors`` using ``cosine_sim``.
    multi_threshold : float, optional
        Fraction of the top score a label must reach to be listed in
        ``"multi"``. Defaults to 0.85, the value previously hard-coded.

    Returns
    -------
    dict
        ``"best"``   -- label with the highest similarity,
        ``"scores"`` -- mapping of every label to its similarity,
        ``"multi"``  -- labels scoring at least ``multi_threshold`` times the
        top score; the best label is always included.
    """
    sims = {lbl: cosine_sim(emb, vec) for lbl, vec in label_vectors.items()}
    top_label = max(sims, key=sims.get)
    top_score = sims[top_label]
    cutoff = multi_threshold * top_score
    # When top_score is negative the cutoff lies above it, so a bare
    # `sc >= cutoff` test would drop even the best label and return an empty
    # list; keep the best label unconditionally.
    multi_labels = [
        lbl for lbl, sc in sims.items()
        if lbl == top_label or sc >= cutoff
    ]
    return {"best": top_label, "scores": sims, "multi": multi_labels}

# Classify every row embedding once, keep the full result dict, and expose
# its "best" entry as a plain label column.
classified = df["embedding"].apply(classify_embedding)
df["embedding_class"] = classified
df["pred_label"] = classified.map(lambda result: result["best"])


In [17]:
# distribution info -- printed explicitly: only the last expression of a cell
# is displayed automatically, so a bare value_counts() here would be discarded
print(df["pred_label"].value_counts())

# sample qualitative check (fixed random_state so the same rows are shown on re-run)
df[["problem_statement", "pred_label", "embedding_class"]].sample(5, random_state=42)


Unnamed: 0,problem_statement,pred_label,embedding_class
361,`SparseCoder` doesn't expose `max_iter` for `L...,performance optimization,"{'best': 'performance optimization', 'scores':..."
73,Change in behaviour when saving a model instan...,testing,"{'best': 'testing', 'scores': {'debugging': 0...."
374,"IterativeImputer has no parameter ""fill_value""...",debugging,"{'best': 'debugging', 'scores': {'debugging': ..."
155,Missing import statement in generated migratio...,testing,"{'best': 'testing', 'scores': {'debugging': 0...."
104,limit_choices_to on a ForeignKey can render du...,debugging,"{'best': 'debugging', 'scores': {'debugging': ..."


In [18]:
# Write the labelled scenarios to CSV, keeping only the columns listed below
# and omitting the row index.
export_columns = [
    "scenario_id",
    "problem_statement",
    "reference_output",
    "pred_label",
    "embedding_class",
]
df[export_columns].to_csv("swebench_labeled_embeddings.csv", index=False)