In [1]:
from transformers import pipeline
import pandas as pd
import re
from tqdm import tqdm

In [2]:
# Read the SWE-bench scenario table from the local CSV and preview the first two rows.
df = pd.read_csv("swebench_scenarios.csv")
df.head(n=2)

Unnamed: 0,scenario_id,name,problem_statement,reference_output
0,scn_2zmp0KdIY5itVDOEH2O8E,astropy__astropy-14539,`io.fits.FITSDiff` may sometimes report differ...,diff --git a/astropy/io/fits/diff.py b/astropy...
1,scn_2zmp0EqlSNVLcTgHkFZ6F,astropy__astropy-12907,Modeling's `separability_matrix` does not comp...,diff --git a/astropy/modeling/separable.py b/a...


In [3]:
# Candidate categories that the zero-shot classifier scores each text against.
labels = ["api", "security"]

# Zero-shot classification pipeline; device placement is delegated via
# device_map="auto" (the recorded run reported using "mps").
classifier = pipeline(
    "zero-shot-classification",
    device_map="auto",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
)

Device set to use mps


In [4]:
# Classify each problem statement against the candidate labels (multi-label)

def classify_prompt(prompt, threshold=0.3):
    """Return the candidate labels the zero-shot classifier assigns to ``prompt``.

    Args:
        prompt: Text to classify (in this notebook, a problem statement).
        threshold: Minimum score a label must reach to be kept. Defaults to
            0.3, the cut-off that was previously hard-coded.

    Returns:
        list[str]: Labels from the module-level ``labels`` whose score is at
        least ``threshold``. Empty when no label qualifies or when ``prompt``
        is not usable text.
    """
    # Empty CSV cells are read by pandas as NaN (a float); there is nothing to
    # classify in that case, nor in a blank string, so skip the model call.
    if not isinstance(prompt, str) or not prompt.strip():
        return []

    # multi_label=True: every label is kept or dropped on its own score.
    result = classifier(prompt, candidate_labels=labels, multi_label=True)
    return [
        lbl
        for lbl, score in zip(result["labels"], result["scores"])
        if score >= threshold
    ]

# Register tqdm's `progress_apply` on pandas objects, then label every
# problem statement with the zero-shot classifier.
tqdm.pandas()
df["text_labels"] = (
    df["problem_statement"]
    .progress_apply(classify_prompt)
)


100%|██████████| 500/500 [04:13<00:00,  1.98it/s]


In [5]:
# Heuristic labeling: tag each reference diff using regex keyword matches
# (label, compiled pattern) pairs searched in the raw diff text. Several
# patterns may map to the same label; the label is emitted once.
_DIFF_LABEL_PATTERNS = (
    # API use: imports of HTTP / cloud client libraries ...
    ("api", re.compile(r"\b(import|from)\s+(requests|httpx|urllib|boto3|openai|google|api)")),
    # ... or HTTP-verb style method calls.
    # NOTE(review): this also matches e.g. `some_dict.get(` — confirm the
    # false-positive rate is acceptable for this labeling task.
    ("api", re.compile(r"\.(get|post|put|delete)\s*\(")),
    # Security-related keywords (case-insensitive).
    ("security", re.compile(r"(sanitize|escape|encode|auth|permission|csrf|token|secret|encrypt)", re.I)),
    # Disabled heuristics, kept for reference:
    # ("testing", re.compile(r"test_|assert")),
    # ("performance optimization", re.compile(r"(optimiz|cache|speed|latenc|profil|batch|vectoriz|memoiz)", re.I)),
    # ("refactoring", re.compile(r"(rename|cleanup|refactor|restructure|simplif|extract|move function)", re.I)),
)


def classify_diff(diff):
    """Assign heuristic labels to a patch based on regex matches in its text.

    Args:
        diff: Unified diff text (the ``reference_output`` column).

    Returns:
        list[str]: Unique labels whose pattern matched, sorted alphabetically
        so repeated runs produce identical output. Empty when nothing matches
        or when ``diff`` is not a string (e.g. NaN from an empty CSV cell).
    """
    if not isinstance(diff, str):
        return []

    # Both API patterns emit exactly "api"; the previous version added
    # "api " (trailing space) for verb calls, creating a second label.
    found = {label for label, pattern in _DIFF_LABEL_PATTERNS if pattern.search(diff)}
    return sorted(found)

# Run the regex heuristics over every reference patch, then preview the frame.
df["diff_labels"] = (
    df["reference_output"]
    .progress_apply(classify_diff)
)

df.head()

100%|██████████| 500/500 [00:00<00:00, 13101.22it/s]


Unnamed: 0,scenario_id,name,problem_statement,reference_output,text_labels,diff_labels
0,scn_2zmp0KdIY5itVDOEH2O8E,astropy__astropy-14539,`io.fits.FITSDiff` may sometimes report differ...,diff --git a/astropy/io/fits/diff.py b/astropy...,[api],[]
1,scn_2zmp0EqlSNVLcTgHkFZ6F,astropy__astropy-12907,Modeling's `separability_matrix` does not comp...,diff --git a/astropy/modeling/separable.py b/a...,[],[]
2,scn_2zmp0GuMgYGInwVXBDTi4,astropy__astropy-13453,ASCII table output to HTML does not support su...,diff --git a/astropy/io/ascii/html.py b/astrop...,[api],[]
3,scn_2zmp0FNKBaWxeyag3pcOo,astropy__astropy-13033,TimeSeries: misleading exception when required...,diff --git a/astropy/timeseries/core.py b/astr...,[],[]
4,scn_2zmy1cs5o86orY3mw2ibH,astropy__astropy-7606,Unit equality comparison with None raises Type...,diff --git a/astropy/units/core.py b/astropy/u...,[],[]


In [6]:
# Merge the text-based and diff-based labels into a single list per scenario
def merge_labels(row):
    """Combine a row's text-based and diff-based labels into one normalized list.

    Args:
        row: Mapping-like row exposing ``"text_labels"`` and ``"diff_labels"``,
            each an iterable of label strings.

    Returns:
        list[str]: Unique labels with surrounding whitespace stripped, sorted
        alphabetically so the exported file is identical across runs.
    """
    merged = set()
    for column in ("text_labels", "diff_labels"):
        for label in row[column]:
            # Normalize so variants differing only in whitespace collapse
            # into a single label; drop labels that are empty after stripping.
            cleaned = label.strip()
            if cleaned:
                merged.add(cleaned)
    # if not merged:
    #     merged.add("debugging")
    return sorted(merged)

# Build the combined label list row by row (axis=1), then preview the frame.
df["final_labels"] = df.apply(
    merge_labels,
    axis=1,
)
df.head()

Unnamed: 0,scenario_id,name,problem_statement,reference_output,text_labels,diff_labels,final_labels
0,scn_2zmp0KdIY5itVDOEH2O8E,astropy__astropy-14539,`io.fits.FITSDiff` may sometimes report differ...,diff --git a/astropy/io/fits/diff.py b/astropy...,[api],[],[api]
1,scn_2zmp0EqlSNVLcTgHkFZ6F,astropy__astropy-12907,Modeling's `separability_matrix` does not comp...,diff --git a/astropy/modeling/separable.py b/a...,[],[],[]
2,scn_2zmp0GuMgYGInwVXBDTi4,astropy__astropy-13453,ASCII table output to HTML does not support su...,diff --git a/astropy/io/ascii/html.py b/astrop...,[api],[],[api]
3,scn_2zmp0FNKBaWxeyag3pcOo,astropy__astropy-13033,TimeSeries: misleading exception when required...,diff --git a/astropy/timeseries/core.py b/astr...,[],[],[]
4,scn_2zmy1cs5o86orY3mw2ibH,astropy__astropy-7606,Unit equality comparison with None raises Type...,diff --git a/astropy/units/core.py b/astropy/u...,[],[],[]


In [7]:
# Write the scenario id, source texts and merged labels to CSV, omitting the row index.
df[
    ["scenario_id", "problem_statement", "reference_output", "final_labels"]
].to_csv("swebench_labeled.csv", index=False)