In [3]:
!pip install datasets
!pip install --upgrade openai
!pip install tiktoken

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
import os
import ast
import shutil
import hashlib
import requests
import numpy as np
import pandas as pd
from git import Repo
from pathlib import Path
from collections import Counter
from datasets import load_dataset
import openai
import tiktoken

# Load SWE Bench Verified

In [3]:
def get_modified_files(patch: str):
    """Gets the list of modified files from a patch.

    Args:
        patch (str): git diff patch

    Returns:
        list[str]: Modified files
    """
    modified_files = []
    for line in patch.splitlines():
        if line.startswith("diff --git"):
            parts = line.split()
            if len(parts) > 2:
                # The third part is the file path prefixed with 'a/'
                modified_file = parts[2].replace("a/", "", 1)
                modified_files.append(modified_file)
    return modified_files


def get_swebench_dataset() -> pd.DataFrame:
    """Loads the SWE-bench dataset."""
    ds = load_dataset("princeton-nlp/SWE-bench_Verified")
    df = pd.DataFrame(ds["test"])
    df["modified_files"] = df["patch"].apply(get_modified_files)
    df = df[
        ["repo", "base_commit", "modified_files", "problem_statement", "hints_text"]
    ]
    return df

In [4]:
swebench_df = get_swebench_dataset()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.23k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.09M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [5]:
swebench_df

Unnamed: 0,repo,base_commit,modified_files,problem_statement,hints_text
0,astropy/astropy,d16bfe05a744909de4b27f5875fe0d4ed41ce607,[astropy/modeling/separable.py],Modeling's `separability_matrix` does not comp...,
1,astropy/astropy,298ccb478e6bf092953bca67a3d29dc6c35f6752,[astropy/timeseries/core.py],TimeSeries: misleading exception when required...,The relevant code that produces the misleading...
2,astropy/astropy,6ed769d58d89380ebaa1ef52b300691eefda8928,[astropy/table/table.py],Consider removing auto-transform of structured...,@mhvk - I'm happy to do this PR if you think i...
3,astropy/astropy,6500928dc0e57be8f06d1162eacc3ba5e2eff692,[astropy/coordinates/builtin_frames/__init__.p...,A direct approach to ITRS to Observed transfor...,"cc @StuartLittlefair, @adrn, @eteq, @eerovaher..."
4,astropy/astropy,19cc80471739bcb67b7e8099246b391c355023ee,[astropy/io/ascii/html.py],ASCII table output to HTML does not support su...,Welcome to Astropy 👋 and thank you for your fi...
...,...,...,...,...,...
495,sympy/sympy,e8c22f6eac7314be8d92590bfff92ced79ee03e2,[sympy/physics/units/unitsystem.py],collect_factor_and_dimension does not detect e...,
496,sympy/sympy,809c53c077485ca48a206cee78340389cb83b7f1,[sympy/combinatorics/homomorphisms.py],`_check_homomorphism` is broken on Permutation...,
497,sympy/sympy,193e3825645d93c73e31cdceb6d742cc6919624d,[sympy/polys/rings.py],`PolyElement.as_expr()` not accepting symbols\...,
498,sympy/sympy,b1cb676cf92dd1a48365b731979833375b188bf2,[sympy/core/numbers.py],"Rational calc value error\npython 3.11, sympy ...",This should probably raise an error. The expec...


In [6]:
unique_repos = set(swebench_df["repo"])
print("Number of unique repos:",len(unique_repos))
for ind,repo in enumerate(unique_repos):
    print(str(ind+1)+")",repo)

Number of unique repos: 12
1) mwaskom/seaborn
2) sphinx-doc/sphinx
3) pytest-dev/pytest
4) django/django
5) astropy/astropy
6) psf/requests
7) pydata/xarray
8) scikit-learn/scikit-learn
9) pylint-dev/pylint
10) pallets/flask
11) sympy/sympy
12) matplotlib/matplotlib


In [7]:
counter = Counter(swebench_df["repo"])
print("Number of rows per repo:",counter)

Number of rows per repo: Counter({'django/django': 231, 'sympy/sympy': 75, 'sphinx-doc/sphinx': 44, 'matplotlib/matplotlib': 34, 'scikit-learn/scikit-learn': 32, 'astropy/astropy': 22, 'pydata/xarray': 22, 'pytest-dev/pytest': 19, 'pylint-dev/pylint': 10, 'psf/requests': 8, 'mwaskom/seaborn': 2, 'pallets/flask': 1})


In [8]:
counter = Counter([len(files) for files in swebench_df["modified_files"]])
print("Number of modified files per row:",counter)

Number of modified files per row: Counter({1: 429, 2: 49, 3: 12, 4: 7, 5: 1, 21: 1, 6: 1})


In [9]:
rows_mult = swebench_df[swebench_df['modified_files'].apply(lambda x: len(x) > 4)]
rows_mult

Unnamed: 0,repo,base_commit,modified_files,problem_statement,hints_text
52,django/django,a5308514fb4bc5086c9a16a8a24a945eeebb073c,"[django/core/mail/message.py, django/core/mail...",Email messages crash on non-ASCII domain when ...,Thanks for the report. Simple encoding should ...
431,sympy/sympy,d1320814eda6549996190618a21eaf212cfd4d1e,"[sympy/core/basic.py, sympy/core/exprtools.py,...","Return NotImplemented, not False, upon rich co...",Classes are generally required to subclass fro...
455,sympy/sympy,6fd65310fa3167b9626c38a5487e171ca407d988,"[sympy/assumptions/ask.py, sympy/assumptions/a...",a.is_even does not imply a.is_finite\nI'm not ...,Has anyone tried to represent SymPy's assumpti...


In [15]:
for id,row in rows_mult.iterrows():
    print(len(row['modified_files']))
    print(row['modified_files'])
    print(row['problem_statement'])

5
['django/core/mail/message.py', 'django/core/mail/utils.py', 'django/core/validators.py', 'django/utils/encoding.py', 'django/utils/html.py']
Email messages crash on non-ASCII domain when email encoding is non-unicode.
Description
	
When the computer hostname is set in unicode (in my case "正宗"), the following test fails: ​https://github.com/django/django/blob/master/tests/mail/tests.py#L368
Specifically, since the encoding is set to iso-8859-1, Python attempts to convert all of the headers to that encoding, including the Message-ID header which has been set here: ​https://github.com/django/django/blob/master/django/core/mail/message.py#L260
This is not just a problem in the tests, Django should be handling the encoding of the message properly
Steps to recreate:
Set hostname to non iso-8859-1 value (i.e. hostname 正宗)
run the mail tests
Fix:
have django.core.mail.utils or django.core.mail.message convert domain name to punycode before using
Test:
from unittest.mock import patch
from dj

In [16]:
rows_mult1 = swebench_df[swebench_df['modified_files'].apply(lambda x: len(x) < 5)]
rows_mult1

Unnamed: 0,repo,base_commit,modified_files,problem_statement,hints_text
0,astropy/astropy,d16bfe05a744909de4b27f5875fe0d4ed41ce607,[astropy/modeling/separable.py],Modeling's `separability_matrix` does not comp...,
1,astropy/astropy,298ccb478e6bf092953bca67a3d29dc6c35f6752,[astropy/timeseries/core.py],TimeSeries: misleading exception when required...,The relevant code that produces the misleading...
2,astropy/astropy,6ed769d58d89380ebaa1ef52b300691eefda8928,[astropy/table/table.py],Consider removing auto-transform of structured...,@mhvk - I'm happy to do this PR if you think i...
3,astropy/astropy,6500928dc0e57be8f06d1162eacc3ba5e2eff692,[astropy/coordinates/builtin_frames/__init__.p...,A direct approach to ITRS to Observed transfor...,"cc @StuartLittlefair, @adrn, @eteq, @eerovaher..."
4,astropy/astropy,19cc80471739bcb67b7e8099246b391c355023ee,[astropy/io/ascii/html.py],ASCII table output to HTML does not support su...,Welcome to Astropy 👋 and thank you for your fi...
...,...,...,...,...,...
495,sympy/sympy,e8c22f6eac7314be8d92590bfff92ced79ee03e2,[sympy/physics/units/unitsystem.py],collect_factor_and_dimension does not detect e...,
496,sympy/sympy,809c53c077485ca48a206cee78340389cb83b7f1,[sympy/combinatorics/homomorphisms.py],`_check_homomorphism` is broken on Permutation...,
497,sympy/sympy,193e3825645d93c73e31cdceb6d742cc6919624d,[sympy/polys/rings.py],`PolyElement.as_expr()` not accepting symbols\...,
498,sympy/sympy,b1cb676cf92dd1a48365b731979833375b188bf2,[sympy/core/numbers.py],"Rational calc value error\npython 3.11, sympy ...",This should probably raise an error. The expec...


# Random Testing - SKIP these Cells

In [9]:
def get_function_info(code):
    tree = ast.parse(code)
    functions = []
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            # Collect different kinds of parameters
            arguments = []
            varlen_arguments = []
            keyword_arguments = []
            positional_arguments = []
            varlen_keyword_arguments = []
            # Positional-only arguments
            if node.args.posonlyargs:
                positional_arguments.extend([arg.arg for arg in node.args.posonlyargs])
            # Regular positional or keyword arguments
            arguments.extend([arg.arg for arg in node.args.args])
            # Variable-length positional arguments (*args)
            if node.args.vararg:
                varlen_arguments.append(f"*{node.args.vararg.arg}")
            # Keyword-only arguments (after '*')
            if node.args.kwonlyargs:
                keyword_arguments.extend([arg.arg for arg in node.args.kwonlyargs])
            # Variable-length keyword arguments (**kwargs)
            if node.args.kwarg:
                varlen_keyword_arguments.append(f"**{node.args.kwarg.arg}")
            code_lines = code.splitlines()
            code_snippet = "\n".join(code_lines[node.lineno - 1 : node.end_lineno])
            # Collect function information
            function_info = {
                "name": node.name,
                "start_line": node.lineno,
                "end_line": node.end_lineno,
                "arguments": arguments,
                "varlen_arguments": varlen_arguments,
                "keyword_arguments": keyword_arguments,
                "positional_arguments": positional_arguments,
                "varlen_keyword_arguments": varlen_keyword_arguments,
                "code_snippet": code_snippet,
            }
            functions.append(function_info)
    return functions

In [9]:
def generate_string_hash(string, hash_algorithm='sha256'):
    return hashlib.new(hash_algorithm, string.encode('utf-8')).hexdigest()

In [10]:
def clone_repository(repo_url, branch=None, commit_hash=None):
    # Clone the repo with url=repo_url into the cloned_repos directory
    # If repo already exists under cloned_repos, run git pull instead to update it
    repo_name = repo_url.split("/")[-1]
    repo_name = repo_name[:-4] if repo_name.endswith(".git") else repo_name
    clone_dir = f"./cloned_repos/{repo_name}"
    try:
        # If the directory exists, perform git pull, else clone it
        if os.path.exists(clone_dir):
            repo = Repo(clone_dir)
            repo.git.checkout("main") # if it was previously checked out to a random commit, switch to main and pull to update
            repo.remotes.origin.pull()
        else:
            Repo.clone_from(repo_url, clone_dir)
        # if a branch is passed as arguments to the method, checkout to that branch
        if branch:
            repo = Repo(clone_dir)
            repo.git.checkout(branch)
        # if a commit_hash is passed as arguments to the method, checkout to that commit_hash
        if commit_hash:
            repo = Repo(clone_dir)
            repo.git.checkout(commit_hash)
        return repo_name, clone_dir
    except Exception as e:
        raise e


def analyze_python_files(clone_dir):
    """Analyze all Python files in the cloned repository, including all subdirectories. Identify all functions found in the files and their metadata (start line, end line, args, etc)"""
    function_infos = []
    file_infos = []
    for root, _, files in os.walk(clone_dir):
        for file in files:
            if file.endswith(".py"):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, "r", encoding='utf-8') as f:
                        code = f.read().strip()
                        # Add all python files into a global list
                        if code:
                            file_infos.append(
                                {"file_path": file_path, "name": file,
                                    "code_snippet": code, "hash": generate_string_hash(code)}
                            )
                        function_info = get_function_info(code)
                        for info in function_info:
                            # Add the file path to each function's information
                            info["file_path"] = file_path
                        # Add found function info to the list
                        function_infos.extend(function_info)
                except Exception as e:
                    print(f"Error for file {file_path} {e}")
    files_list = [f["file_path"] for f in file_infos]
    return file_infos, files_list, function_infos

def remove_directory(cloned_dir):
    shutil.rmtree(cloned_dir)

In [16]:
for repo in unique_repos:
    try:
        repo_url = f"https://github.com/{repo}"
        repo_name, cloned_dir = clone_repository(repo_url=repo_url)
        file_infos, files_list, function_infos = analyze_python_files(cloned_dir)
        # remove_directory(cloned_dir)
        print(f"{repo_name}: Files: {len(files_list)}    Functions: {len(function_infos)}")
    except Exception as e:
        print(f"Encountered error when processing {repo}. Error {e}")

matplotlib: Files: 902    Functions: 10370
Error for file ./cloned_repos/pylint/doc/data/messages/s/syntax-error/bad.py invalid syntax. Perhaps you forgot a comma? (<unknown>, line 3)
Error for file ./cloned_repos/pylint/doc/data/messages/u/using-exception-groups-in-unsupported-version/bad.py invalid syntax (<unknown>, line 9)
Error for file ./cloned_repos/pylint/doc/data/messages/u/using-generic-type-syntax-in-unsupported-version/bad.py invalid syntax (<unknown>, line 1)
Error for file ./cloned_repos/pylint/tests/functional/s/statement_without_effect_py312.py invalid syntax (<unknown>, line 3)
Error for file ./cloned_repos/pylint/tests/functional/s/syntax/syntax_error_jython.py invalid syntax (<unknown>, line 1)
Error for file ./cloned_repos/pylint/tests/functional/s/syntax/syntax_error.py invalid syntax (<unknown>, line 1)
Error for file ./cloned_repos/pylint/tests/functional/t/tokenize_error.py unexpected EOF while parsing (<unknown>, line 4)
Error for file ./cloned_repos/pylint/tes

In [11]:
repo_commits = swebench_df.groupby('repo')['base_commit'].apply(list).to_dict()

In [12]:
repo_commits

{'astropy/astropy': ['d16bfe05a744909de4b27f5875fe0d4ed41ce607',
  '298ccb478e6bf092953bca67a3d29dc6c35f6752',
  '6ed769d58d89380ebaa1ef52b300691eefda8928',
  '6500928dc0e57be8f06d1162eacc3ba5e2eff692',
  '19cc80471739bcb67b7e8099246b391c355023ee',
  '0df94ff7097961e92fd7812036a24b145bc13ca8',
  '5250b2442501e6c671c6b380536f1edb352602d1',
  '1a4462d72eb03f30dc83a879b1dd57aac8b2c18b',
  'a5917978be39d13cd90b517e1de4e7a539ffaa48',
  'cdb66059a2feb44ee49021874605ba90801f9986',
  '7269fa3e33e8d02485a647da91a5a2a60a06af61',
  'fa4e8d1cd279acf9b24560813c8652494ccd5922',
  'a3f4ae6cd24d5ecdf49f213d77b3513dd509a06c',
  'c0a24c1dc957a3b565294213f435fefb2ec99714',
  '80c3854a5f4f4a6ab86c03d9db7854767fcd83c1',
  'b16c7d12ccbc7b2d20364b89fb44285bcbfede54',
  '26d147868f8a891a6009a25cd6a8576d2e1bd747',
  '732d89c2940156bdc0e200bb36dc38b5e424bcba',
  '3cedd79e6c121910220f8e6df77c54a0b344ea94',
  'a7141cd90019b62688d507ae056298507678c058',
  'a85a0747c54bac75e9c3b2fe436b105ea029d6cf',
  'b750a0e6ee76

In [28]:
file_hashes = {'psf/requests': []}
infos = {'psf/requests': []}
for commit in repo_commits['psf/requests']:
    repo = 'psf/requests'
    repo_url = f"https://github.com/{repo}"
    repo_name, cloned_dir = clone_repository(repo_url=repo_url, commit_hash=commit)
    file_infos, files_list, function_infos = analyze_python_files(cloned_dir)
    for info in file_infos:
        if info['hash'] not in file_hashes[repo]:
            file_hashes[repo].append(info['hash'])
            infos[repo].append(info)
    files_hashes = [f["hash"] for f in file_infos]
    print(len(files_list),len(files_hashes))
    remove_directory(cloned_dir)

Error for file ./cloned_repos/requests/requests/packages/urllib3/contrib/ntlmpool.py (unicode error) 'unicodeescape' codec can't decode bytes in position 130-131: truncated \uXXXX escape (<unknown>, line 38)
69 69
74 74
75 75
75 75
82 82
83 83
34 34
34 34


In [29]:
print(len(file_hashes['psf/requests']))
print(len(set(file_hashes['psf/requests'])))

271
271


In [19]:
def get_file_infos(clone_dir):
    """Analyze all Python files in the cloned repository, including all subdirectories. Identify all functions found in the files and their metadata (start line, end line, args, etc)"""
    file_infos = []
    for root, _, files in os.walk(clone_dir):
        for file in files:
            if file.endswith(".py"):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, "r", encoding='utf-8') as f:
                        code = f.read().strip()
                        # Add all python files into a global list
                        if code:
                            file_infos.append(
                                {"file_path": Path(file_path).relative_to(clone_dir), "name": file,
                                    "code_snippet": code, "hash": generate_string_hash(code)}
                            )
                except Exception as e:
                    print(f" --- Error for file {file_path} {e}. Skipping file.")
    return file_infos

In [14]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string("tiktoken is great!", "cl100k_base")

6

In [22]:
hashes = []
infos = []
for commit in repo_commits['psf/requests']:
    repo = 'psf/requests'
    repo_url = f"https://github.com/{repo}"
    repo_name, cloned_dir = clone_repository(repo_url=repo_url, commit_hash=commit)
    file_infos = get_file_infos(cloned_dir)
    for info in file_infos:
        if info['hash'] not in hashes:
            hashes.append(info['hash'])
            infos.append(info)
    remove_directory(cloned_dir)

./cloned_repos/requests/requests/packages/charade/jisfreq.py has 27972 tokens
./cloned_repos/requests/requests/packages/charade/langgreekmodel.py has 10118 tokens
./cloned_repos/requests/requests/packages/charade/gb2312freq.py has 20867 tokens
./cloned_repos/requests/requests/packages/charade/langhungarianmodel.py has 9990 tokens
./cloned_repos/requests/requests/packages/charade/euctwfreq.py has 20489 tokens
./cloned_repos/requests/requests/packages/charade/big5freq.py has 46595 tokens
./cloned_repos/requests/requests/packages/charade/mbcssm.py has 10558 tokens
./cloned_repos/requests/requests/packages/charade/jpcntx.py has 15133 tokens
./cloned_repos/requests/requests/packages/charade/langbulgarianmodel.py has 10160 tokens
./cloned_repos/requests/requests/packages/charade/langthaimodel.py has 9319 tokens
./cloned_repos/requests/requests/packages/charade/langcyrillicmodel.py has 12870 tokens
./cloned_repos/requests/requests/packages/charade/langhebrewmodel.py has 9341 tokens
./cloned_r

# Get Embeddings and Save to CSV

In [23]:
repo_commits = swebench_df.groupby('repo')['base_commit'].apply(list).to_dict()
repo_commits

{'astropy/astropy': ['d16bfe05a744909de4b27f5875fe0d4ed41ce607',
  '298ccb478e6bf092953bca67a3d29dc6c35f6752',
  '6ed769d58d89380ebaa1ef52b300691eefda8928',
  '6500928dc0e57be8f06d1162eacc3ba5e2eff692',
  '19cc80471739bcb67b7e8099246b391c355023ee',
  '0df94ff7097961e92fd7812036a24b145bc13ca8',
  '5250b2442501e6c671c6b380536f1edb352602d1',
  '1a4462d72eb03f30dc83a879b1dd57aac8b2c18b',
  'a5917978be39d13cd90b517e1de4e7a539ffaa48',
  'cdb66059a2feb44ee49021874605ba90801f9986',
  '7269fa3e33e8d02485a647da91a5a2a60a06af61',
  'fa4e8d1cd279acf9b24560813c8652494ccd5922',
  'a3f4ae6cd24d5ecdf49f213d77b3513dd509a06c',
  'c0a24c1dc957a3b565294213f435fefb2ec99714',
  '80c3854a5f4f4a6ab86c03d9db7854767fcd83c1',
  'b16c7d12ccbc7b2d20364b89fb44285bcbfede54',
  '26d147868f8a891a6009a25cd6a8576d2e1bd747',
  '732d89c2940156bdc0e200bb36dc38b5e424bcba',
  '3cedd79e6c121910220f8e6df77c54a0b344ea94',
  'a7141cd90019b62688d507ae056298507678c058',
  'a85a0747c54bac75e9c3b2fe436b105ea029d6cf',
  'b750a0e6ee76

In [17]:
def generate_string_hash(string, hash_algorithm='sha256'):
    return hashlib.new(hash_algorithm, string.encode('utf-8')).hexdigest()

In [57]:
def clone_repository(repo_url, branch=None, commit_hash=None):
    # Clone the repo with url=repo_url into the cloned_repos directory
    # If repo already exists under cloned_repos, run git pull instead to update it
    repo_name = repo_url.split("/")[-1]
    repo_name = repo_name[:-4] if repo_name.endswith(".git") else repo_name
    clone_dir = f"./cloned_repos/{repo_name}"
    try:
        # If the directory exists, perform git pull, else clone it
        if os.path.exists(clone_dir):
            repo = Repo(clone_dir)
            try:
                repo.git.checkout("main") # if it was previously checked out to a random commit, switch to main and pull to update
            except:
                repo.git.checkout("master")
            repo.remotes.origin.pull()
        else:
            Repo.clone_from(repo_url, clone_dir)
        # if a branch is passed as arguments to the method, checkout to that branch
        if branch:
            repo = Repo(clone_dir)
            repo.git.checkout(branch)
        # if a commit_hash is passed as arguments to the method, checkout to that commit_hash
        if commit_hash:
            repo = Repo(clone_dir)
            repo.git.checkout(commit_hash)
        return repo_name, clone_dir
    except Exception as e:
        raise e

In [19]:
def remove_directory(cloned_dir):
    shutil.rmtree(cloned_dir)

In [8]:
def generate_chunk_embedding(code_chunk):
    # Generate embeddings for each chunk of code
    return openai.embeddings.create(
        model="text-embedding-3-small",
        input=code_chunk
    ).data[0].embedding

def aggregate_embeddings(embeddings):
    # Aggregate chunk embeddings
    return np.mean(embeddings, axis=0)

tokenizer = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str) -> int:
    # Return the number of tokens in a text string
    tokens = tokenizer.encode(string)
    return len(tokens)

def chunk_code_by_tokens(code_text, max_tokens=6000):
    # Tokenize the input code
    tokens = tokenizer.encode(code_text)
    # Split the tokens into chunks of max_tokens length
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i + max_tokens]
        # Decode the chunk back into text/code
        chunk_text = tokenizer.decode(chunk)
        chunks.append(chunk_text)
    return chunks

def generate_file_embedding(file_string):
    # if less than 8000 tokens, get single embedding
    if num_tokens_from_string(file_string)<8000:
        return generate_chunk_embedding(file_string)
    # else chunk file, embed chunks, aggregate embeddings
    chunks = chunk_code_by_tokens(file_string)
    embeddings = [generate_chunk_embedding(chunk) for chunk in chunks]
    return aggregate_embeddings(embeddings)

In [38]:
def get_file_infos(clone_dir):
    """Analyze all Python files in the cloned repository, including all subdirectories."""
    file_infos = []
    for root, _, files in os.walk(clone_dir):
        for file in files:
            if file.endswith(".py"):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, "r", encoding='utf-8') as f:
                        code = f.read().strip()
                        # Add all python files into a global list
                        if code:
                            file_infos.append(
                                {"file_path": Path(file_path).relative_to(clone_dir), "name": file,
                                    "code_snippet": code, "hash": generate_string_hash(code)}
                            )
                except Exception as e:
                    print(f" --- Error for file {file_path} {e}. Skipping file.")
    return file_infos

In [3]:
from google.colab import drive
from google.colab import userdata

# Mount Google Drive
drive.mount('/content/drive')

# Set your OpenAI API key
openai.api_key = userdata.get('OPENAI_API_KEY')

Mounted at /content/drive


In [60]:
def format_repo_name(name):
    return "_".join(name.split("/"))

commit_to_files_map = {}
commit_to_hashes_map = {}

for repo in repo_commits:
    if repo.endswith("astropy") or repo.endswith("django") or repo.endswith("matplotlib"):
        print(f"Skipping {repo} as it has already been processed")
    print(f"Processing repo: {repo}")
    infos = []
    hashes = []
    for commit in repo_commits[repo]:
        print(f" - Processing commit {commit} in repo {repo}")
        repo_url = f"https://github.com/{repo}"
        repo_name, cloned_dir = clone_repository(repo_url=repo_url, commit_hash=commit)
        file_infos = get_file_infos(cloned_dir)
        for info in file_infos:
            if info['hash'] not in hashes:
                hashes.append(info['hash'])
                infos.append(info)
        commit_to_files_map[commit] = [info["file_path"] for info in file_infos]
        commit_to_hashes_map[commit] = [info["hash"] for info in file_infos]
        # remove_directory(cloned_dir)
    print(f" - Found {len(infos)} unique files in {repo} across {len(repo_commits[repo])} commits")
    df = pd.DataFrame(infos)
    # print(" - Generating embeddings for them")
    # df['code_embedding'] = df['code_snippet'].apply(lambda x: generate_file_embedding(x))
    info_csv_path = f'/content/drive/My Drive/NLP/Project/data_new/{format_repo_name(repo)}_files_infos.csv'
    df.to_csv(info_csv_path, index=False)
    print(f"CSV file with all necessary file infos for repo {repo_name} has been saved to {info_csv_path}. Contains data on {len(infos)} files.")

Skipping astropy/astropy as it has already been processed
Processing repo: astropy/astropy
 - Processing commit d16bfe05a744909de4b27f5875fe0d4ed41ce607 in repo astropy/astropy
 - Processing commit 298ccb478e6bf092953bca67a3d29dc6c35f6752 in repo astropy/astropy
 - Processing commit 6ed769d58d89380ebaa1ef52b300691eefda8928 in repo astropy/astropy
 - Processing commit 6500928dc0e57be8f06d1162eacc3ba5e2eff692 in repo astropy/astropy
 - Processing commit 19cc80471739bcb67b7e8099246b391c355023ee in repo astropy/astropy
 - Processing commit 0df94ff7097961e92fd7812036a24b145bc13ca8 in repo astropy/astropy
 - Processing commit 5250b2442501e6c671c6b380536f1edb352602d1 in repo astropy/astropy
 - Processing commit 1a4462d72eb03f30dc83a879b1dd57aac8b2c18b in repo astropy/astropy
 - Processing commit a5917978be39d13cd90b517e1de4e7a539ffaa48 in repo astropy/astropy
 - Processing commit cdb66059a2feb44ee49021874605ba90801f9986 in repo astropy/astropy
 - Processing commit 7269fa3e33e8d02485a647da91a5

In [61]:
swebench_df['files_in_repo'] = swebench_df['base_commit'].apply(lambda x: [str(f) for f in commit_to_files_map[x]])
swebench_df['hashes'] = swebench_df['base_commit'].apply(lambda x: commit_to_hashes_map[x])
swebench_df

Unnamed: 0,repo,base_commit,modified_files,problem_statement,hints_text,files_in_repo,hashes
0,astropy/astropy,d16bfe05a744909de4b27f5875fe0d4ed41ce607,[astropy/modeling/separable.py],Modeling's `separability_matrix` does not comp...,,"[setup.py, conftest.py, astropy/logger.py, ast...",[f3938f6642f4286182c4254164d30ea344da1f57586d6...
1,astropy/astropy,298ccb478e6bf092953bca67a3d29dc6c35f6752,[astropy/timeseries/core.py],TimeSeries: misleading exception when required...,The relevant code that produces the misleading...,"[setup.py, conftest.py, astropy/logger.py, ast...",[f3938f6642f4286182c4254164d30ea344da1f57586d6...
2,astropy/astropy,6ed769d58d89380ebaa1ef52b300691eefda8928,[astropy/table/table.py],Consider removing auto-transform of structured...,@mhvk - I'm happy to do this PR if you think i...,"[setup.py, conftest.py, astropy/logger.py, ast...",[f3938f6642f4286182c4254164d30ea344da1f57586d6...
3,astropy/astropy,6500928dc0e57be8f06d1162eacc3ba5e2eff692,[astropy/coordinates/builtin_frames/__init__.p...,A direct approach to ITRS to Observed transfor...,"cc @StuartLittlefair, @adrn, @eteq, @eerovaher...","[setup.py, conftest.py, astropy/logger.py, ast...",[f3938f6642f4286182c4254164d30ea344da1f57586d6...
4,astropy/astropy,19cc80471739bcb67b7e8099246b391c355023ee,[astropy/io/ascii/html.py],ASCII table output to HTML does not support su...,Welcome to Astropy 👋 and thank you for your fi...,"[setup.py, conftest.py, astropy/logger.py, ast...",[f3938f6642f4286182c4254164d30ea344da1f57586d6...
...,...,...,...,...,...,...,...
495,sympy/sympy,e8c22f6eac7314be8d92590bfff92ced79ee03e2,[sympy/physics/units/unitsystem.py],collect_factor_and_dimension does not detect e...,,"[setup.py, setupegg.py, conftest.py, isympy.py...",[bf6340d7ca6c6164d67756bfeb625659e294c380bdbd1...
496,sympy/sympy,809c53c077485ca48a206cee78340389cb83b7f1,[sympy/combinatorics/homomorphisms.py],`_check_homomorphism` is broken on Permutation...,,"[setup.py, setupegg.py, conftest.py, isympy.py...",[e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f6...
497,sympy/sympy,193e3825645d93c73e31cdceb6d742cc6919624d,[sympy/polys/rings.py],`PolyElement.as_expr()` not accepting symbols\...,,"[setup.py, setupegg.py, conftest.py, isympy.py...",[e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f6...
498,sympy/sympy,b1cb676cf92dd1a48365b731979833375b188bf2,[sympy/core/numbers.py],"Rational calc value error\npython 3.11, sympy ...",This should probably raise an error. The expec...,"[setup.py, setupegg.py, conftest.py, isympy.py...",[e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f6...


In [64]:
swebench_df.to_csv(f'/content/drive/My Drive/NLP/Project/data_new/swebench_updated.csv', index=False)

In [63]:
for ind, row in swebench_df.iterrows():
    if row['modified_files'][0] not in row['files_in_repo']:
        print(row['files_in_repo'])
        print(row['modified_files'])
    if len(row['hashes']) != len(row['files_in_repo']):
        print("hello")


Plan
1. Identify number of unique files based on hashes
2. Get embeddings for each file and save in Vector Database and CSV
    - record file path and commit_id
3. Get summaries for each file and save in json
    - record file path and commit_id
4. Evaluation
    Method 1: Pass all filepaths to query
    Method 2: Pass only top K filepaths to query
    Method 3: Pass top K files summaries to query

In [45]:
!rm -rf cloned_repos/

In [25]:
import openai
from google.colab import userdata

# Set your OpenAI API key
openai.api_key = userdata.get('OPENAI_API_KEY')

# Python code snippet to get embeddings for
python_code = """
def add(a, b):
    return a + b
"""

# Requesting embeddings using `text-embedding-ada-002`
response = openai.embeddings.create(
    model="text-embedding-ada-002",  # Embedding model optimized for text & code
    input=python_code
)

# Extracting the embedding vector
embedding = response.data[0].embedding

# Print the embedding (this is a high-dimensional vector)
print(embedding)


[0.008832651190459728, -0.0048835016787052155, 0.024414148181676865, 0.004876779858022928, -0.01640157401561737, 0.0013536340557038784, -0.011985248886048794, 0.0026602144353091717, 0.008194065652787685, -0.01919790916144848, -0.003989481367170811, -0.009935052134096622, -0.0034332394134253263, -0.016562901437282562, 0.028366658836603165, -0.014250548556447029, 0.02390327863395214, 0.0053473166190087795, 0.008725100196897984, 0.004896945785731077, 0.012166742235422134, 0.01938612386584282, 0.005918683018535376, -0.003646661527454853, -0.027586910873651505, -0.0025190534070134163, 0.012623835355043411, 0.003939066547900438, 0.010708076879382133, -0.00425499863922596, 0.014183329418301582, -0.0009973703417927027, -0.028716199100017548, -0.020838066935539246, -0.019816329702734947, -0.0070580546744167805, -0.04011663794517517, -0.019708776846528053, 0.030732786282896996, -0.0042886086739599705, 0.015729378908872604, 0.012570058926939964, -0.004933916497975588, -0.010062769055366516, -0.01

In [None]:
import openai
import os
import tiktoken  # OpenAI's tokenizer library

# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# Function to calculate token count for a query
def count_tokens(prompt, model="gpt-4o-mini"):
    tokenizer = tiktoken.get_encoding("cl100k_base")  # Tokenizer for OpenAI models
    return len(tokenizer.encode(prompt))

# Example queries
queries = [
    "What is the capital of France?",
    "Explain the process of photosynthesis.",
    "What is artificial intelligence?"
]

# Prepare queries in JSONL format and ensure token count fits within limits
max_tokens = 4096  # Example token limit for a model

with open("queries.jsonl", "w") as f:
    for query in queries:
        tokens = count_tokens(query)
        if tokens <= max_tokens:
            json_entry = {"prompt": query, "model": "gpt-4", "max_tokens": max_tokens - tokens}
            f.write(f"{json.dumps(json_entry)}\n")
        else:
            print(f"Query exceeds token limit: {query}")

# Make LLM Calls

In [5]:
from google.colab import drive
from google.colab import userdata

# Mount Google Drive
drive.mount('/content/drive')

# Set your OpenAI API key
from openai import OpenAI
openai.api_key = userdata.get('OPENAI_API_KEY')
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
swebench_df = pd.read_csv(f'/content/drive/My Drive/NLP/Project/data_new/swebench_updated.csv')

In [7]:
swebench_df

Unnamed: 0,repo,base_commit,modified_files,problem_statement,hints_text,files_in_repo,hashes
0,astropy/astropy,d16bfe05a744909de4b27f5875fe0d4ed41ce607,['astropy/modeling/separable.py'],Modeling's `separability_matrix` does not comp...,,"['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
1,astropy/astropy,298ccb478e6bf092953bca67a3d29dc6c35f6752,['astropy/timeseries/core.py'],TimeSeries: misleading exception when required...,The relevant code that produces the misleading...,"['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
2,astropy/astropy,6ed769d58d89380ebaa1ef52b300691eefda8928,['astropy/table/table.py'],Consider removing auto-transform of structured...,@mhvk - I'm happy to do this PR if you think i...,"['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
3,astropy/astropy,6500928dc0e57be8f06d1162eacc3ba5e2eff692,['astropy/coordinates/builtin_frames/__init__....,A direct approach to ITRS to Observed transfor...,"cc @StuartLittlefair, @adrn, @eteq, @eerovaher...","['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
4,astropy/astropy,19cc80471739bcb67b7e8099246b391c355023ee,['astropy/io/ascii/html.py'],ASCII table output to HTML does not support su...,Welcome to Astropy 👋 and thank you for your fi...,"['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
...,...,...,...,...,...,...,...
495,sympy/sympy,e8c22f6eac7314be8d92590bfff92ced79ee03e2,['sympy/physics/units/unitsystem.py'],collect_factor_and_dimension does not detect e...,,"['setup.py', 'setupegg.py', 'conftest.py', 'is...",['bf6340d7ca6c6164d67756bfeb625659e294c380bdbd...
496,sympy/sympy,809c53c077485ca48a206cee78340389cb83b7f1,['sympy/combinatorics/homomorphisms.py'],`_check_homomorphism` is broken on Permutation...,,"['setup.py', 'setupegg.py', 'conftest.py', 'is...",['e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f...
497,sympy/sympy,193e3825645d93c73e31cdceb6d742cc6919624d,['sympy/polys/rings.py'],`PolyElement.as_expr()` not accepting symbols\...,,"['setup.py', 'setupegg.py', 'conftest.py', 'is...",['e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f...
498,sympy/sympy,b1cb676cf92dd1a48365b731979833375b188bf2,['sympy/core/numbers.py'],"Rational calc value error\npython 3.11, sympy ...",This should probably raise an error. The expec...,"['setup.py', 'setupegg.py', 'conftest.py', 'is...",['e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f...


In [8]:
from langchain.prompts import PromptTemplate
code_location_template = PromptTemplate(
    input_variables=[
        "repo_name",
        "files_list",
        "query",
    ],
    template="""
You are provided with details about the repository named **{repo_name}**.

**List of Files in the Repository:**
{files_list}

**User Query:**
{query}

**Your Task**:
- Identify the exact file (or files) names that needs to be modified to solve the problem described in the user query based on the files list provided.

**Response Format**:
- Return a json string with the below format
- If you are unable to determine the answer, respond with {{"answered": False}}
- If you are able to determine the answer, respond with {{"answered": True, "num_files": <integer>, "files": {{"rank_1": "file path", "rank_2": "file path", ...}}}} and rank the files in the order of most likely to be needed to be modified to solve the **user query**. You can choose upto 10 files. You can choose less than 10 files as well but need to select at least one file if you set "answered":True. The number of files you choose must be set in "num_files" in the response json and the selected files must be ranked and returned as "files": {{"rank_1": "file path", "rank_2": "file path", ...}} in the response json.

**CRITICAL INSTRUCTIONS**:
1. **DO NOT create or assume any file names**. Only return file names or files paths if it matches exactly with one from the above list of **files in the repository**.
2. **Respond ONLY with the json format described**. Avoid explanations, additional text, or clarifications.
3. **If the query cannot be answered with the provided information**, respond strictly with: {{"answered": False}}
"
**IMPORTANT**: Precision is essential; ensure that your answer is concise and follows the format exactly.
""",)


In [9]:
def make_request(prompt):
    response = client.chat.completions.create(
        model = "gpt-4o-mini",
        temperature = 0.1,
        response_format = { "type": "json_object"},
        messages = [
            {
                "role": "system",
                "content": "You are an expert code assistant for analyzing, summarizing, and locating elements within code repositories. Follow the user's instructions exactly, using only the provided information to deliver precise, concise answers. Avoid creating new information or making assumptions beyond the context given."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    )
    return response.choices[0].message.content

In [43]:
import time
def make_throttled_requests(df):
    for idx,row in df.iterrows():
        user_query = f"**Problem Description**:\n {row['problem_statement']}"
        if not pd.isna(row['hints_text']):
            user_query+=f"\n\n **Hint**: (You may use this hint if it is relevant or else ignore it) \n {row['hints_text']}\n"
        prompt = code_location_template.format(repo_name=row['repo'], files_list=row['files_in_repo'], query=user_query)
        res = make_request(prompt)
        with open("/content/drive/My Drive/NLP/Project/data_new/results.jsonl", "a") as f:
            f.write(f"{res}\n")
        with open("/content/drive/My Drive/NLP/Project/data_new/order.csv", "a") as f:
            f.write(f"{row['base_commit']},{idx}\n")
        print(f"- Processed Row {idx} - {row['base_commit']}")
        time.sleep(10)

In [11]:
df1 = swebench_df.iloc[0:100]
df2 = swebench_df.iloc[100:200]
df3 = swebench_df.iloc[200:300]
df4 = swebench_df.iloc[300:400]
df5 = swebench_df.iloc[400:500]

In [17]:
make_throttled_requests(df1)

- Processed Row 0 - d16bfe05a744909de4b27f5875fe0d4ed41ce607
- Processed Row 1 - 298ccb478e6bf092953bca67a3d29dc6c35f6752
- Processed Row 2 - 6ed769d58d89380ebaa1ef52b300691eefda8928
- Processed Row 3 - 6500928dc0e57be8f06d1162eacc3ba5e2eff692
- Processed Row 4 - 19cc80471739bcb67b7e8099246b391c355023ee
- Processed Row 5 - 0df94ff7097961e92fd7812036a24b145bc13ca8
- Processed Row 6 - 5250b2442501e6c671c6b380536f1edb352602d1
- Processed Row 7 - 1a4462d72eb03f30dc83a879b1dd57aac8b2c18b
- Processed Row 8 - a5917978be39d13cd90b517e1de4e7a539ffaa48
- Processed Row 9 - cdb66059a2feb44ee49021874605ba90801f9986
- Processed Row 10 - 7269fa3e33e8d02485a647da91a5a2a60a06af61
- Processed Row 11 - fa4e8d1cd279acf9b24560813c8652494ccd5922
- Processed Row 12 - a3f4ae6cd24d5ecdf49f213d77b3513dd509a06c
- Processed Row 13 - c0a24c1dc957a3b565294213f435fefb2ec99714
- Processed Row 14 - 80c3854a5f4f4a6ab86c03d9db7854767fcd83c1
- Processed Row 15 - b16c7d12ccbc7b2d20364b89fb44285bcbfede54
- Processed Row 16

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on tokens per min (TPM): Limit 60000, Used 39518, Requested 22402. Please try again in 1.92s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [21]:
df1_2 = df1.iloc[74:100]
make_throttled_requests(df1_2)

- Processed Row 74 - 3fb7c12158a2402f0f80824f6778112071235803
- Processed Row 75 - 5a68f024987e6d16c2626a31bf653a2edddea579
- Processed Row 76 - 69331bb851c34f05bc77e9fc24020fe6908b9cd5
- Processed Row 77 - 927c903f3cd25c817c21738328b53991c035b415
- Processed Row 78 - 53d8646f799de7f92ab9defe9dc56c6125448102
- Processed Row 79 - 4c1b401e8250f9f520b3c7dc369554477ce8b15a
- Processed Row 80 - 2e0f04507b17362239ba49830d26fec504d46978
- Processed Row 81 - 29c126bb349526b5f1cd78facbe9f25906f18563
- Processed Row 82 - 335c9c94acf263901fb023404408880245b0c4b4
- Processed Row 83 - 7fa1a93c6c8109010a6ff3f604fda83b604e0e97
- Processed Row 84 - fa5e7e46d875d4143510944f19d79df7b1739bab
- Processed Row 85 - 447980e72ac01da1594dd3373a03ba40b7ee6f80
- Processed Row 86 - 5b884d45ac5b76234eca614d90c83b347294c332
- Processed Row 87 - 537d422942b53bc0a2b6a51968f379c0de07793c
- Processed Row 88 - 18759b2209ff556aed7f20d83cbf23e3d234e41c
- Processed Row 89 - 67f9d076cfc1858b94f9ed6d1a5ce2327dcc8d0d
- Proces

In [23]:
make_throttled_requests(df2)

- Processed Row 100 - 2d67222472f80f251607ae1b720527afceba06ad
- Processed Row 101 - 7af8f4127397279d19ef7c7899e93018274e2f9b
- Processed Row 102 - 156a2138db20abc89933121e4ff2ee2ce56a173a
- Processed Row 103 - f4e93919e4608cfc50849a1f764fd856e0917401
- Processed Row 104 - 6e9c5ee88fc948e05b4a7d9f82a8861ed2b0343d
- Processed Row 105 - 8954f255bbf5f4ee997fd6de62cb50fc9b5dd697
- Processed Row 106 - 36bc47069ce071e80c8129500de3b8664d2058a7
- Processed Row 107 - ece18207cbb64dd89014e279ac636a6c9829828e
- Processed Row 108 - e39e727ded673e74016b5d3658d23cbe20234d11
- Processed Row 109 - 9c92924cd5d164701e2514e1c2d6574126bd7cc2
- Processed Row 110 - 76e0151ea0e0f56dca66cee846a78b89346d2c4c
- Processed Row 111 - 453967477e3ddae704cd739eac2449c0e13d464c
- Processed Row 112 - 84609b3205905097d7d3038d32e6101f012c0619
- Processed Row 113 - 580a4341cb0b4cbfc215a70afc004875a7e815f4
- Processed Row 114 - 71ae1ab0123582cc5bfe0f7d5f4cc19a9412f396
- Processed Row 115 - 2a55431a5678af52f669ffe7dff3dd0bd

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on tokens per min (TPM): Limit 60000, Used 39718, Requested 23327. Please try again in 3.045s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [30]:
df2_2 = df2.iloc[35:100]
make_throttled_requests(df2_2)

- Processed Row 135 - 42e8cf47c7ee2db238bf91197ea398126c546741
- Processed Row 136 - f39634ff229887bf7790c069d0c411b38494ca38
- Processed Row 137 - 619f26d2895d121854b1bed1b535d42b722e2eba
- Processed Row 138 - e4430f22c8e3d29ce5d9d0263fba57121938d06d
- Processed Row 139 - 466920f6d726eee90d5566e0a9948e92b33a122e
- Processed Row 140 - db1fc5cd3c5d36cdb5d0fe4404efd6623dd3e8fb
- Processed Row 141 - 179ee13eb37348cd87169a198aec18fedccc8668
- Processed Row 142 - d01709aae21de9cd2565b9c52f32732ea28a2d98
- Processed Row 143 - bc04941bf811d1ea2c79fb7fc20457ed2c7e3410
- Processed Row 144 - 45814af6197cfd8f4dc72ee43b90ecde305a1d5a
- Processed Row 145 - 2f13c476abe4ba787b6cb71131818341911f43cc
- Processed Row 146 - 6efc35b4fe3009666e56a60af0675d7d532bf4ff
- Processed Row 147 - 30e123ed351317b7527f632b3b7dc4e81e850449
- Processed Row 148 - 5a8e8f80bb82a867eab7e4d9d099f21d0a976d22
- Processed Row 149 - 187118203197801c6cb72dc8b06b714b23b6dd3d
- Processed Row 150 - a708f39ce67af174df90c5b5e50ad1976

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on tokens per min (TPM): Limit 60000, Used 39726, Requested 22939. Please try again in 2.665s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [32]:
df2_3 = df2.iloc[55:100]
# df2_3
make_throttled_requests(df2_3)

- Processed Row 155 - 5e04e84d67da8163f365e9f5fcd169e2630e2873
- Processed Row 156 - 7272e1963ffdf39c1d4fe225d5425a45dd095d11
- Processed Row 157 - 8c3bd0b708b488a1f6e8bd8cc6b96569904605be
- Processed Row 158 - 910ecd1b8df7678f45c3d507dde6bcb1faafa243
- Processed Row 159 - 6a5ef557f80a8eb6a758ebe99c8bb477ca47459e
- Processed Row 160 - d79be3ed39b76d3e34431873eec16f6dd354ab17
- Processed Row 161 - 36fa071d6ebd18a61c4d7f1b5c9d17106134bd44
- Processed Row 162 - 7f33c1e22dbc34a7afae7967783725b10f1f13b1
- Processed Row 163 - 84400d2e9db7c51fee4e9bb04c028f665b8e7624
- Processed Row 164 - 00ea883ef56fb5e092cbe4a6f7ff2e7470886ac4
- Processed Row 165 - 0af9a5fc7d765aa05ea784e2c3237675f3bb4b49
- Processed Row 166 - b64db05b9cedd96905d637a2d824cbbf428e40e7
- Processed Row 167 - 4e8121e8e42a24acc3565851c9ef50ca8322b15c
- Processed Row 168 - 4884a87e022056eda10534c13d74e49b8cdda632
- Processed Row 169 - 004b4620f6f4ad87261e149898940f2dcd5757ef
- Processed Row 170 - d89f976bddb49fb168334960acc8979c3

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on tokens per min (TPM): Limit 60000, Used 39650, Requested 24313. Please try again in 3.963s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [33]:
df2_4 = df2.iloc[78:100]
# df2_4
make_throttled_requests(df2_4)

- Processed Row 178 - a7e7043c8746933dafce652507d3b821801cdc7d
- Processed Row 179 - 9a6e2df3a8f01ea761529bec48e5a8dc0ea9575b
- Processed Row 180 - cb383753c0e0eb52306e1024d32a782549c27e61
- Processed Row 181 - 96e7ff5e9ff6362d9a886545869ce4496ca4b0fb
- Processed Row 182 - 361bb8f786f112ee275be136795c0b1ecefff928
- Processed Row 183 - 0ab58c120939093fea90822f376e1866fc714d1f
- Processed Row 184 - 30613d6a748fce18919ff8b0da166d9fda2ed9bc
- Processed Row 185 - 0ab58c120939093fea90822f376e1866fc714d1f
- Processed Row 186 - 973fa566521037ac140dcece73fceae50ee522f1
- Processed Row 187 - 652c68ffeebd510a6f59e1b56b3e007d07683ad8
- Processed Row 188 - e972620ada4f9ed7bc57f28e133e85c85b0a7b20
- Processed Row 189 - beb7ddbcee03270e833b2f74927ccfc8027aa693
- Processed Row 190 - 71e7c8e73712419626f1c2b6ec036e8559a2d667
- Processed Row 191 - 770d3e6a4ce8e0a91a9e27156036c1985e74d4a3
- Processed Row 192 - e0442a628eb480eac6a7888aed5a86f83499e299
- Processed Row 193 - d90e34c61b27fba2527834806639eebbc

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [34]:
df2_5 = df2.iloc[95:100]
# df2_5
make_throttled_requests(df2_5)

- Processed Row 195 - fbacaa58ffc5a62456ee68b90efa13957f761ce4
- Processed Row 196 - 59ab3fd0e9e606d7f0f7ca26609c06ee679ece97


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on tokens per min (TPM): Limit 60000, Used 52120, Requested 24066. Please try again in 16.186s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [37]:
df2_6 = df2.iloc[97:100]
# df2_6
make_throttled_requests(df2_6)

- Processed Row 197 - 6991880109e35c879b71b7d9d9c154baeec12b89
- Processed Row 198 - 9ffd4eae2ce7a7100c98f681e2b6ab818df384a4


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on tokens per min (TPM): Limit 60000, Used 48501, Requested 24059. Please try again in 12.56s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [40]:
df2_7 = df2.iloc[99:100]
# df2_7
make_throttled_requests(df2_7)

- Processed Row 199 - 884b4c27f506b3c29d58509fc83a35c30ea10d94


In [42]:
make_throttled_requests(df3)

- Processed Row 200 - 0b31e024873681e187b574fe1c4afe5e48aeeecf
- Processed Row 201 - 694cf458f16b8d340a3195244196980b2dec34fd


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on tokens per min (TPM): Limit 60000, Used 48763, Requested 24078. Please try again in 12.841s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [12]:
df3_1 = df3.iloc[48:100]
# df3_1
make_throttled_requests(df3_1)

- Processed Row 248 - 953f29f700a60fc09b08b2c2270c12c447490c6a
- Processed Row 249 - f8c43aca467b7b0c4bb0a7fa41362f90b610b8df
- Processed Row 250 - 4a72da71001f154ea60906a2f74898d32b7322a7
- Processed Row 251 - f8fab6f90233c7114d642dfe01a4e6d4cb14ee7d
- Processed Row 252 - 84322a29ce9b0940335f8ab3d60e55192bef1e50
- Processed Row 253 - a3e2897bfaf9eaac1d6649da535c4e721c89fa69
- Processed Row 254 - d65c9ca20ddf81ef91199e6d819f9d3506ef477c
- Processed Row 255 - b7ce415c15eb39b026a097a2865da73fbcf15c9c
- Processed Row 256 - 6786f437df54ca7780a047203cbcfaa1db8dc542
- Processed Row 257 - a0d2e399729d36499a1924e5ca5bc067c8396810
- Processed Row 258 - 64619e53e9d0ed417daba287ac0d3a06943a54d5
- Processed Row 259 - f0632c0fc7339f68e992ed63ae4cfac76cd41aad
- Processed Row 260 - a2a1b0a11b993fe5f8fab64b6161e99243a6393c
- Processed Row 261 - c6c7ec1978c22ae2c704555a873d0ec6e1e2eaa8
- Processed Row 262 - a7b7260bf06c20d408215d95ce20a1a01c12e5b1
- Processed Row 263 - 3eadeacc06c9f2ddcdac6ae39819faa9f

In [13]:
make_throttled_requests(df4)

- Processed Row 300 - 118f4d996e7711c9aced916e6049af9f28d5ec66
- Processed Row 301 - 69c7e01e5167a3137c285cb50d1978252bb8bcbf
- Processed Row 302 - ef6e6a7b86f8479b9a1fecf15ad5b88a2326b31e
- Processed Row 303 - 8cc34cb412ba89ebca12fc84f76a9e452628f1bc
- Processed Row 304 - 19b088636eb7d3f65ab7a1046ac672e0689371d8
- Processed Row 305 - a64cf2d5476e7bbda099b34c40b7be1880dbd39a
- Processed Row 306 - e05fddea852d08fc0845f954b79deb9e9f9ff883
- Processed Row 307 - a41edc7bf5302f2ea327943c0c48c532b12009bc
- Processed Row 308 - d3b6aa6d8b997df115a53c001d00222a0f92f63a
- Processed Row 309 - 51ef2a66c4e0896eab7d2b03e3dfb3963e338e3c
- Processed Row 310 - 37522e991a32ee3c0ad1a5ff8afe8e3eb1885550
- Processed Row 311 - 851dadeb0338403e5021c3fbe80cbc9127ee672d
- Processed Row 312 - 6bb2b855498b5c68d7cca8cceb710365d58e6048
- Processed Row 313 - cc183652bf6e1273e985e1c4b3cba79c896c1193
- Processed Row 314 - 7cc6cc991e586a6158bb656b8001234ccda25407
- Processed Row 315 - c4e40d991c28be51de9ac560ce895ac7f

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [29]:
df4_1 = df4.iloc[98:100]
# df4_1
make_throttled_requests(df4_1)

- Processed Row 398 - 5e6da19f0e44a0ae83944fb6ce18f18f781e1a6e
- Processed Row 399 - e188d56ed1248dead58f3f8018c0e9a3f99193f7


In [44]:
make_throttled_requests(df5)

- Processed Row 400 - 795747bdb6b8fb7d717d5bbfc2c3316869e66a73
- Processed Row 401 - b428cd2404675475a5c3dc2a2b0790ba57676202
- Processed Row 402 - 1e2ccd8f0eca0870cf6f8fce6934e2da8eba9b72
- Processed Row 403 - 68aa4fb29e7dfe521749e1e14f750d7afabb3481
- Processed Row 404 - 3ea1ec84cc610f7a9f4f6b354e264565254923ff
- Processed Row 405 - dd1615c59dc6fff633e27dbb3861f2d27e1fb976
- Processed Row 406 - 57ed10c68057c96491acbd3e62254ccfaf9e3861
- Processed Row 407 - 07983a5a8704ad91ae855218ecbda1c8598200ca
- Processed Row 408 - b19bce971e82f2497d67fdacdeca8db08ae0ba56
- Processed Row 409 - 21698c14461d27933864d73e6fba568a154e83b3
- Processed Row 410 - 4b452338f914d4f6b54704222d70ae8a746e3db5
- Processed Row 411 - 82ef497a8c88f0f6e50d84520e7276bfbf65025d
- Processed Row 412 - 876fa81e0a038cda466925b85ccf6c5452e0f685
- Processed Row 413 - 567ff22716ac258b9edd2c1711d766b440ac0b11
- Processed Row 414 - 06107f838c28ab6ca6bfc2cc208e15997fcb2146
- Processed Row 415 - 8ec06e9a1bd862cd713b9db748e039ccc

In [15]:
for idx,row in swebench_df.iterrows():
    user_query = f"**Problem Description**:\n {row['problem_statement']}"
    if not pd.isna(row['hints_text']):
        user_query+=f"\n\n **Hint**: (You may use this hint if it is relevant or else ignore it) \n {row['hints_text']}\n"
    prompt = code_location_template.format(repo_name=row['repo'], files_list=row['files_in_repo'], query=user_query)

    request_json = {
        "custom_id": f"{row['base_commit']}_{idx}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "temperature": 0.1,
            "response_format": {
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert code assistant for analyzing, summarizing, and locating elements within code repositories. Follow the user's instructions exactly, using only the provided information to deliver precise, concise answers. Avoid creating new information or making assumptions beyond the context given."
                },
                {
                    "role": "user",
                    "content": prompt
                }]
        }
    }
    prompts.append(prompt)
    sizes.append(num_tokens_from_string(prompt))

In [16]:
print(sizes)

[10657, 12320, 11092, 14969, 14573, 12125, 11722, 11058, 11095, 12035, 11577, 12020, 11703, 11367, 11727, 12400, 7958, 8165, 8169, 8353, 8859, 9143, 21216, 22272, 21303, 22499, 21269, 21705, 22258, 25410, 21585, 21491, 21430, 21646, 22265, 21713, 21673, 21513, 21633, 21599, 21580, 21579, 23521, 21587, 21407, 22140, 21838, 21941, 21477, 21783, 21840, 21473, 22355, 22879, 21787, 21576, 22183, 21960, 22493, 21853, 21655, 22559, 21667, 22219, 21861, 21824, 21715, 22234, 22889, 21959, 21640, 22630, 21981, 22062, 21864, 22482, 21888, 23078, 22009, 21794, 22137, 22413, 22024, 21924, 23134, 22268, 21945, 21993, 25227, 22192, 22123, 22718, 22597, 22105, 22907, 23805, 22428, 22589, 22883, 22377, 22130, 22357, 22683, 22596, 22229, 22664, 23907, 22393, 22825, 22600, 22928, 22435, 23612, 22529, 22642, 24191, 22734, 22623, 22343, 22340, 22379, 22996, 22417, 22626, 22500, 22618, 22578, 22422, 23102, 22339, 22833, 22776, 22490, 22707, 23634, 22741, 22827, 22610, 23787, 22839, 24637, 23093, 22322, 2237

In [17]:
from openai import OpenAI
openai.api_key = userdata.get('OPENAI_API_KEY')
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

In [18]:
def make_request(queries):
    print(queries)
    response = client.chat.completions.create(model="gpt-4o-mini", messages=[{queries)
    results = [""] * len(queries)
    for choice in response.choices:
        results[choice.index] = choice.text
    return results

In [19]:
responses = []
size = 0
queries = []
for i in range(len(prompts)):
    if size+sizes[i] < 59000:
        queries.append(prompts[i])
        size+=sizes[i]
    else:
        res = make_request(queries)
        print(res)
        break
        responses.extend(res)
        size = size[i]
        queries = []
        queries.append(prompts[i])



TypeError: Missing required arguments; Expected either ('messages' and 'model') or ('messages', 'model' and 'stream') arguments to be given

In [28]:
import json
with open('/content/drive/My Drive/NLP/Project/data_new/prompts.jsonl', 'w') as file:
    for obj in prompts:
        file.write(json.dumps(obj) + '\n')

In [12]:
batch_file = client.files.create(
  file=open('/content/drive/My Drive/NLP/Project/data_new/prompts.jsonl', "rb"),
  purpose="batch"
)

In [30]:
print(batch_file)

FileObject(id='file-60PMAGXVGH6yfav2toNnYkRM', bytes=31831893, created_at=1732094183, filename='prompts.jsonl', object='file', purpose='batch', status='processed', status_details=None)


In [31]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

In [32]:
batch_job = client.batches.retrieve(batch_job.id)

In [33]:
print(batch_job)

Batch(id='batch_673da8f7192081909f9a55de329554f2', completion_window='24h', created_at=1732094199, endpoint='/v1/chat/completions', input_file_id='file-60PMAGXVGH6yfav2toNnYkRM', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-cdF8QjvdJ0wjbNYnF9ax4STd. Limit: 200,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1732180599, failed_at=1732094200, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [19]:
len(swebench_df["base_commit"])

500

In [20]:
len(set(swebench_df["base_commit"]))


499

In [23]:
duplicates = swebench_df['base_commit'].value_counts()
duplicates = duplicates[duplicates > 1]

In [24]:
duplicates

Unnamed: 0_level_0,count
base_commit,Unnamed: 1_level_1
0ab58c120939093fea90822f376e1866fc714d1f,2


In [25]:
swebench_df[swebench_df['base_commit']=='0ab58c120939093fea90822f376e1866fc714d1f']

Unnamed: 0,repo,base_commit,modified_files,problem_statement,hints_text,files_in_repo,hashes
183,django/django,0ab58c120939093fea90822f376e1866fc714d1f,['django/db/migrations/operations/models.py'],Optimize multiple AlterFooTogether operations ...,,"['setup.py', 'scripts/manage_translations.py',...",['2c58b58ac3cd9d644bb89026ab122777ca668d04f707...
185,django/django,0ab58c120939093fea90822f376e1866fc714d1f,['django/db/backends/sqlite3/schema.py'],Adding nullable OneToOneField crashes on SQLit...,Thanks for the report! Regression in 2f73e5406...,"['setup.py', 'scripts/manage_translations.py',...",['2c58b58ac3cd9d644bb89026ab122777ca668d04f707...


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="meta-llama/Llama-3.1-8B")

In [23]:
num_stories = 10
prompts = ["Once upon a time,"] * num_stories

# batched example, with 10 story completions per request
response = client.completions.create(
    model="gpt-4",
    prompt=prompts,
    max_tokens=20,
)

# match completions to prompts by index
stories = [""] * len(prompts)
for choice in response.choices:
    stories[choice.index] = prompts[choice.index] + choice.text

# print stories
for story in stories:
    print(story)

NotFoundError: Error code: 404 - {'error': {'message': 'This is a chat model and not supported in the v1/completions endpoint. Did you mean to use v1/chat/completions?', 'type': 'invalid_request_error', 'param': 'model', 'code': None}}

# Vectorization

In [19]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel

# Check if a GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
device

device(type='cuda')

In [7]:
# Load the UniXcoder model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
model = AutoModel.from_pretrained("microsoft/unixcoder-base").to(device)  # Move the model to GPU if available

In [20]:
def get_embeddings(text: str):
    # Tokenize the input text and convert it to input IDs
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Move the tokenized inputs to GPU if available
    inputs = {key: value.to(device) for key, value in inputs.items()}
    # Get the embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Averaging the token embeddings
    return embeddings

def aggregate_embeddings(embeddings):
    # Aggregate embeddings from all chunks by averaging
    embeddings_tensor = torch.cat(embeddings, dim=0)
    aggregated_embedding = embeddings_tensor.mean(dim=0, keepdim=True)
    return aggregated_embedding

def get_document_embedding(document: str, chunk_size=512):
    # Split the document into smaller chunks if it exceeds the chunk size (token limit)
    chunks = [document[i:i + chunk_size] for i in range(0, len(document), chunk_size)]
    embeddings = []
    for chunk in chunks:
        embeddings.append(get_embeddings(chunk))
    final_embedding = aggregate_embeddings(embeddings)
    return final_embedding.squeeze().cpu().numpy().tolist()

In [21]:
# Sample large document (You can replace this with your actual document)
large_document = "This is a sample document. " * 5000 # Repeating to create a larger text

# Process the document
final_embedding = get_document_embedding(large_document)

# Print out the final aggregated embedding
print(final_embedding)

[1.8817613124847412, -0.9991443753242493, 0.7410063743591309, 1.627164602279663, -2.240138292312622, 1.2892656326293945, -1.2957674264907837, 1.2541987895965576, 0.7910654544830322, 0.7937154769897461, 1.9379205703735352, 0.11459693312644958, -2.2261688709259033, -1.0724483728408813, -2.34417986869812, -1.0655608177185059, -0.46116986870765686, 3.001136302947998, 0.7915621995925903, 0.1152200847864151, 0.5635496377944946, 0.06345896422863007, -0.047550350427627563, 0.7362407445907593, -2.3504388332366943, -1.3507252931594849, -0.050697848200798035, 0.8608629703521729, 0.23575659096240997, 1.8951431512832642, -2.215881586074829, 0.5669121146202087, -2.136099100112915, -0.34091079235076904, 0.6976006627082825, 5.664979934692383, 0.03703898563981056, -0.31486737728118896, -0.5007683634757996, -1.6610170602798462, 2.560377359390259, 2.8667995929718018, 1.3891469240188599, -1.7014403343200684, 3.9662656784057617, -1.6242694854736328, -0.28659728169441223, 2.0216634273529053, 1.9888033866882

In [23]:
final_embedding

[1.8817613124847412,
 -0.9991443753242493,
 0.7410063743591309,
 1.627164602279663,
 -2.240138292312622,
 1.2892656326293945,
 -1.2957674264907837,
 1.2541987895965576,
 0.7910654544830322,
 0.7937154769897461,
 1.9379205703735352,
 0.11459693312644958,
 -2.2261688709259033,
 -1.0724483728408813,
 -2.34417986869812,
 -1.0655608177185059,
 -0.46116986870765686,
 3.001136302947998,
 0.7915621995925903,
 0.1152200847864151,
 0.5635496377944946,
 0.06345896422863007,
 -0.047550350427627563,
 0.7362407445907593,
 -2.3504388332366943,
 -1.3507252931594849,
 -0.050697848200798035,
 0.8608629703521729,
 0.23575659096240997,
 1.8951431512832642,
 -2.215881586074829,
 0.5669121146202087,
 -2.136099100112915,
 -0.34091079235076904,
 0.6976006627082825,
 5.664979934692383,
 0.03703898563981056,
 -0.31486737728118896,
 -0.5007683634757996,
 -1.6610170602798462,
 2.560377359390259,
 2.8667995929718018,
 1.3891469240188599,
 -1.7014403343200684,
 3.9662656784057617,
 -1.6242694854736328,
 -0.28659728

In [29]:
repo_csvs = [
    # "astropy_astropy_files_infos.csv",
    # "django_django_files_infos.csv",
    # "matplotlib_matplotlib_files_infos.csv",
    # "mwaskom_seaborn_files_infos.csv",
    # "pallets_flask_files_infos.csv",
    # "psf_requests_files_infos.csv",
    # "pydata_xarray_files_infos.csv",
    # "pylint-dev_pylint_files_infos.csv",
    # "pytest-dev_pytest_files_infos.csv",
    "scikit-learn_scikit-learn_files_infos.csv",
    "sphinx-doc_sphinx_files_infos.csv",
    "sympy_sympy_files_infos.csv"
]

In [None]:
from tqdm import tqdm
tqdm.pandas()
for file_name in repo_csvs:
    print(f'Processing {file_name}')
    df = pd.read_csv(f'/content/drive/My Drive/NLP/Project/data_new/{file_name}')
    print(f' - Generating embeddings for {len(df)} documents')
    df['embeddings'] = df['code_snippet'].progress_apply(lambda x: get_document_embedding(x))
    csv_path = f'/content/drive/My Drive/NLP/Project/data_new/embeddings/{file_name}'
    df.to_csv(csv_path, index=False)
    print(f' - Embeddings saved to file {csv_path}')

Processing django_django_files_infos.csv
 - Generating embeddings for 14717 documents


100%|██████████| 14717/14717 [1:57:08<00:00,  2.09it/s]


 - Embeddings saved to file /content/drive/My Drive/NLP/Project/data_new/embeddings/django_django_files_infos.csv
Processing matplotlib_matplotlib_files_infos.csv
 - Generating embeddings for 5560 documents


100%|██████████| 5560/5560 [50:14<00:00,  1.84it/s]


 - Embeddings saved to file /content/drive/My Drive/NLP/Project/data_new/embeddings/matplotlib_matplotlib_files_infos.csv
Processing mwaskom_seaborn_files_infos.csv
 - Generating embeddings for 167 documents


100%|██████████| 167/167 [01:09<00:00,  2.40it/s]


 - Embeddings saved to file /content/drive/My Drive/NLP/Project/data_new/embeddings/mwaskom_seaborn_files_infos.csv
Processing pallets_flask_files_infos.csv
 - Generating embeddings for 77 documents


100%|██████████| 77/77 [00:13<00:00,  5.69it/s]


 - Embeddings saved to file /content/drive/My Drive/NLP/Project/data_new/embeddings/pallets_flask_files_infos.csv
Processing pydata_xarray_files_infos.csv
 - Generating embeddings for 1625 documents


100%|██████████| 1625/1625 [24:01<00:00,  1.13it/s]


 - Embeddings saved to file /content/drive/My Drive/NLP/Project/data_new/embeddings/pydata_xarray_files_infos.csv
Processing pylint-dev_pylint_files_infos.csv
 - Generating embeddings for 4241 documents


100%|██████████| 4241/4241 [07:25<00:00,  9.51it/s]


 - Embeddings saved to file /content/drive/My Drive/NLP/Project/data_new/embeddings/pylint-dev_pylint_files_infos.csv
Processing pytest-dev_pytest_files_infos.csv
 - Generating embeddings for 1497 documents


100%|██████████| 1497/1497 [10:48<00:00,  2.31it/s]


 - Embeddings saved to file /content/drive/My Drive/NLP/Project/data_new/embeddings/pytest-dev_pytest_files_infos.csv
Processing scikit-learn_scikit-learn_files_infos.csv
 - Generating embeddings for 5583 documents


 49%|████▉     | 2743/5583 [21:30<33:51,  1.40it/s]

# Evaluate LLM results

In [45]:
results_file_path = f'/content/drive/My Drive/NLP/Project/data_new/results.jsonl'
swebench_file_path = f'/content/drive/My Drive/NLP/Project/data_new/swebench_updated.csv'

In [46]:
import json
import pandas as pd
import numpy as np
# Initialize an empty list to store the JSON objects
results = []
# Open the jsonl file for reading
with open(results_file_path, 'r') as file:
    for line in file:
        # Parse each line as a JSON object and append it to the list
        results.append(json.loads(line.strip()))
# Now json_objects contains all the parsed JSON objects from the file
print(len(results))

500


In [47]:
print(results[0])

{'answered': True, 'num_files': 2, 'files': {'rank_1': 'astropy/modeling/separable.py', 'rank_2': 'astropy/modeling/__init__.py'}}


In [48]:
top_1_results = []
top_2_results = []
top_3_results = []
predictions = []
for res in results:
    files = []
    if res['answered']:
        for i in range(1,res['num_files']+1):
            files.append(res['files'][f'rank_{i}'])
    predictions.append(files)
    top_1_results.append(files[:1])
    top_2_results.append(files[:2])
    top_3_results.append(files[:3])

In [49]:
print(len(results), len(top_1_results), len(top_2_results), len(top_3_results))

500 500 500 500


In [50]:
swebench_df = pd.read_csv(swebench_file_path)

In [51]:
swebench_df

Unnamed: 0,repo,base_commit,modified_files,problem_statement,hints_text,files_in_repo,hashes
0,astropy/astropy,d16bfe05a744909de4b27f5875fe0d4ed41ce607,['astropy/modeling/separable.py'],Modeling's `separability_matrix` does not comp...,,"['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
1,astropy/astropy,298ccb478e6bf092953bca67a3d29dc6c35f6752,['astropy/timeseries/core.py'],TimeSeries: misleading exception when required...,The relevant code that produces the misleading...,"['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
2,astropy/astropy,6ed769d58d89380ebaa1ef52b300691eefda8928,['astropy/table/table.py'],Consider removing auto-transform of structured...,@mhvk - I'm happy to do this PR if you think i...,"['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
3,astropy/astropy,6500928dc0e57be8f06d1162eacc3ba5e2eff692,['astropy/coordinates/builtin_frames/__init__....,A direct approach to ITRS to Observed transfor...,"cc @StuartLittlefair, @adrn, @eteq, @eerovaher...","['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
4,astropy/astropy,19cc80471739bcb67b7e8099246b391c355023ee,['astropy/io/ascii/html.py'],ASCII table output to HTML does not support su...,Welcome to Astropy 👋 and thank you for your fi...,"['setup.py', 'conftest.py', 'astropy/logger.py...",['f3938f6642f4286182c4254164d30ea344da1f57586d...
...,...,...,...,...,...,...,...
495,sympy/sympy,e8c22f6eac7314be8d92590bfff92ced79ee03e2,['sympy/physics/units/unitsystem.py'],collect_factor_and_dimension does not detect e...,,"['setup.py', 'setupegg.py', 'conftest.py', 'is...",['bf6340d7ca6c6164d67756bfeb625659e294c380bdbd...
496,sympy/sympy,809c53c077485ca48a206cee78340389cb83b7f1,['sympy/combinatorics/homomorphisms.py'],`_check_homomorphism` is broken on Permutation...,,"['setup.py', 'setupegg.py', 'conftest.py', 'is...",['e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f...
497,sympy/sympy,193e3825645d93c73e31cdceb6d742cc6919624d,['sympy/polys/rings.py'],`PolyElement.as_expr()` not accepting symbols\...,,"['setup.py', 'setupegg.py', 'conftest.py', 'is...",['e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f...
498,sympy/sympy,b1cb676cf92dd1a48365b731979833375b188bf2,['sympy/core/numbers.py'],"Rational calc value error\npython 3.11, sympy ...",This should probably raise an error. The expec...,"['setup.py', 'setupegg.py', 'conftest.py', 'is...",['e18ef7e45477477cfdc9e4d1642c9d9b3e002a90371f...


In [52]:
ground_truth = swebench_df['modified_files']

In [None]:
def get_metrics_for_instance(ground_truth, pred, k):
    count = 0
    for i in ground_truth:
        if i in pred[:k]:
            count+=1
    precision = count/k
    recall = count/len(ground_truth) if len(ground_truth)<=k else count/k
    return (recall,precision)

In [None]:
def compute_metrics(ground_truth, predictions, k_values):
    recall_scores = {k: [] for i in k_values}
    precision_scores = {k: [] for i in k_values}
    for gt,pred in zip(ground_truth, predictions):
        for k in k_values:
            recall,precision = get_metrics_for_instance(ground_truth, prediction, k)
            recall_scores[k].append(recall)
            precision_scores[k].append(precision)
    results = {}
    for k in recall_scores:
        results[f"recall@{k}"] = mean(recall_scores[k])
        results[f"precision@{k}"] = mean(precision_scores[k])

In [None]:
print("Swebench LLM Basic Query Results : ")
print(compute_metrics(ground_truth,predictions,[1,2,3]))