## Code search

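We index a Python code repository and show how it can be searched. The example was originally written for our own [openai-python code repository](https://github.com/openai/openai-python); in this notebook `ROOT_DIR` and `CODE_REPO` point at a local LearnMPM checkout instead. We implement a simple version of file parsing and function extraction for Python files.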

In [16]:
from openai.embeddings_utils import get_embedding
import openai
import os

# Authenticate with OpenAI API
# The key is read from the environment variable named "OPENAI";
# os.environ.get returns None when it is unset, so a missing key is not
# reported at this point.
openai.api_key = os.environ.get('OPENAI')

# Root directory where the Python repo is located
# NOTE(review): absolute, machine-specific path -- edit before running elsewhere.
# A later cell combines it with CODE_REPO to form the directory that is scanned.
ROOT_DIR = "/Users/krishna/research/mpm/LearnMPM/"
# Name of the directory under ROOT_DIR whose *.py files are indexed.
CODE_REPO = "LearnMPM"


In [17]:
import os
from glob import glob
import pandas as pd
import numpy as np

def get_function_name(code):
    """
    Return the identifier between the leading "def " and the first "(".

    Arguments:
        code: str
            source text that must start with "def ".

    Raises:
        AssertionError: if ``code`` does not start with "def ".
        ValueError: if ``code`` contains no "(".
    """
    prefix = "def "
    assert code.startswith(prefix)
    name_end = code.index("(")
    return code[len(prefix):name_end]

def get_until_no_space(all_lines, i) -> str:
    """
    Collect the source of the definition that starts at ``all_lines[i]``.

    Starting from line ``i``, following lines are kept while they are empty or
    begin with a space, a tab or ")" (the latter covers the closing line of a
    multi-line signature). Scanning stops at the first line that does not
    match, or at the end of the file. Blank lines directly after the body are
    therefore part of the returned text.

    Arguments:
        all_lines: list of str
            the file content split into lines (without line terminators).
        i: int
            index of the line on which the definition starts.

    Returns:
        str: the collected lines joined with "\\n".
    """
    ret = [all_lines[i]]
    # Walk to the real end of the file instead of a fixed 10000-line window:
    # the old bound cut off very long bodies and kept looping past EOF.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if line and line[0] not in (" ", "\t", ")"):
            break
        ret.append(line)
    return "\n".join(ret)

def get_functions(filepath):
    """
    Yield every function defined at column 0 of a Python file.

    Only lines that start with "def " are recognised, so indented definitions
    (methods, nested functions) and ``async def`` are not reported on their
    own; nested definitions appear inside the code of their parent.

    Arguments:
        filepath: str
            path of the Python file to scan.

    Yields:
        dict with the keys "code" (source text of the function),
        "function_name" and "filepath" (the path that was passed in).
    """
    # Read inside a context manager so the file handle is closed right away
    # instead of staying open until the garbage collector reclaims it.
    with open(filepath) as source_file:
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, l in enumerate(all_lines):
        if l.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}


# get user root directory (expands a leading "~" if present)
root_dir = os.path.expanduser(ROOT_DIR)

# path to code repository directory
# os.path.join adds a separator only when one is missing, so this also works
# when ROOT_DIR is written without a trailing "/".
code_root = os.path.join(root_dir, CODE_REPO)

# every *.py file in code_root and all of its sub-directories
code_files = [
    py_file
    for dirpath, _dirnames, _filenames in os.walk(code_root)
    for py_file in glob(os.path.join(dirpath, '*.py'))
]
print("Total number of py files:", len(code_files))

if len(code_files) == 0:
    print("Double check that you have downloaded the repo and set the code_root variable correctly.")

# one dict per extracted function, across all files
all_funcs = []
for code_file in code_files:
    all_funcs.extend(get_functions(code_file))

print("Total number of functions extracted:", len(all_funcs))



Total number of py files: 12
Total number of functions extracted: 21


### Set up the GPT-2 tokenizer and embedding model

In [18]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Initialize the tokenizer and model from the pretrained "gpt2" checkpoint.
# Both are module-level globals: `tokenizer` is used by gpt2_embedding and
# count_tokens, `model` by gpt2_embedding.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

def gpt2_embedding(text):
    """
    Embed ``text`` with the module-level GPT-2 ``tokenizer`` and ``model``.

    The text is encoded to token ids, run through the model, and the first
    model output is indexed with ``[0, -1, :]``, i.e. the vector at the last
    position of the first (only) sequence.
    NOTE(review): per the transformers documentation the first output of
    GPT2Model is the last hidden state -- confirm for the installed version.

    Arguments:
        text: str
            the text to embed.

    Returns:
        torch.Tensor: the selected vector, not attached to an autograd graph.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # Inference only: without no_grad every returned embedding would keep the
    # whole forward graph alive, which adds up when embedding many functions.
    with torch.no_grad():
        hidden_states = model(input_ids)[0]

    # Get the vector embedding
    return hidden_states[0, -1, :]


### Create embeddings and write them to a CSV file

In [19]:
def count_tokens(text: str) -> int:
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``."""
    pieces = tokenizer.tokenize(text)
    return len(pieces)

In [20]:
# One row per extracted function. The derived columns are evaluated in the
# order they are listed: token count, embedding, then the shortened file path.
df = pd.DataFrame(all_funcs).assign(
    tokens=lambda frame: frame['code'].apply(count_tokens),
    ### OpenAI
    code_embedding=lambda frame: frame['code'].apply(
        lambda source: get_embedding(source, engine='text-embedding-ada-002')
    ),
    # strip the repository root so only the path inside the repo remains
    filepath=lambda frame: frame['filepath'].apply(
        lambda path: path.replace(code_root, "")
    ),
)
# Optional, disabled: include only rows with < 1024 tokens
# df = df[df.tokens<1024]
### GPT2 (disabled alternative to the OpenAI embedding above)
### df['code_embedding'] = df['code'].apply(lambda x: gpt2_embedding(x))
df.to_csv("code_search_openai-python.csv", index=False)
df.head()

Unnamed: 0,code,function_name,filepath,tokens,code_embedding
0,"def nodal_total_force(mesh):\n """"""\n Com...",nodal_total_force,/update.py,129,"[-0.008052323944866657, 0.025560934096574783, ..."
1,"def nodal_acceleration_velocity(mesh, dt):\n ...",nodal_acceleration_velocity,/update.py,249,"[0.008488386869430542, 0.024847134947776794, -..."
2,"def nodal_velocity(mesh):\n """"""Compute noda...",nodal_velocity,/update.py,125,"[-0.0038946340791881084, 0.013509807176887989,..."
3,"def fix_nodal_bc_momentum(mesh):\n """"""Set m...",fix_nodal_bc_momentum,/update.py,133,"[-0.01586133986711502, -0.005389683414250612, ..."
4,"def fix_nodal_bc_force(mesh):\n """"""Set noda...",fix_nodal_bc_force,/update.py,111,"[-0.01658795401453972, 0.004961120896041393, -..."


### Read embeddings from a CSV file

In [21]:
## Only works with OpenAI model
# The CSV stores each embedding as the text "[v1, v2, ...]"; drop the
# surrounding brackets and convert the comma-separated items back to floats.
df = pd.read_csv("code_search_openai-python.csv")
df['code_embedding'] = df['code_embedding'].apply(
    lambda cell: list(map(float, cell[1:-1].split(',')))
)
df.head()

Unnamed: 0,code,function_name,filepath,tokens,code_embedding
0,"def nodal_total_force(mesh):\n """"""\n Com...",nodal_total_force,/update.py,129,"[-0.008052323944866657, 0.025560934096574783, ..."
1,"def nodal_acceleration_velocity(mesh, dt):\n ...",nodal_acceleration_velocity,/update.py,249,"[0.008488386869430542, 0.024847134947776794, -..."
2,"def nodal_velocity(mesh):\n """"""Compute noda...",nodal_velocity,/update.py,125,"[-0.0038946340791881084, 0.013509807176887989,..."
3,"def fix_nodal_bc_momentum(mesh):\n """"""Set m...",fix_nodal_bc_momentum,/update.py,133,"[-0.01586133986711502, -0.005389683414250612, ..."
4,"def fix_nodal_bc_force(mesh):\n """"""Set noda...",fix_nodal_bc_force,/update.py,111,"[-0.01658795401453972, 0.004961120896041393, -..."


### Compute similarity

In [22]:
def compute_similarity(x, y):
    """
    Return the dot product of ``x`` and ``y`` after converting both to NumPy arrays.

    The value is not normalised by the vector lengths, so it equals the cosine
    similarity only when both inputs have unit length.
    """
    lhs = np.array(x)
    rhs = np.array(y)
    return np.dot(lhs, rhs)

def cosine_similarity(x, y):
    """
    Cosine similarity of two 1-D torch tensors.

    Computed as ``dot(x, y) / (norm(x) * norm(y))``; no guard is applied for
    zero-length vectors.
    """
    length_product = torch.norm(x) * torch.norm(y)
    return torch.dot(x, y) / length_product

def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the functions in ``df`` against a query and return the best matches.

    The query is embedded with ``get_embedding`` and compared with every entry
    of ``df.code_embedding`` through ``compute_similarity``. The scores are
    written to ``df['similarities']``, i.e. the frame passed in is modified.

    Arguments:
        df: DataFrame with the columns code_embedding, filepath,
            function_name and code.
        code_query: str
            the text to search for.
        n: int
            number of rows to return.
        pprint: bool
            when true, print a short report for every returned row.
        n_lines: int
            number of source lines shown per function in that report.

    Returns:
        DataFrame: the ``n`` rows with the highest score, best first.
    """
    # OpenAI
    query_embedding = get_embedding(code_query, engine='text-embedding-ada-002')
    # GPT2
    # query_embedding = gpt2_embedding(code_query)

    def score(code_embedding):
        return compute_similarity(code_embedding, query_embedding)

    df['similarities'] = df['code_embedding'].apply(score)
    top_matches = df.sort_values('similarities', ascending=False).head(n)

    if not pprint:
        return top_matches

    separator = '-' * 70
    for _, row in top_matches.iterrows():
        score_text = str(round(row.similarities, 3))
        print(row.filepath + ":" + row.function_name + "  score=" + score_text)
        preview = row.code.split("\n")[:n_lines]
        print("\n".join(preview))
        print(separator)
    return top_matches

In [23]:
# Example natural-language query: prints the 3 best-matching functions and keeps their rows in `res`.
res = search_functions(df, 'How do I map velocity from nodes to material points?', n=3)

/update.py:particle_position_velocity  score=0.757
def particle_position_velocity(mesh, dt):
    """
    Compute particle position and velocity based on nodal velocity. :math:`x_p += \sum_i N_i(x_p) * v_i` and particle position :math:`x_p += v_p * dt`.

    Arguments:
        mesh: mesh
            a mesh object
----------------------------------------------------------------------
/update.py:particle_velocity  score=0.75
def particle_velocity(mesh, dt):
    """
    Compute particle velocity transfer nodal velocity to particle. :math:`v_p += \sum_i N_i(x_p) * {f_{total}}_i/m_i * dt`.

    Arguments:
        mesh: mesh
            a mesh object
----------------------------------------------------------------------
/update.py:nodal_velocity  score=0.75
def nodal_velocity(mesh):
    """Compute nodal velocity as :math:`v = mv / m`.

    Arguments:
        mesh: mesh
            a mesh object
    """
----------------------------------------------------------------------
