In [9]:
import ast
from collections import defaultdict
import copy
import openai
import os
import numpy as np
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
from tqdm import tqdm

openai.api_key = os.getenv('OPENAI_KEY')


In [7]:
def get_block(code, node, code_type, file_path):
    file_path = '/'.join([x for x in file_path.split("/") if x not in [".", ".."]])
    source = ast.get_source_segment(code, node['node'])
    return {
        'code_type': code_type, 
        'source': source,
        'start_line': node['node'].lineno,
        'end_line': node['node'].end_lineno,
        'file_path': file_path,
        'pretext': node['pretext'],
    }

def parse_file(file_path):
    with open(file_path, 'r') as fob:
        code = fob.read()
    parsed_code = ast.parse(code)
    nodes = [{'pretext': '', 'node': node} for node in parsed_code.body]
    codeblocks = []

    while len(nodes) > 0:
        node = nodes.pop(0)
        if isinstance(node['node'], ast.Import) or isinstance(node['node'], ast.ImportFrom):
            codeblocks.append(get_block(code, node, 'import', file_path))
        elif isinstance(node['node'], ast.Assign):
            codeblocks.append(get_block(code, node, 'assign', file_path))
        elif isinstance(node['node'], ast.FunctionDef):
            codeblocks.append(get_block(code, node, 'function', file_path))
        elif isinstance(node['node'], ast.ClassDef):
            nodes += [{'pretext': f"{node['pretext']}#class {node['node'].name} \n", 'node': x} for x in node['node'].body]
        else:
            try:
                print(node['node'], node['node'].name)
            except Exception:
                pass
            codeblocks.append(get_block(code, node, 'misc', file_path))

    # collate imports, assign
    collate_types = ['import', 'assign']
    tempblock = None
    finblocks = []

    for block in codeblocks:
        if block['code_type'] in collate_types:
            if tempblock is None:
                tempblock = {k:v for k,v in block.items()}
            elif tempblock['code_type'] == block['code_type']:
                tempblock['source'] += f"\n{block['source']}"
                tempblock['start_line'] = min(tempblock['start_line'], block['start_line'])
                tempblock['end_line'] = max(tempblock['start_line'], block['end_line'])
            else:
                finblocks.append(tempblock)
                tempblock = {k:v for k,v in block.items()}
        else:
            if tempblock is not None:
                finblocks.append(tempblock)
                tempblock = None
            finblocks.append(block)
            
    #reduce size of blocks
    finfinblocks = []
    for block in finblocks:
        if len(block['source']) < 2000:
            block['blob'] = f"{block['pretext']}{block['source']}"
            block['chars'] = len(block['blob'])
            finfinblocks.append(block)
        else:
            lines = block['source'].split("\n")
            parts = int(len(block['source']) / 1500) + 1
            line_size = int(len(lines)/ parts)
            for i in range(parts):
                temp = copy.deepcopy(block)
                selected_lines = '\n'.join(lines[i * line_size : (i + 1) * line_size])
                temp['blob'] = f"{temp['pretext']}#code part: {i+1}\n{selected_lines}"
                temp['chars'] = len(temp['blob'])
                finfinblocks.append(temp)
                
            
    
    return finfinblocks

def get_files_to_parse(root_path, files_extensions_to_parse=['py'], dirs_to_ignore=['tests']):
    files_to_parse = []

    for root, dirs, files in os.walk("../../openpilot/selfdrive/controls"):
        for name in files:
            if (root.rsplit("/", 1)[-1] in dirs_to_ignore) or (name.rsplit('.')[-1] not in files_extensions_to_parse):
                continue
            temp_path = os.path.join(root, name)
            files_to_parse.append(temp_path)
    return files_to_parse

In [8]:
# Tests 1
res = parse_file("../../openpilot/selfdrive/controls/lib/lateral_planner.py")
pd.DataFrame(res)

Unnamed: 0,code_type,source,start_line,end_line,file_path,pretext,blob,chars
0,import,import numpy as np\nfrom common.realtime impor...,1,10,openpilot/selfdrive/controls/lib/lateral_plann...,,import numpy as np\nfrom common.realtime impor...,478
1,assign,TRAJECTORY_SIZE = 33\nCAMERA_OFFSET = 0.04\nPA...,12,24,openpilot/selfdrive/controls/lib/lateral_plann...,,TRAJECTORY_SIZE = 33\nCAMERA_OFFSET = 0.04\nPA...,161
2,function,"def __init__(self, CP):\n self.DH = DesireH...",28,44,openpilot/selfdrive/controls/lib/lateral_plann...,#class LateralPlanner \n,"#class LateralPlanner \ndef __init__(self, CP)...",659
3,function,"def reset_mpc(self, x0=np.zeros(4)):\n self...",46,48,openpilot/selfdrive/controls/lib/lateral_plann...,#class LateralPlanner \n,"#class LateralPlanner \ndef reset_mpc(self, x0...",111
4,function,"def update(self, sm):\n # clip speed , late...",50,110,openpilot/selfdrive/controls/lib/lateral_plann...,#class LateralPlanner \n,#class LateralPlanner \n#code part: 1\ndef upd...,1629
5,function,"def update(self, sm):\n # clip speed , late...",50,110,openpilot/selfdrive/controls/lib/lateral_plann...,#class LateralPlanner \n,#class LateralPlanner \n#code part: 2\n\n a...,1281
6,function,"def publish(self, sm, pm):\n plan_solution_...",112,133,openpilot/selfdrive/controls/lib/lateral_plann...,#class LateralPlanner \n,"#class LateralPlanner \ndef publish(self, sm, ...",1029


In [9]:
# Tests 2
get_files_to_parse("../../openpilot/selfdrive/controls/")

['../../openpilot/selfdrive/controls/controlsd.py',
 '../../openpilot/selfdrive/controls/plannerd.py',
 '../../openpilot/selfdrive/controls/__init__.py',
 '../../openpilot/selfdrive/controls/radard.py',
 '../../openpilot/selfdrive/controls/lib/latcontrol_torque.py',
 '../../openpilot/selfdrive/controls/lib/longitudinal_planner.py',
 '../../openpilot/selfdrive/controls/lib/vehicle_model.py',
 '../../openpilot/selfdrive/controls/lib/latcontrol_indi.py',
 '../../openpilot/selfdrive/controls/lib/drive_helpers.py',
 '../../openpilot/selfdrive/controls/lib/latcontrol_angle.py',
 '../../openpilot/selfdrive/controls/lib/lateral_planner.py',
 '../../openpilot/selfdrive/controls/lib/radar_helpers.py',
 '../../openpilot/selfdrive/controls/lib/events.py',
 '../../openpilot/selfdrive/controls/lib/__init__.py',
 '../../openpilot/selfdrive/controls/lib/latcontrol_pid.py',
 '../../openpilot/selfdrive/controls/lib/pid.py',
 '../../openpilot/selfdrive/controls/lib/latcontrol.py',
 '../../openpilot/selfd

In [10]:
res = []
for file in get_files_to_parse("../../openpilot/selfdrive/controls/"):
    res += parse_file(file)


In [27]:
embedding_model = "text-embedding-ada-002"
for e, ele in enumerate(tqdm(res)):
    if 'embeddings' not in ele:
        embeddings = openai.embeddings_utils.get_embedding(ele['blob'], engine=embedding_model)
        res[e]['embeddings'] = embeddings
    if 'summary' not in ele:
        oai_res = openai.Completion.create(
            model="text-davinci-003",
            prompt=f"Summarize the code block below:\n\n{ele['blob']}\n\nSummary:",
            max_tokens=1024,
            temperature=0.7
        )
        summary = oai_res.to_dict()['choices'][0].to_dict()['text']
        res[e]['summary'] = summary
    if 'summary_embedding' not in ele:
        se = openai.embeddings_utils.get_embedding(ele['summary'], engine=embedding_model)
        res[e]['summary_embeddings'] = se
        
df = pd.DataFrame(res)

100%|█████████████████████████████████████████████████████████████| 270/270 [00:34<00:00,  7.89it/s]


In [28]:
df

Unnamed: 0,code_type,source,start_line,end_line,file_path,pretext,blob,chars,embeddings,summary,summary_embeddings
0,import,import os\nimport math\nfrom typing import Sup...,2,31,openpilot/selfdrive/controls/controlsd.py,,import os\nimport math\nfrom typing import Sup...,1618,"[-0.015812689438462257, 0.006434303242713213, ...",This code block imports several modules for u...,"[0.0008985632448457181, 0.0019410370150581002,..."
1,assign,SOFT_DISABLE_TIME = 3\nLDW_MIN_SPEED = 31 * CV...,33,58,openpilot/selfdrive/controls/controlsd.py,,SOFT_DISABLE_TIME = 3\nLDW_MIN_SPEED = 31 * CV...,1221,"[-0.01241289172321558, 0.013535411097109318, -...",This code block establishes constants related...,"[0.014841819182038307, 0.015876270830631256, -..."
2,function,"def main(sm=None, pm=None, logcan=None):\n co...",864,866,openpilot/selfdrive/controls/controlsd.py,,"def main(sm=None, pm=None, logcan=None):\n co...",108,"[-0.014029370620846748, 0.004167469684034586, ...",This code block defines a main function that ...,"[-0.018333639949560165, 0.03314927965402603, 0..."
3,misc,"if __name__ == ""__main__"":\n main()",869,870,openpilot/selfdrive/controls/controlsd.py,,"if __name__ == ""__main__"":\n main()",35,"[0.0004368811787571758, 0.00701922457665205, 0...",This code block checks if the current file is...,"[-0.012727517634630203, 0.0177313219755888, -0..."
4,function,"def __init__(self, sm=None, pm=None, can_sock=...",62,210,openpilot/selfdrive/controls/controlsd.py,#class Controls \n,#class Controls \n#code part: 1\ndef __init__(...,1101,"[-0.015575501136481762, 0.005867051891982555, ...",This code block is part of a class called Con...,"[-0.014329985715448856, 0.007618751376867294, ..."
...,...,...,...,...,...,...,...,...,...,...,...
265,function,"def set_accel_limits(self, min_a, max_a):\n ...",303,307,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#class LongitudinalMpc \ndef set_accel_limits(...,226,"[0.0012401152634993196, -0.0075665744952857494...",\nThis code block defines a class Longitudinal...,"[-0.008481696248054504, 0.03178854286670685, 0..."
266,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#class LongitudinalMpc \n#code part: 1\ndef up...,1228,"[-0.02075737901031971, 0.017854150384664536, 0...",\nThis code block is part of a class called Lo...,"[-0.02256379835307598, 0.026338839903473854, -..."
267,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#class LongitudinalMpc \n#code part: 2\n ...,1096,"[-0.00651545962318778, -0.005185484886169434, ...",This code block is part of a class called Lon...,"[-0.012943500652909279, 0.005906902719289064, ..."
268,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#class LongitudinalMpc \n#code part: 3\n se...,1032,"[-0.015268523246049881, 0.005408196710050106, ...",This code block is part of the LongitudinalMp...,"[-0.0015480555593967438, 0.021089613437652588,..."


In [29]:
# df.to_csv("./embedded_summarized.csv", index=False)

In [65]:
def search_code(df, query, n=3):
    query_embedding = get_embedding(
        query,
        engine="text-embedding-ada-002"
    )
    df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        
    )
    return results.head(n)

In [50]:
def code_structure(df):
    kids_structure = defaultdict(list)
    parents_structure = {}
    for path in paths:
        t = path.split("/")
        for e in range(len(t)):
            if e < len(t)-1:
                kids_structure[t[e]].append(t[e+1])
            if e == 0:
                parents_structure[t[e]] = "./"
            else:
                parents_structure[t[e]] = "/".join(t[:e])
                
    for k, v in kids_structure.items():
        kids_structure[k] = list(set(v))
    return dict(kids_structure), parents_structure

In [51]:
code_structure(df)

({'openpilot': ['selfdrive'],
  'selfdrive': ['controls'],
  'controls': ['lib', 'radard.py', 'controlsd.py', 'plannerd.py'],
  'lib': ['latcontrol_torque.py',
   'desire_helper.py',
   'lateral_mpc_lib',
   'longitudinal_planner.py',
   'longitudinal_mpc_lib',
   'lateral_planner.py',
   'pid.py',
   'latcontrol.py',
   'vehicle_model.py',
   'radar_helpers.py',
   'latcontrol_pid.py',
   'latcontrol_angle.py',
   'drive_helpers.py',
   'events.py',
   'longcontrol.py',
   'latcontrol_indi.py',
   'alertmanager.py'],
  'lateral_mpc_lib': ['lat_mpc.py'],
  'longitudinal_mpc_lib': ['long_mpc.py']},
 {'openpilot': './',
  'selfdrive': 'openpilot',
  'controls': 'openpilot/selfdrive',
  'controlsd.py': 'openpilot/selfdrive/controls',
  'plannerd.py': 'openpilot/selfdrive/controls',
  'radard.py': 'openpilot/selfdrive/controls',
  'lib': 'openpilot/selfdrive/controls',
  'latcontrol_torque.py': 'openpilot/selfdrive/controls/lib',
  'longitudinal_planner.py': 'openpilot/selfdrive/controls/l

defaultdict(list,
            {'openpilot': ['selfdrive'],
             'selfdrive': ['controls'],
             'controls': ['lib', 'radard.py', 'controlsd.py', 'plannerd.py'],
             'lib': ['latcontrol_torque.py',
              'desire_helper.py',
              'lateral_mpc_lib',
              'longitudinal_planner.py',
              'longitudinal_mpc_lib',
              'lateral_planner.py',
              'pid.py',
              'latcontrol.py',
              'vehicle_model.py',
              'radar_helpers.py',
              'latcontrol_pid.py',
              'latcontrol_angle.py',
              'drive_helpers.py',
              'events.py',
              'longcontrol.py',
              'latcontrol_indi.py',
              'alertmanager.py'],
             'lateral_mpc_lib': ['lat_mpc.py'],
             'longitudinal_mpc_lib': ['long_mpc.py']})

In [7]:
import pandas as pd
tdf = pd.read_csv("../app/frontend/data/withsummary.csv")

Unnamed: 0.1,Unnamed: 0,code_type,source,start_line,end_line,file_path,pretext,blob,chars,embeddings,summary
0,0,import,import os\nimport math\nfrom typing import Sup...,2,31,openpilot/selfdrive/controls/controlsd.py,,file path: openpilot/selfdrive/controls/contro...,1671,"[-0.0020728062372654676, 0.008319668471813202,...",\n\nThe above code is from the controlsd.py fi...
1,1,assign,SOFT_DISABLE_TIME = 3\nLDW_MIN_SPEED = 31 * CV...,33,58,openpilot/selfdrive/controls/controlsd.py,,file path: openpilot/selfdrive/controls/contro...,1274,"[-0.0021910234354436398, 0.015653282403945923,...",\n\nThe above code defines a set of constants ...
2,2,function,"def main(sm=None, pm=None, logcan=None):\n co...",864,866,openpilot/selfdrive/controls/controlsd.py,,file path: openpilot/selfdrive/controls/contro...,161,"[-0.005893922410905361, 0.00789132621139288, 0...",\n\nThis code creates a Controls object with t...
3,3,misc,"if __name__ == ""__main__"":\n main()",869,870,openpilot/selfdrive/controls/controlsd.py,,file path: openpilot/selfdrive/controls/contro...,88,"[0.003028794191777706, 0.01985102705657482, 0....",\n\nThis code is the main entry point for the ...
4,4,function,"def __init__(self, sm=None, pm=None, can_sock=...",62,210,openpilot/selfdrive/controls/controlsd.py,#class Controls \n,#file path: openpilot/selfdrive/controls/contr...,1155,"[-0.008290477097034454, 0.006133547518402338, ...",\n\nThe above code is part of the __init__ met...
...,...,...,...,...,...,...,...,...,...,...,...
265,265,function,"def set_accel_limits(self, min_a, max_a):\n ...",303,307,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,file path: openpilot/selfdrive/controls/lib/lo...,303,"[0.011291579343378544, 0.0007856169831939042, ...",\n\nThis code sets the minimum and maximum acc...
266,266,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#file path: openpilot/selfdrive/controls/lib/l...,1306,"[-0.018248511478304863, 0.02136959880590439, 0...",\n\nThis code segment is part of the Longitudi...
267,267,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#file path: openpilot/selfdrive/controls/lib/l...,1174,"[-0.010142628103494644, 0.0029282064642757177,...",\n\nThe above code is part of the Longitudinal...
268,268,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#file path: openpilot/selfdrive/controls/lib/l...,1110,"[-0.01571042463183403, 0.00733485771343112, -0...","\nThe above code is part of the ""LongitudinalM..."


In [10]:


embedding_model = "text-embedding-ada-002"
embeddings = tdf.summary.apply([lambda x: get_embedding(x.strip(), engine=embedding_model)])
     


In [11]:
tdf

Unnamed: 0.1,Unnamed: 0,code_type,source,start_line,end_line,file_path,pretext,blob,chars,embeddings,summary
0,0,import,import os\nimport math\nfrom typing import Sup...,2,31,openpilot/selfdrive/controls/controlsd.py,,file path: openpilot/selfdrive/controls/contro...,1671,"[-0.0020728062372654676, 0.008319668471813202,...",\n\nThe above code is from the controlsd.py fi...
1,1,assign,SOFT_DISABLE_TIME = 3\nLDW_MIN_SPEED = 31 * CV...,33,58,openpilot/selfdrive/controls/controlsd.py,,file path: openpilot/selfdrive/controls/contro...,1274,"[-0.0021910234354436398, 0.015653282403945923,...",\n\nThe above code defines a set of constants ...
2,2,function,"def main(sm=None, pm=None, logcan=None):\n co...",864,866,openpilot/selfdrive/controls/controlsd.py,,file path: openpilot/selfdrive/controls/contro...,161,"[-0.005893922410905361, 0.00789132621139288, 0...",\n\nThis code creates a Controls object with t...
3,3,misc,"if __name__ == ""__main__"":\n main()",869,870,openpilot/selfdrive/controls/controlsd.py,,file path: openpilot/selfdrive/controls/contro...,88,"[0.003028794191777706, 0.01985102705657482, 0....",\n\nThis code is the main entry point for the ...
4,4,function,"def __init__(self, sm=None, pm=None, can_sock=...",62,210,openpilot/selfdrive/controls/controlsd.py,#class Controls \n,#file path: openpilot/selfdrive/controls/contr...,1155,"[-0.008290477097034454, 0.006133547518402338, ...",\n\nThe above code is part of the __init__ met...
...,...,...,...,...,...,...,...,...,...,...,...
265,265,function,"def set_accel_limits(self, min_a, max_a):\n ...",303,307,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,file path: openpilot/selfdrive/controls/lib/lo...,303,"[0.011291579343378544, 0.0007856169831939042, ...",\n\nThis code sets the minimum and maximum acc...
266,266,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#file path: openpilot/selfdrive/controls/lib/l...,1306,"[-0.018248511478304863, 0.02136959880590439, 0...",\n\nThis code segment is part of the Longitudi...
267,267,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#file path: openpilot/selfdrive/controls/lib/l...,1174,"[-0.010142628103494644, 0.0029282064642757177,...",\n\nThe above code is part of the Longitudinal...
268,268,function,"def update(self, radarstate, v_cruise, x, v, a...",309,386,openpilot/selfdrive/controls/lib/longitudinal_...,#class LongitudinalMpc \n,#file path: openpilot/selfdrive/controls/lib/l...,1110,"[-0.01571042463183403, 0.00733485771343112, -0...","\nThe above code is part of the ""LongitudinalM..."


In [12]:
tdf["summary_embeddings"] = embeddings


In [None]:
tdf.to_csv("../app/frontend/data/withsummary2.csv", index=False)