In [1]:
import pandas as pd
import torch
import tqdm
from tqdm.auto import tqdm
tqdm.pandas()
from collections import Counter
import numpy as np

In [2]:
from datasets import load_dataset
dataset = load_dataset("code_search_net", "python")
df_train = dataset['train'].to_pandas().reset_index(drop = True)
df_val = dataset['validation'].to_pandas().reset_index(drop = True)
df_test = dataset['test'].to_pandas().reset_index(drop = True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## Dataset Sizes

In [7]:
df_train['func_code_len'] = df_train['func_code_tokens'].progress_apply(lambda x : len(x))
df_val['func_code_len'] = df_val['func_code_tokens'].progress_apply(lambda x : len(x))
df_test['func_code_len'] = df_test['func_code_tokens'].progress_apply(lambda x : len(x))

df_train = df_train[df_train['func_code_len'] <= 500]
df_val = df_val[df_val['func_code_len'] <= 500]
df_test = df_test[df_test['func_code_len'] <= 500]

  0%|          | 0/412178 [00:00<?, ?it/s]

  0%|          | 0/23107 [00:00<?, ?it/s]

  0%|          | 0/22176 [00:00<?, ?it/s]

In [8]:
print(f"Train Size: {df_train.shape}")
print(f"Validation Size: {df_val.shape}")
print(f"Test Size: {df_test.shape}")

Train Size: (402906, 12)
Validation Size: (22478, 12)
Test Size: (21736, 12)


In [9]:
df_train.head()

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,func_code_len
0,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.addidsuffix,"def addidsuffix(self, idsuffix, recursive = Tr...",python,"def addidsuffix(self, idsuffix, recursive = Tr...","[def, addidsuffix, (, self, ,, idsuffix, ,, re...","Appends a suffix to this element's ID, and opt...","[Appends, a, suffix, to, this, element, s, ID,...",train,https://github.com/proycon/pynlpl/blob/7707f69...,44
1,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.setparents,"def setparents(self):\n """"""Correct all ...",python,"def setparents(self):\n """"""Correct all ...","[def, setparents, (, self, ), :, for, c, in, s...",Correct all parent relations for elements with...,"[Correct, all, parent, relations, for, element...",train,https://github.com/proycon/pynlpl/blob/7707f69...,29
2,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.setdoc,"def setdoc(self,newdoc):\n """"""Set a dif...",python,"def setdoc(self,newdoc):\n """"""Set a dif...","[def, setdoc, (, self, ,, newdoc, ), :, self, ...",Set a different document. Usually no need to c...,"[Set, a, different, document, ., Usually, no, ...",train,https://github.com/proycon/pynlpl/blob/7707f69...,53
3,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.hastext,"def hastext(self,cls='current',strict=True, co...",python,"def hastext(self,cls='current',strict=True, co...","[def, hastext, (, self, ,, cls, =, 'current', ...",Does this element have text (of the specified ...,"[Does, this, element, have, text, (, of, the, ...",train,https://github.com/proycon/pynlpl/blob/7707f69...,106
4,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.hasphon,"def hasphon(self,cls='current',strict=True,cor...",python,"def hasphon(self,cls='current',strict=True,cor...","[def, hasphon, (, self, ,, cls, =, 'current', ...",Does this element have phonetic content (of th...,"[Does, this, element, have, phonetic, content,...",train,https://github.com/proycon/pynlpl/blob/7707f69...,103


In [10]:
df_test.head()

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,func_code_len
0,soimort/you-get,src/you_get/extractors/youtube.py,YouTube.get_vid_from_url,"def get_vid_from_url(url):\n """"""Extract...",python,"def get_vid_from_url(url):\n """"""Extract...","[def, get_vid_from_url, (, url, ), :, return, ...",Extracts video ID from URL.,"[Extracts, video, ID, from, URL, .]",test,https://github.com/soimort/you-get/blob/b746ac...,53
1,soimort/you-get,src/you_get/extractors/miomio.py,sina_xml_to_url_list,"def sina_xml_to_url_list(xml_data):\n """"""st...",python,"def sina_xml_to_url_list(xml_data):\n """"""st...","[def, sina_xml_to_url_list, (, xml_data, ), :,...",str->list\n Convert XML to URL List.\n F...,"[str, -, >, list, Convert, XML, to, URL, List,...",test,https://github.com/soimort/you-get/blob/b746ac...,52
2,soimort/you-get,src/you_get/extractors/fc2video.py,makeMimi,"def makeMimi(upid):\n """"""From http://cdn37....",python,"def makeMimi(upid):\n """"""From http://cdn37....","[def, makeMimi, (, upid, ), :, strSeed, =, ""gG...",From http://cdn37.atwikiimg.com/sitescript/pub...,"[From, http, :, //, cdn37, ., atwikiimg, ., co...",test,https://github.com/soimort/you-get/blob/b746ac...,30
3,soimort/you-get,src/you_get/extractors/fc2video.py,fc2video_download,"def fc2video_download(url, output_dir = '.', m...",python,"def fc2video_download(url, output_dir = '.', m...","[def, fc2video_download, (, url, ,, output_dir...",wrapper,[wrapper],test,https://github.com/soimort/you-get/blob/b746ac...,66
4,soimort/you-get,src/you_get/extractors/dailymotion.py,dailymotion_download,"def dailymotion_download(url, output_dir='.', ...",python,"def dailymotion_download(url, output_dir='.', ...","[def, dailymotion_download, (, url, ,, output_...",Downloads Dailymotion videos by URL.,"[Downloads, Dailymotion, videos, by, URL, .]",test,https://github.com/soimort/you-get/blob/b746ac...,150


In [11]:
df_val.head()

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,func_code_len
1,openai/baselines,baselines/deepq/deepq.py,ActWrapper.save_act,"def save_act(self, path=None):\n """"""Sav...",python,"def save_act(self, path=None):\n """"""Sav...","[def, save_act, (, self, ,, path, =, None, ), ...",Save model to a pickle located at `path`,"[Save, model, to, a, pickle, located, at, path]",valid,https://github.com/openai/baselines/blob/33010...,171
2,openai/baselines,baselines/common/models.py,nature_cnn,"def nature_cnn(unscaled_images, **conv_kwargs)...",python,"def nature_cnn(unscaled_images, **conv_kwargs)...","[def, nature_cnn, (, unscaled_images, ,, *, *,...",CNN from Nature paper.,"[CNN, from, Nature, paper, .]",valid,https://github.com/openai/baselines/blob/33010...,168
3,openai/baselines,baselines/common/models.py,mlp,"def mlp(num_layers=2, num_hidden=64, activatio...",python,"def mlp(num_layers=2, num_hidden=64, activatio...","[def, mlp, (, num_layers, =, 2, ,, num_hidden,...",Stack of fully-connected layers to be used in ...,"[Stack, of, fully, -, connected, layers, to, b...",valid,https://github.com/openai/baselines/blob/33010...,105
4,openai/baselines,baselines/common/models.py,lstm,"def lstm(nlstm=128, layer_norm=False):\n """"...",python,"def lstm(nlstm=128, layer_norm=False):\n """"...","[def, lstm, (, nlstm, =, 128, ,, layer_norm, =...",Builds LSTM (Long-Short Term Memory) network t...,"[Builds, LSTM, (, Long, -, Short, Term, Memory...",valid,https://github.com/openai/baselines/blob/33010...,194
5,openai/baselines,baselines/common/models.py,conv_only,"def conv_only(convs=[(32, 8, 4), (64, 4, 2), (...",python,"def conv_only(convs=[(32, 8, 4), (64, 4, 2), (...","[def, conv_only, (, convs, =, [, (, 32, ,, 8, ...",convolutions-only net\n\n Parameters:\n ...,"[convolutions, -, only, net]",valid,https://github.com/openai/baselines/blob/33010...,109


## Getting AST of functions

In [12]:
df_train = df_train.head(100000)
df_val = df_val.head(5000)
df_test = df_test.head(5000)

In [13]:
import ast


## Filter out AST Errors (caused by comments in foreign languages)

In [16]:
def get_ast(code_str):
    try:
        return ast.dump(ast.parse(code_str), indent=2)
    except SyntaxError as e:
        return ''
    
df_train['ast'] = df_train['func_code_string'].progress_apply(get_ast)
df_val['ast'] = df_val['func_code_string'].progress_apply(get_ast)
df_test['ast'] = df_test['func_code_string'].progress_apply(get_ast)

df_train = df_train[df_train['ast'] != '']#.iloc[0]['whole_func_string']
df_val = df_val[df_val['ast'] != '']
df_test = df_test[df_test['ast'] != '']

  0%|          | 0/100000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [17]:
def get_tokenized_ast(code_str):
    def tokenize_ast(node):
        tokens = []
        for child in ast.iter_child_nodes(node):
            # Tokenize the node type
            tokens.append(type(child).__name__)
            # Recursively tokenize children
            tokens.extend(tokenize_ast(child))
        return tokens

    # Tokenize the AST starting from the root
    root = ast.parse(code_str)
    return tokenize_ast(root)

df_train['ast_tokens'] = df_train['func_code_string'].progress_apply(get_tokenized_ast)
df_val['ast_tokens'] = df_val['func_code_string'].progress_apply(get_tokenized_ast)
df_test['ast_tokens'] = df_test['func_code_string'].progress_apply(get_tokenized_ast)

  0%|          | 0/98803 [00:00<?, ?it/s]

  0%|          | 0/4880 [00:00<?, ?it/s]

  0%|          | 0/4983 [00:00<?, ?it/s]

In [18]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(98803, 14)
(4880, 14)
(4983, 14)


## Adding Doc Strings

#### Enrich with helpful information for docstring

In [32]:
def add_params(df):
    def calc_num_params(tokens) :
        try:
            close_idx = np.where(tokens ==')')[0][0]
            open_idx = np.where(tokens=='(')[0][0]
            segment = tokens[open_idx + 1:close_idx]  # Slice the array to get the segment between '(' and ')'
            commas = np.count_nonzero(segment == ',')
            return commas+1
        except:
            # print(tokens)
            return -1

    df['fn_name'] = df['func_name'].apply(lambda x: x.split('.')[-1])
    df['class_fn'] = df['func_name'].apply(lambda x: '.' in x)
    df['num_params'] = df['func_code_tokens'].apply(calc_num_params)
    return df

df_train = add_params(df_train)
df_val = add_params(df_val)
df_test = add_params(df_test)

In [33]:
def calc_doc_string(df):
    dfs = []

    def check_if_function_used(tokens, fn_name):
        m1 = tokens[:-1] == fn_name
        m2 = tokens[1:] == '('
        combined_mask = m1 & m2

        indices = np.where(combined_mask)[0]
        if indices.size == 0:
            return -1
        return indices[0]

    all_repos = set(df['repository_name'])
    # print(df)
    # print(all_repos)

    for repo_name in tqdm(all_repos):
        filtered = df[df['repository_name'] == repo_name]
        filtered['used_function_docstring'] = filtered.apply(lambda row: {}, axis=1)

        # Get documentation string
        all_fns = set(filtered['fn_name'])
        for fn in all_fns:
            for i, row in filtered.iterrows():
                if row['fn_name'] == fn:
                    continue
                # print(fn)
                # print(row)
                try:
                    fn_body = row['func_code_tokens'][np.where(row['func_code_tokens'] == ':')[0][0]+1:]
                except:
                    continue
                
                index = check_if_function_used(fn_body, fn)

                if index != -1: 
                    # Check num params
                    if fn_body[index-1] == '.': 
                        # function called a class method
                        doc_string = filtered[(filtered['fn_name'] == fn) & (filtered['class_fn'])].reset_index(drop=True)
                        try:
                            filtered.loc[i, 'used_function_docstring'][fn] = doc_string.iloc[0]['func_documentation_tokens']
                        except:
                            pass
                    else :
                        # function called a non-class method
                        doc_string = filtered[(filtered['fn_name'] == fn) & (filtered['class_fn'] == False)].reset_index(drop=True)
                        try:
                            filtered.loc[i, 'used_function_docstring'][fn] = doc_string.iloc[0]['func_documentation_tokens']
                        except:
                            pass
        dfs.append(filtered)
    return pd.concat(dfs).reset_index(drop=True)

df_train = calc_doc_string(df_train)
df_val = calc_doc_string(df_val)
df_test = calc_doc_string(df_test)


  0%|          | 0/4054 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['used_function_docstring'] = filtered.apply(lambda row: {}, axis=1)


  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

In [55]:
df_train.head(10)

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,ast,fn_name,class_fn,num_params,used_function_docstring,ast_tokens
0,DirectlineDev/python-myeventhub,myeventhub/client.py,Client._call,"def _call(self, method, **kwargs):\n """"...",python,"def _call(self, method, **kwargs):\n """"...","[def, _call, (, self, ,, method, ,, *, *, kwar...",Вызов метода API,"[Вызов, метода, API]",train,https://github.com/DirectlineDev/python-myeven...,Module(\n body=[\n FunctionDef(\n nam...,_call,True,3,{},"[FunctionDef, arguments, arg, arg, arg, Expr, ..."
1,DirectlineDev/python-myeventhub,myeventhub/client.py,Client.get_action_list,"def get_action_list(self, page=1):\n """"...",python,"def get_action_list(self, page=1):\n """"...","[def, get_action_list, (, self, ,, page, =, 1,...",Получение списка событий\n :param page:...,"[Получение, списка, событий, :, param, page, :...",train,https://github.com/DirectlineDev/python-myeven...,Module(\n body=[\n FunctionDef(\n nam...,get_action_list,True,2,"{'_call': ['Вызов', 'метода', 'API']}","[FunctionDef, arguments, arg, arg, Constant, E..."
2,DirectlineDev/python-myeventhub,myeventhub/client.py,Client.get_action_by_id,"def get_action_by_id(self, action_id):\n ...",python,"def get_action_by_id(self, action_id):\n ...","[def, get_action_by_id, (, self, ,, action_id,...",Получение детального описания события,"[Получение, детального, описания, события]",train,https://github.com/DirectlineDev/python-myeven...,Module(\n body=[\n FunctionDef(\n nam...,get_action_by_id,True,2,"{'_call': ['Вызов', 'метода', 'API']}","[FunctionDef, arguments, arg, arg, Expr, Const..."
3,DirectlineDev/python-myeventhub,myeventhub/client.py,Client.all_actions,"def all_actions(self):\n """""" Генератор,...",python,"def all_actions(self):\n """""" Генератор,...","[def, all_actions, (, self, ), :, count, =, se...","Генератор, возвращающий все события","[Генератор, возвращающий, все, события]",train,https://github.com/DirectlineDev/python-myeven...,Module(\n body=[\n FunctionDef(\n nam...,all_actions,True,1,"{'get_action_list': ['Получение', 'списка', 'с...","[FunctionDef, arguments, arg, Expr, Constant, ..."
4,tommyod/streprogen,streprogen/program.py,Program._set_scalers,"def _set_scalers(self):\n """"""\n ...",python,"def _set_scalers(self):\n """"""\n ...","[def, _set_scalers, (, self, ), :, # Set defau...",Set the variables self._scalers as given by se...,"[Set, the, variables, self, ., _scalers, as, g...",train,https://github.com/tommyod/streprogen/blob/21b...,Module(\n body=[\n FunctionDef(\n nam...,_set_scalers,True,1,{},"[FunctionDef, arguments, arg, Expr, Constant, ..."
5,tommyod/streprogen,streprogen/program.py,Program._validate,"def _validate(self):\n """"""\n The...",python,"def _validate(self):\n """"""\n The...","[def, _validate, (, self, ), :, # Validate the...",The purpose of this method is to verify that t...,"[The, purpose, of, this, method, is, to, verif...",train,https://github.com/tommyod/streprogen/blob/21b...,Module(\n body=[\n FunctionDef(\n nam...,_validate,True,1,"{'weekly_growth': ['Calculate', 'the', 'weekly...","[FunctionDef, arguments, arg, Expr, Constant, ..."
6,tommyod/streprogen,streprogen/program.py,Program.add_days,"def add_days(self, *days):\n """"""Add one...",python,"def add_days(self, *days):\n """"""Add one...","[def, add_days, (, self, ,, *, days, ), :, for...",Add one or several days to the program.\n \...,"[Add, one, or, several, days, to, the, program...",train,https://github.com/tommyod/streprogen/blob/21b...,Module(\n body=[\n FunctionDef(\n nam...,add_days,True,2,{},"[FunctionDef, arguments, arg, arg, Expr, Const..."
7,tommyod/streprogen,streprogen/program.py,Program.repstring_penalty,"def repstring_penalty(reps, intensities, desir...",python,"def repstring_penalty(reps, intensities, desir...","[def, repstring_penalty, (, reps, ,, intensiti...","Penalty function which calculates how ""bad"" a ...","[Penalty, function, which, calculates, how, ba...",train,https://github.com/tommyod/streprogen/blob/21b...,Module(\n body=[\n FunctionDef(\n nam...,repstring_penalty,True,5,"{'spread': ['Returns', 'the', 'maximal', 'spre...","[FunctionDef, arguments, arg, arg, arg, arg, a..."
8,tommyod/streprogen,streprogen/program.py,Program._render_dynamic,"def _render_dynamic(self, dynamic_exercise, mi...",python,"def _render_dynamic(self, dynamic_exercise, mi...","[def, _render_dynamic, (, self, ,, dynamic_exe...",Render a single dynamic exercise.\n Thi...,"[Render, a, single, dynamic, exercise, ., This...",train,https://github.com/tommyod/streprogen/blob/21b...,Module(\n body=[\n FunctionDef(\n nam...,_render_dynamic,True,6,"{'repstring_penalty': ['Penalty', 'function', ...","[FunctionDef, arguments, arg, arg, arg, arg, a..."
9,tommyod/streprogen,streprogen/program.py,Program._initialize_render_dictionary,def _initialize_render_dictionary(self):\n ...,python,def _initialize_render_dictionary(self):\n ...,"[def, _initialize_render_dictionary, (, self, ...",Initialize a dictionary for rendered values.\n...,"[Initialize, a, dictionary, for, rendered, val...",train,https://github.com/tommyod/streprogen/blob/21b...,Module(\n body=[\n FunctionDef(\n nam...,_initialize_render_dictionary,True,1,{},"[FunctionDef, arguments, arg, Expr, Constant, ..."


In [56]:
df_val.head(10)

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,ast,fn_name,class_fn,num_params,used_function_docstring,ast_tokens
0,aleju/imgaug,imgaug/augmentables/kps.py,compute_geometric_median,"def compute_geometric_median(X, eps=1e-5):\n ...",python,"def compute_geometric_median(X, eps=1e-5):\n ...","[def, compute_geometric_median, (, X, ,, eps, ...",Estimate the geometric median of points in 2D....,"[Estimate, the, geometric, median, of, points,...",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,compute_geometric_median,False,2,{},"[FunctionDef, arguments, arg, arg, Constant, E..."
1,aleju/imgaug,imgaug/augmentables/kps.py,Keypoint.project,"def project(self, from_shape, to_shape):\n ...",python,"def project(self, from_shape, to_shape):\n ...","[def, project, (, self, ,, from_shape, ,, to_s...",Project the keypoint onto a new position on a ...,"[Project, the, keypoint, onto, a, new, positio...",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,project,True,3,"{'project_coords': ['Project', 'coordinates', ...","[FunctionDef, arguments, arg, arg, arg, Expr, ..."
2,aleju/imgaug,imgaug/augmentables/kps.py,Keypoint.shift,"def shift(self, x=0, y=0):\n """"""\n ...",python,"def shift(self, x=0, y=0):\n """"""\n ...","[def, shift, (, self, ,, x, =, 0, ,, y, =, 0, ...",Move the keypoint around on an image.\n\n ...,"[Move, the, keypoint, around, on, an, image, .]",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,shift,True,3,"{'deepcopy': ['Create', 'a', 'deep', 'copy', '...","[FunctionDef, arguments, arg, arg, arg, Consta..."
3,aleju/imgaug,imgaug/augmentables/kps.py,Keypoint.draw_on_image,"def draw_on_image(self, image, color=(0, 255, ...",python,"def draw_on_image(self, image, color=(0, 255, ...","[def, draw_on_image, (, self, ,, image, ,, col...",Draw the keypoint onto a given image.\n\n ...,"[Draw, the, keypoint, onto, a, given, image, .]",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,draw_on_image,True,5,"{'copy': ['Create', 'a', 'shallow', 'copy', 'o...","[FunctionDef, arguments, arg, arg, arg, arg, a..."
4,aleju/imgaug,imgaug/augmentables/kps.py,Keypoint.generate_similar_points_manhattan,"def generate_similar_points_manhattan(self, nb...",python,"def generate_similar_points_manhattan(self, nb...","[def, generate_similar_points_manhattan, (, se...",Generate nearby points to this keypoint based ...,"[Generate, nearby, points, to, this, keypoint,...",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,generate_similar_points_manhattan,True,4,"{'deepcopy': ['Create', 'a', 'deep', 'copy', '...","[FunctionDef, arguments, arg, arg, arg, arg, C..."
5,aleju/imgaug,imgaug/augmentables/kps.py,Keypoint.copy,"def copy(self, x=None, y=None):\n """"""\n...",python,"def copy(self, x=None, y=None):\n """"""\n...","[def, copy, (, self, ,, x, =, None, ,, y, =, N...",Create a shallow copy of the Keypoint object.\...,"[Create, a, shallow, copy, of, the, Keypoint, ...",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,copy,True,3,"{'deepcopy': ['Create', 'a', 'deep', 'copy', '...","[FunctionDef, arguments, arg, arg, arg, Consta..."
6,aleju/imgaug,imgaug/augmentables/kps.py,Keypoint.deepcopy,"def deepcopy(self, x=None, y=None):\n ""...",python,"def deepcopy(self, x=None, y=None):\n ""...","[def, deepcopy, (, self, ,, x, =, None, ,, y, ...",Create a deep copy of the Keypoint object.\n\n...,"[Create, a, deep, copy, of, the, Keypoint, obj...",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,deepcopy,True,3,{},"[FunctionDef, arguments, arg, arg, arg, Consta..."
7,aleju/imgaug,imgaug/augmentables/kps.py,KeypointsOnImage.on,"def on(self, image):\n """"""\n Pro...",python,"def on(self, image):\n """"""\n Pro...","[def, on, (, self, ,, image, ), :, shape, =, n...",Project keypoints from one image to a new one....,"[Project, keypoints, from, one, image, to, a, ...",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,on,True,2,"{'normalize_shape': ['Normalize', 'a', 'shape'...","[FunctionDef, arguments, arg, arg, Expr, Const..."
8,aleju/imgaug,imgaug/augmentables/kps.py,KeypointsOnImage.draw_on_image,"def draw_on_image(self, image, color=(0, 255, ...",python,"def draw_on_image(self, image, color=(0, 255, ...","[def, draw_on_image, (, self, ,, image, ,, col...",Draw all keypoints onto a given image.\n\n ...,"[Draw, all, keypoints, onto, a, given, image, .]",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,draw_on_image,True,5,"{'copy': ['Create', 'a', 'shallow', 'copy', 'o...","[FunctionDef, arguments, arg, arg, arg, arg, a..."
9,aleju/imgaug,imgaug/augmentables/kps.py,KeypointsOnImage.shift,"def shift(self, x=0, y=0):\n """"""\n ...",python,"def shift(self, x=0, y=0):\n """"""\n ...","[def, shift, (, self, ,, x, =, 0, ,, y, =, 0, ...",Move the keypoints around on an image.\n\n ...,"[Move, the, keypoints, around, on, an, image, .]",valid,https://github.com/aleju/imgaug/blob/786be74aa...,Module(\n body=[\n FunctionDef(\n nam...,shift,True,3,"{'deepcopy': ['Create', 'a', 'deep', 'copy', '...","[FunctionDef, arguments, arg, arg, arg, Consta..."


In [57]:
df_test.head(10)

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,ast,fn_name,class_fn,num_params,used_function_docstring,ast_tokens
0,python/performance,performance/compare.py,tdist95conf_level,"def tdist95conf_level(df):\n """"""Approximate...",python,"def tdist95conf_level(df):\n """"""Approximate...","[def, tdist95conf_level, (, df, ), :, df, =, i...",Approximate the 95% confidence interval for St...,"[Approximate, the, 95%, confidence, interval, ...",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,tdist95conf_level,False,1,{},"[FunctionDef, arguments, arg, Expr, Constant, ..."
1,python/performance,performance/compare.py,pooled_sample_variance,"def pooled_sample_variance(sample1, sample2):\...",python,"def pooled_sample_variance(sample1, sample2):\...","[def, pooled_sample_variance, (, sample1, ,, s...",Find the pooled sample variance for two sample...,"[Find, the, pooled, sample, variance, for, two...",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,pooled_sample_variance,False,2,{},"[FunctionDef, arguments, arg, arg, Expr, Const..."
2,python/performance,performance/compare.py,tscore,"def tscore(sample1, sample2):\n """"""Calculat...",python,"def tscore(sample1, sample2):\n """"""Calculat...","[def, tscore, (, sample1, ,, sample2, ), :, if...",Calculate a t-test score for the difference be...,"[Calculate, a, t, -, test, score, for, the, di...",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,tscore,False,2,"{'pooled_sample_variance': ['Find', 'the', 'po...","[FunctionDef, arguments, arg, arg, Expr, Const..."
3,python/performance,performance/compare.py,is_significant,"def is_significant(sample1, sample2):\n """"""...",python,"def is_significant(sample1, sample2):\n """"""...","[def, is_significant, (, sample1, ,, sample2, ...",Determine whether two samples differ significa...,"[Determine, whether, two, samples, differ, sig...",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,is_significant,False,2,"{'tdist95conf_level': ['Approximate', 'the', '...","[FunctionDef, arguments, arg, arg, Expr, Const..."
4,python/performance,performance/benchmarks/bm_mdp.py,topoSort,"def topoSort(roots, getParents):\n """"""Retur...",python,"def topoSort(roots, getParents):\n """"""Retur...","[def, topoSort, (, roots, ,, getParents, ), :,...",Return a topological sorting of nodes in a gra...,"[Return, a, topological, sorting, of, nodes, i...",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,topoSort,False,2,{},"[FunctionDef, arguments, arg, arg, Expr, Const..."
5,python/performance,performance/benchmarks/bm_nqueens.py,permutations,"def permutations(iterable, r=None):\n """"""pe...",python,"def permutations(iterable, r=None):\n """"""pe...","[def, permutations, (, iterable, ,, r, =, None...","permutations(range(3), 2) --> (0,1) (0,2) (1,0...","[permutations, (, range, (, 3, ), 2, ), --, >,...",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,permutations,False,2,{},"[FunctionDef, arguments, arg, arg, Constant, E..."
6,python/performance,performance/benchmarks/bm_nqueens.py,n_queens,"def n_queens(queen_count):\n """"""N-Queens so...",python,"def n_queens(queen_count):\n """"""N-Queens so...","[def, n_queens, (, queen_count, ), :, cols, =,...",N-Queens solver.\n\n Args:\n queen_c...,"[N, -, Queens, solver, .]",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,n_queens,False,1,"{'permutations': ['permutations', '(', 'range'...","[FunctionDef, arguments, arg, Expr, Constant, ..."
7,python/performance,performance/benchmarks/bm_go.py,UCTNode.play,"def play(self, board):\n """""" uct tree s...",python,"def play(self, board):\n """""" uct tree s...","[def, play, (, self, ,, board, ), :, color, =,...",uct tree search,"[uct, tree, search]",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,play,True,2,"{'random_playout': ['random', 'play', 'until',...","[FunctionDef, arguments, arg, arg, Expr, Const..."
8,python/performance,performance/benchmarks/bm_go.py,UCTNode.select,"def select(self, board):\n """""" select m...",python,"def select(self, board):\n """""" select m...","[def, select, (, self, ,, board, ), :, if, sel...","select move; unexplored children first, then a...","[select, move, ;, unexplored, children, first,...",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,select,True,2,{},"[FunctionDef, arguments, arg, arg, Expr, Const..."
9,python/performance,performance/benchmarks/bm_go.py,UCTNode.random_playout,"def random_playout(self, board):\n """""" ...",python,"def random_playout(self, board):\n """""" ...","[def, random_playout, (, self, ,, board, ), :,...",random play until both players pass,"[random, play, until, both, players, pass]",test,https://github.com/python/performance/blob/2a9...,Module(\n body=[\n FunctionDef(\n nam...,random_playout,True,2,{},"[FunctionDef, arguments, arg, arg, Expr, Const..."


## Pickle Dataframes for later use

In [19]:
df_train.to_pickle("df_train.pkl")
df_val.to_pickle("df_val.pkl")
df_test.to_pickle("df_test.pkl")

## Load Dataframes 

In [8]:
df_train = pd.read_pickle("df_train.pkl")
df_val = pd.read_pickle("df_val.pkl")
df_test = pd.read_pickle("df_test.pkl")

In [20]:
df_train

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,func_code_len,ast,ast_tokens
0,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.addidsuffix,"def addidsuffix(self, idsuffix, recursive = Tr...",python,"def addidsuffix(self, idsuffix, recursive = Tr...","[def, addidsuffix, (, self, ,, idsuffix, ,, re...","Appends a suffix to this element's ID, and opt...","[Appends, a, suffix, to, this, element, s, ID,...",train,https://github.com/proycon/pynlpl/blob/7707f69...,44,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, arg, arg, Consta..."
1,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.setparents,"def setparents(self):\n """"""Correct all ...",python,"def setparents(self):\n """"""Correct all ...","[def, setparents, (, self, ), :, for, c, in, s...",Correct all parent relations for elements with...,"[Correct, all, parent, relations, for, element...",train,https://github.com/proycon/pynlpl/blob/7707f69...,29,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, Expr, Constant, ..."
2,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.setdoc,"def setdoc(self,newdoc):\n """"""Set a dif...",python,"def setdoc(self,newdoc):\n """"""Set a dif...","[def, setdoc, (, self, ,, newdoc, ), :, self, ...",Set a different document. Usually no need to c...,"[Set, a, different, document, ., Usually, no, ...",train,https://github.com/proycon/pynlpl/blob/7707f69...,53,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, arg, Expr, Const..."
3,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.hastext,"def hastext(self,cls='current',strict=True, co...",python,"def hastext(self,cls='current',strict=True, co...","[def, hastext, (, self, ,, cls, =, 'current', ...",Does this element have text (of the specified ...,"[Does, this, element, have, text, (, of, the, ...",train,https://github.com/proycon/pynlpl/blob/7707f69...,106,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, arg, arg, arg, C..."
4,proycon/pynlpl,pynlpl/formats/folia.py,AbstractElement.hasphon,"def hasphon(self,cls='current',strict=True,cor...",python,"def hasphon(self,cls='current',strict=True,cor...","[def, hasphon, (, self, ,, cls, =, 'current', ...",Does this element have phonetic content (of th...,"[Does, this, element, have, phonetic, content,...",train,https://github.com/proycon/pynlpl/blob/7707f69...,103,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, arg, arg, arg, C..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102284,pyokagan/pyglreg,glreg.py,Registry.text,"def text(self):\n """"""Formatted API decl...",python,"def text(self):\n """"""Formatted API decl...","[def, text, (, self, ), :, out, =, [, ], out, ...",Formatted API declarations.\n\n Equival...,"[Formatted, API, declarations, .]",train,https://github.com/pyokagan/pyglreg/blob/68fa5...,76,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, Expr, Constant, ..."
102285,pyokagan/pyglreg,glreg.py,Registry.get_type,"def get_type(self, name, api=None):\n ""...",python,"def get_type(self, name, api=None):\n ""...","[def, get_type, (, self, ,, name, ,, api, =, N...","Returns Type `name`, with preference for the T...","[Returns, Type, name, with, preference, for, t...",train,https://github.com/pyokagan/pyglreg/blob/68fa5...,46,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, arg, arg, Consta..."
102286,pyokagan/pyglreg,glreg.py,Registry.get_features,"def get_features(self, api=None):\n """"""...",python,"def get_features(self, api=None):\n """"""...","[def, get_features, (, self, ,, api, =, None, ...",Returns filtered list of features in this regi...,"[Returns, filtered, list, of, features, in, th...",train,https://github.com/pyokagan/pyglreg/blob/68fa5...,35,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, arg, Constant, E..."
102287,pyokagan/pyglreg,glreg.py,Registry.get_extensions,"def get_extensions(self, support=None):\n ...",python,"def get_extensions(self, support=None):\n ...","[def, get_extensions, (, self, ,, support, =, ...",Returns filtered list of extensions in this re...,"[Returns, filtered, list, of, extensions, in, ...",train,https://github.com/pyokagan/pyglreg/blob/68fa5...,35,Module(\n body=[\n FunctionDef(\n nam...,"[FunctionDef, arguments, arg, arg, Constant, E..."


## Convert tokens to sequence of vectors

Metrics

In [23]:
mean_func_len = df_train['func_code_tokens'].apply(lambda x : len(x)).mean()
mean_ast_len = df_train['ast_tokens'].apply(lambda x : len(x)).mean()

print(mean_func_len)
print(mean_ast_len)

96.57097456554963
101.22493244132264


Padding + Truncate + sos + eos

In [24]:
def preprocess(toks, max_len):
    toks =np.insert(toks, 0, "<sos>")
    
    if len(toks) >= max_len : 
        toks = toks[:max_len-1]
        toks = np.append(toks,"<eos>")
    else:
        toks = np.append(toks,"<eos>")
        while len(toks) < max_len:
            toks = np.append(toks,'<pad>')
    return toks

df_train['processed_func_code_tokens'] = df_train['func_code_tokens'].progress_apply(lambda x: preprocess(x, 200))
df_val['processed_func_code_tokens'] = df_val['func_code_tokens'].progress_apply(lambda x: preprocess(x, 200))
df_test['processed_func_code_tokens'] = df_test['func_code_tokens'].progress_apply(lambda x: preprocess(x, 200))

df_train['processed_ast_code_tokens'] = df_train['ast_tokens'].progress_apply(lambda x: preprocess(x, 200))
df_val['processed_ast_code_tokens'] = df_val['ast_tokens'].progress_apply(lambda x: preprocess(x, 200))
df_test['processed_ast_code_tokens'] = df_test['ast_tokens'].progress_apply(lambda x: preprocess(x, 200))

  0%|          | 0/98803 [00:00<?, ?it/s]

  0%|          | 0/4880 [00:00<?, ?it/s]

  0%|          | 0/4983 [00:00<?, ?it/s]

  0%|          | 0/98803 [00:00<?, ?it/s]

  0%|          | 0/4880 [00:00<?, ?it/s]

  0%|          | 0/4983 [00:00<?, ?it/s]

Training AST vectorization

In [25]:
# from gensim.models import Word2Vec

In [38]:
# training_asts = list(df_train['processed_ast_code_tokens'].progress_apply(lambda x: list(x)))
# ast_model = Word2Vec(sentences=training_asts, vector_size=64, window=20, min_count=1, workers=4)
# ast_model.save("ast_word2vec.model")

  0%|          | 0/98803 [00:00<?, ?it/s]

In [39]:
# training_func_codes = list(df_train['processed_func_code_tokens'].progress_apply(lambda x: list(x)))
# func_model = Word2Vec(sentences=training_func_codes, vector_size=64, window=20, min_count=1, workers=4)
# func_model.save("func_code_word2vec.model")

  0%|          | 0/98803 [00:00<?, ?it/s]

## Creating 1-hot Vectors