In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
# Step up one directory so the relative "dataset/..." and "parser/..." paths used
# in later cells resolve (presumably the notebook sits one level below the project
# root - confirm).
# NOTE(review): not idempotent - re-running this cell moves up one more level.
os.chdir("../")

In [3]:
# Load the two pickled frames ("gen" and "human"; presumably generated vs.
# human-written code - confirm against the dataset docs).
# NOTE(review): the variables are named train_* but the files are the
# *_test_prompt_split pickles - confirm which split is intended.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
train_gen_data = pd.read_pickle("dataset/all_gen_test_prompt_split.pkl")
train_human_data = pd.read_pickle("dataset/all_human_test_prompt_split.pkl")
# Row-wise concatenation of both frames; not referenced again in the visible cells.
all_data = pd.concat([train_gen_data,train_human_data])

In [4]:
from utils import ASTGraph,ASTNode
from tree_sitter import Language, Parser
# Load the Python grammar from a compiled shared library and bind it to the
# module-level `parser` that ast_parse() uses.
# NOTE(review): Language(path, name) and Parser.set_language look like the older
# py-tree-sitter API - confirm the pinned tree_sitter version still supports them.
PY_LANGUAGE = Language('parser/build/python.so', 'python')
parser = Parser()
parser.set_language(PY_LANGUAGE)

In [24]:
from utils import PygmentsTokenizer
# Module-level tokenizer instance; get_identifier_map() calls its tokenize() method.
pygments_tokenizer = PygmentsTokenizer('Python')

def ast_parse(func_str):
    """Parse source text into an ``ASTGraph``.

    The string is UTF-8 encoded, handed to the module-level tree-sitter
    ``parser``, and the resulting tree is converted with
    ``ASTGraph.from_tree_sitter``.

    Args:
        func_str: Source code as a ``str``.

    Returns:
        Whatever ``ASTGraph.from_tree_sitter`` returns for the parsed tree.
    """
    source_bytes = bytes(func_str, "utf-8")
    return ASTGraph.from_tree_sitter(parser.parse(source_bytes))

# AST parent node types that mark a name's first occurrence as a variable binding.
_VAR_PARENT_TYPES = ('assignment', 'for_statement', 'parameters', 'typed_parameter',
                     "pattern_list", "for_in_clause")
# AST parent node types that mark a name's first occurrence as a function definition.
_FUNC_PARENT_TYPES = ('function_definition',)


def get_identifier_map(func_str):
    """Map user-defined function and variable names to their token positions.

    The source is parsed with ``ast_parse`` and tokenized with the module-level
    ``pygments_tokenizer``. A token counts as an identifier when the lower-cased
    string form of its token type contains ``"token.name"`` but not
    ``"builtin"``.

    Each distinct name is classified once, from its FIRST occurrence only: the
    smallest AST node enclosing that token's source span is looked up, and the
    node type of its parent decides the bucket (``_VAR_PARENT_TYPES`` ->
    variable, ``_FUNC_PARENT_TYPES`` -> function). Names whose first occurrence
    matches neither (e.g. a name first seen under a ``call`` node) are omitted.
    Source that only uses builtin names therefore yields two empty maps.

    An earlier, abandoned variant classified names by the Pygments token type
    (``Token.Name.Function``) instead of the AST parent; it has been removed.

    Args:
        func_str: Source code as a ``str``.

    Returns:
        ``{'func': {name: [orig_pos, ...]}, 'var': {name: [orig_pos, ...]}}``
        where each list holds ``token.orig_pos`` for every occurrence of the
        name, in token order (presumably character offsets into ``func_str`` -
        confirm against ``PygmentsTokenizer``).
    """
    ast_tree = ast_parse(func_str)
    token_list = pygments_tokenizer.tokenize(func_str)

    # name -> indices (into token_list) of every non-builtin name token.
    occurrences = {}
    for token_id, token in enumerate(token_list):
        type_name = str(token.token_type).lower()
        if "token.name" in type_name and "builtin" not in type_name:
            occurrences.setdefault(token.string, []).append(token_id)

    func_identifier_map = {}
    var_identifier_map = {}
    for ident_name, token_ids in occurrences.items():
        # Only the first occurrence decides the kind, so only that one needs
        # an AST lookup.
        first_token = token_list[token_ids[0]]
        node_id = ast_tree.find_smallest_encompassing_interval(first_token.source_span)
        parent_node = ast_tree.nodes[ast_tree.nodes[node_id].parent]

        if parent_node.node_type in _VAR_PARENT_TYPES:
            bucket = var_identifier_map
        elif parent_node.node_type in _FUNC_PARENT_TYPES:
            bucket = func_identifier_map
        else:
            continue
        bucket[ident_name] = [token_list[idx].orig_pos for idx in token_ids]

    return {'func': func_identifier_map, 'var': var_identifier_map}


In [6]:
# Build an identifier map for every row's `full_code` string in both frames.
gen_identifier_map = train_gen_data['full_code'].apply(get_identifier_map)
human_identifier_map = train_human_data['full_code'].apply(get_identifier_map)

In [7]:
# Attach the maps to the loaded frames as a new `identifier_map` column (mutates in place).
train_gen_data['identifier_map'] = gen_identifier_map
train_human_data['identifier_map'] = human_identifier_map

In [8]:
# NOTE(review): writes to the same path the frame was loaded from, overwriting
# the input pickle with the version that has the extra `identifier_map` column.
train_gen_data.to_pickle("dataset/all_gen_test_prompt_split.pkl")

In [9]:
# NOTE(review): writes to the same path the frame was loaded from, overwriting
# the input pickle with the version that has the extra `identifier_map` column.
train_human_data.to_pickle("dataset/all_human_test_prompt_split.pkl")

In [11]:
gen_identifier_map

33467    {'func': {'sphere_pick_polar': [4]}, 'var': {'...
27868    {'func': {'poly': [18], 'find_zero': [257]}, '...
35734    {'func': {'median': [4]}, 'var': {'cols': [342...
8191     {'func': {'multiply': [4], 'transpose': [1075,...
5453     {'func': {'compile_geometry': [4]}, 'var': {'l...
                               ...                        
27300    {'func': {'compare_one': [5]}, 'var': {'a': [1...
42139    {'func': {'_file_size': [4]}, 'var': {'file_pa...
8258     {'func': {'find_windows_executable': [4]}, 'va...
32160    {'func': {'_file_size': [4, 226]}, 'var': {'fi...
21995    {'func': {'median': [4]}, 'var': {'values': [1...
Name: full_code, Length: 10758, dtype: object

## APPS Extract

In [3]:
# pandas is already imported in the notebook's first import cell; repeated here.
import pandas as pd
# Load the APPS frames ("hum" / "gen"; presumably human-written vs. generated
# solutions - confirm).
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
hum_data = pd.read_pickle("dataset/apps_hum.pkl")
gen_data = pd.read_pickle("dataset/apps_gen.pkl")

In [25]:
# Build an identifier map for every `extracted_full_func` string in hum_data.
# Rebinds `human_identifier_map`, which earlier held the prompt-split result.
human_identifier_map = hum_data.extracted_full_func.apply(get_identifier_map)

In [26]:
# Attach the maps to hum_data as a new `identifier_map` column (mutates in place).
hum_data['identifier_map'] = human_identifier_map

In [27]:
def is_empty(x):
    """Return True when identifier map ``x`` has no 'func' and no 'var' entries."""
    entry_count = sum(len(x[kind]) for kind in ('func', 'var'))
    return entry_count == 0
# Source strings of the hum_data rows whose identifier map came back empty.
failed_cases = hum_data[hum_data['identifier_map'].apply(is_empty)].extracted_full_func.tolist()

In [28]:
failed_cases

["print(input().replace('2017','2018'))",
 'print("x"*len(input()))',
 'input()\nprint(int(\'\'.join("0" if c == "R" else "1" for c in input()[::-1]), 2))\n',
 'print("A" if input().isupper() else "a")',
 'if int(input()) == 0:\n    print((1))\nelse:\n    print((0))\n',
 'input();print("Four"if"Y"in input()else"Three")',
 "print('YNeos'[len(set(input()))==1::2])"]

In [29]:
# Build an identifier map for every `extracted_full_func` string in gen_data.
# Rebinds `gen_identifier_map`, which earlier held the prompt-split result.
gen_identifier_map = gen_data.extracted_full_func.apply(get_identifier_map)

In [30]:
# Attach the maps to gen_data as a new `identifier_map` column (mutates in place).
gen_data['identifier_map'] = gen_identifier_map

In [31]:
# NOTE(review): same predicate as the is_empty defined in the hum_data cell
# earlier in this notebook; consider defining it once.
def is_empty(x):
    """Return True when identifier map ``x`` has no 'func' and no 'var' entries."""
    entry_count = sum(len(x[kind]) for kind in ('func', 'var'))
    return entry_count == 0
# Source strings of the gen_data rows whose identifier map came back empty.
# NOTE(review): rebinds `failed_cases`, discarding the hum_data list built earlier;
# the result is not displayed in the visible cells.
failed_cases = gen_data[gen_data['identifier_map'].apply(is_empty)].extracted_full_func.tolist()

In [40]:
# NOTE(review): both writes go to the same paths the frames were loaded from,
# overwriting the input pickles with the versions that carry `identifier_map`.
hum_data.to_pickle("dataset/apps_hum.pkl")
gen_data.to_pickle("dataset/apps_gen.pkl")