In [1]:
import os
import clang
from clang.cindex import *
from copy import deepcopy
import time

In [2]:
Config.set_library_file("/home/dipu/anaconda3/lib/python3.9/site-packages/clang/native/libclang.so")

In [32]:
index = Index.create()
tu = index.parse("operator.c")
root_cursor = tu.cursor

In [33]:
def print_ast(node, indent):
    try:
        current_name = "".join([node.spelling for x in list(node.get_tokens())]) if len(list(node.get_tokens())) > 0 else node.spelling
        # print(" "*indent + node.spelling + " " + str(node.kind) + " " + str(node.type.spelling))
        print(" "*indent + node.spelling + " " + str(node.kind))
        for c in node.get_children():
            print_ast(c, indent+2)
    except ValueError:
        pass

print_ast(root_cursor, 0)

operator.c CursorKind.TRANSLATION_UNIT
  __u_char CursorKind.TYPEDEF_DECL
  __u_short CursorKind.TYPEDEF_DECL
  __u_int CursorKind.TYPEDEF_DECL
  __u_long CursorKind.TYPEDEF_DECL
  __int8_t CursorKind.TYPEDEF_DECL
  __uint8_t CursorKind.TYPEDEF_DECL
  __int16_t CursorKind.TYPEDEF_DECL
  __uint16_t CursorKind.TYPEDEF_DECL
  __int32_t CursorKind.TYPEDEF_DECL
  __uint32_t CursorKind.TYPEDEF_DECL
  __int64_t CursorKind.TYPEDEF_DECL
  __uint64_t CursorKind.TYPEDEF_DECL
  __int_least8_t CursorKind.TYPEDEF_DECL
    __int8_t CursorKind.TYPE_REF
  __uint_least8_t CursorKind.TYPEDEF_DECL
    __uint8_t CursorKind.TYPE_REF
  __int_least16_t CursorKind.TYPEDEF_DECL
    __int16_t CursorKind.TYPE_REF
  __uint_least16_t CursorKind.TYPEDEF_DECL
    __uint16_t CursorKind.TYPE_REF
  __int_least32_t CursorKind.TYPEDEF_DECL
    __int32_t CursorKind.TYPE_REF
  __uint_least32_t CursorKind.TYPEDEF_DECL
    __uint32_t CursorKind.TYPE_REF
  __int_least64_t CursorKind.TYPEDEF_DECL
    __int64_t CursorKind.TYPE_R

In [35]:
operator_cursors = []

def get_binary_expressions(node, parent, grandparent, result):
    try:
        if node.kind == CursorKind.BINARY_OPERATOR:

            children_list = [i for i in node.get_children()]

            if len(children_list) == 2:
                left_offset = len([i for i in children_list[0].get_tokens()])
                #print([i.spelling for i in children_list[0].get_tokens()], [i.spelling for i in node.get_tokens()])
                operator_name = [i for i in node.get_tokens()][left_offset].spelling
                
                operator_cursors.append(grandparent)

                current_operation = {
                                     "operator": operator_name,
                                     "operands": [],
                                     "parent": parent.kind.name if parent is not None else "",
                                     "grandparent": grandparent.kind.name if grandparent is not None else "",
                                     "complex_operand": False,
                                     "location": node.extent
                                    }

                for c in children_list:
                    """ To only allow binary operation between single operators on left and right """
                    if c.kind == CursorKind.BINARY_OPERATOR or c.kind == CursorKind.PAREN_EXPR:
                        # current_operation = {}
                        current_operation["complex_operand"] = True
                        # break

                    operand = "".join([x.spelling for x in list(c.get_tokens())]) if len(list(c.get_tokens())) > 0 else c.spelling
                    
                    # to keep starting and ending quotes attached after exporting to csv format
                    if len(operand) >= 3 and operand.startswith('\"') and operand.endswith('\"'):
                        operand = '\"' + operand[1:-1].replace('\"', "\"\"") + '\"'

                    current_operation["operands"].append({"name": operand, "data_type": c.type.spelling, "cursor_kind": c.kind.name})

                # if current_operation != {}:
                result.append(current_operation)
        
        for c in node.get_children():
            get_binary_expressions(c, node, parent, result)

    except ValueError:
        pass

In [36]:
binary_operation_list = []

get_binary_expressions(root_cursor, None, None, binary_operation_list)

binary_operation_list

[{'operator': '*',
  'operands': [{'name': 'a',
    'data_type': 'int',
    'cursor_kind': 'UNEXPOSED_EXPR'},
   {'name': 'b', 'data_type': 'int', 'cursor_kind': 'UNEXPOSED_EXPR'}],
  'parent': 'UNEXPOSED_EXPR',
  'grandparent': 'VAR_DECL',
  'complex_operand': False,
  'location': <SourceRange start <SourceLocation file 'operator.c', line 7, column 15>, end <SourceLocation file 'operator.c', line 7, column 20>>}]

In [37]:
operator_cursors

[<clang.cindex.Cursor at 0x7f00c04aabc0>]

In [38]:
import graphviz

def ast_to_dot(node, dot):
    dot.node(str(node.hash), node.kind.name)
    for child in node.get_children():
        dot.edge(str(node.hash), str(child.hash))
        ast_to_dot(child, dot)

def generate_ast_graph(root):
    dot = graphviz.Digraph()
    ast_to_dot(root, dot)
    return dot

dot = generate_ast_graph(operator_cursors[0])
print(dot.render('example', view=True))

example.pdf


In [17]:
for operation in binary_operation_list:
    loc = operation["location"]
            
    positive_sample = [operation["operands"][0]["name"], operation["operator"], operation["operands"][1]["name"],
                      operation["operands"][0]["data_type"], operation["operands"][1]["data_type"],
                      operation["parent"], operation["grandparent"], str(operation["complex_operand"]),
                        str(loc.start.line), str(loc.start.column), str(loc.end.line), str(loc.end.column), str(0)]

    positive_sample = "\t".join(positive_sample)

    print(positive_sample)
    
    

a	*	b	int	int	UNEXPOSED_EXPR	VAR_DECL	False	7	15	7	20	0
a	<=	b	int	int	VAR_DECL	DECL_STMT	False	8	13	8	19	0


----
## Negative Sample Generation - Wrong Binary Operator
----

In [19]:
def generate_wrong_binary_operator_dataset(root_dir):
    total_files, total_samples = 0, 0
    global current_file
    
    with open("wrong_binary_operator_dataset.csv", 'a') as wrong_binary_operator_dataset:
        wrong_binary_operator_dataset.write("file\tleft\toperator\tright\ttype_left\ttype_right\tparent\tgrandparent\tcomplex_operand\tstart_line\tstart_column\tend_line\tend_column\tlabels")
        
        for root, dirs, files in os.walk(root_dir):
                for file in files:
                    if file.endswith(".c"):
                        total_files += 1

                        file_path = os.path.join(root, file)
                        current_file = file_path

                        if total_files > 0:
                            
                            with open(file_path, 'rb') as f:
                                # for debugging
                                # print(f.name)
                                # time.sleep(0.1)
                                
                                content = str(f.read())

                                if content.count("\\n") <= 10_000:
                                    
                                    try:
                                        start_cursor = index.parse(file_path).cursor

                                        binary_operation_list = []

                                        get_binary_expressions(start_cursor, None, None, binary_operation_list)
                                        
                                        for operation in binary_operation_list:
                                            if (len(operation["operands"][0]["name"]) <= 100) and \
                                                (len(operation["operands"][1]["name"]) <= 100): #and operation["complex_operand"] == False:
                                                
                                                loc = operation["location"]
                                                    
                                                """positive_sample = [f.name.split("/AI/MinorProject/c-corpus/")[1], operation["operands"][0]["name"],
                                                                   operation["operator"], operation["operands"][1]["name"],
                                                                  operation["operands"][0]["data_type"], operation["operands"][1]["data_type"],
                                                                  operation["parent"], operation["grandparent"], str(operation["complex_operand"]),
                                                                  str(loc.start.line), str(loc.start.column), str(loc.end.line), str(loc.end.column), str(0)]"""
                                                
                                                positive_sample = [f.name.split("/AI/MinorProject/c-corpus/")[1], operation["operands"][0]["name"],
                                                                   operation["operator"], operation["operands"][1]["name"],
                                                                  operation["operands"][0]["data_type"], operation["operands"][1]["data_type"],
                                                                  operation["parent"], operation["grandparent"], str(operation["complex_operand"]),
                                                                  str(loc.start.line), str(loc.start.column), str(loc.end.line), str(loc.end.column), str(0)]

                                                positive_sample = "\t".join(positive_sample)

                                                wrong_binary_operator_dataset.write("\n" + positive_sample)
                                                # print(positive_sample)

                                                total_samples += 1
                                                
                                    except Exception as e:
                                        # print("---Error occurred---", str(e))
                                        pass

                        current_file = file_path

                        if total_files % 1000 == 0:
                            print("Total files:", total_files, ",", "Total samples:", total_samples)

In [20]:
root_dir = '/home/dipu/Documents/AI/MinorProject/c-corpus/'

generate_wrong_binary_operator_dataset(root_dir)

Total files: 1000 , Total samples: 3097
Total files: 2000 , Total samples: 10851
Total files: 3000 , Total samples: 16445
Total files: 4000 , Total samples: 22357
Total files: 5000 , Total samples: 27414
Total files: 6000 , Total samples: 31695
Total files: 7000 , Total samples: 34797
Total files: 8000 , Total samples: 45099
Total files: 9000 , Total samples: 54480
Total files: 10000 , Total samples: 67944
Total files: 11000 , Total samples: 105509
Total files: 12000 , Total samples: 133809
Total files: 13000 , Total samples: 162833
Total files: 14000 , Total samples: 191701
Total files: 15000 , Total samples: 210414
Total files: 16000 , Total samples: 221651
Total files: 17000 , Total samples: 242592
Total files: 18000 , Total samples: 262181
Total files: 19000 , Total samples: 276865
Total files: 20000 , Total samples: 283896
Total files: 21000 , Total samples: 296265
Total files: 22000 , Total samples: 304324
Total files: 23000 , Total samples: 311887
Total files: 24000 , Total samp

Exception ignored on calling ctypes callback functionException ignored in sys.unraisablehook: <built-in function unraisablehook>
Traceback (most recent call last):
  File "/home/dipu/anaconda3/lib/python3.9/site-packages/ipykernel/iostream.py", line 563, in write
    self._schedule_flush()
  File "/home/dipu/anaconda3/lib/python3.9/site-packages/ipykernel/iostream.py", line 469, in _schedule_flush
    self.pub_thread.schedule(_schedule_in_thread)
  File "/home/dipu/anaconda3/lib/python3.9/site-packages/ipykernel/iostream.py", line 207, in schedule
    if self.thread.is_alive():
  File "/home/dipu/anaconda3/lib/python3.9/threading.py", line 1135, in is_alive
    if self._is_stopped or not self._started.is_set():
RecursionError: maximum recursion depth exceeded
Exception ignored on calling ctypes callback function: <function Cursor.get_children.<locals>.visitor at 0x7f27f75cd670>
Traceback (most recent call last):
  File "/home/dipu/anaconda3/lib/python3.9/site-packages/clang/cindex.py",

Total files: 43000 , Total samples: 587697
Total files: 44000 , Total samples: 593729
Total files: 45000 , Total samples: 597309
Total files: 46000 , Total samples: 598407
Total files: 47000 , Total samples: 598407
Total files: 48000 , Total samples: 603698


terminate called after throwing an instance of 'std::bad_alloc'
  what():  std::bad_alloc
libclang: crash detected during parsing: {
  'source_filename' : '/home/dipu/Documents/AI/MinorProject/c-corpus/cleaned/playbook-dev-tools/bootstrap/gcc/gcc/gcc/testsuite/gcc.dg/large-size-array-4.c'
  'command_line_args' : ['clang'],
  'unsaved_files' : [],
  'options' : 0,
}
terminate called recursively
libclang: crash detected during parsing: {
  'source_filename' : '/home/dipu/Documents/AI/MinorProject/c-corpus/cleaned/playbook-dev-tools/bootstrap/gcc/gcc/gcc/testsuite/gcc.dg/large-size-array-2.c'
  'command_line_args' : ['clang'],
  'unsaved_files' : [],
  'options' : 0,
}


Total files: 49000 , Total samples: 607921
Total files: 50000 , Total samples: 621796
Total files: 51000 , Total samples: 626070
Total files: 52000 , Total samples: 629170
Total files: 53000 , Total samples: 635144
Total files: 54000 , Total samples: 639364
Total files: 55000 , Total samples: 642566
Total files: 56000 , Total samples: 653929
Total files: 57000 , Total samples: 656929
Total files: 58000 , Total samples: 657597
Total files: 59000 , Total samples: 658210
Total files: 60000 , Total samples: 662608
Total files: 61000 , Total samples: 672881
Total files: 62000 , Total samples: 677392
Total files: 63000 , Total samples: 684429
Total files: 64000 , Total samples: 695166
Total files: 65000 , Total samples: 701153
Total files: 66000 , Total samples: 707740
Total files: 67000 , Total samples: 707893
Total files: 68000 , Total samples: 708176
Total files: 69000 , Total samples: 715061
Total files: 70000 , Total samples: 721425
Total files: 71000 , Total samples: 724335
Total files

Total files: 234000 , Total samples: 2197858
Total files: 235000 , Total samples: 2203464
Total files: 236000 , Total samples: 2214699
Total files: 237000 , Total samples: 2221592
Total files: 238000 , Total samples: 2229804
Total files: 239000 , Total samples: 2237749
Total files: 240000 , Total samples: 2250557
Total files: 241000 , Total samples: 2260689
Total files: 242000 , Total samples: 2263480
Total files: 243000 , Total samples: 2272432
Total files: 244000 , Total samples: 2274594
Total files: 245000 , Total samples: 2276975
Total files: 246000 , Total samples: 2287004
Total files: 247000 , Total samples: 2292212
Total files: 248000 , Total samples: 2295940
Total files: 249000 , Total samples: 2299442
Total files: 250000 , Total samples: 2309156
Total files: 251000 , Total samples: 2316920
Total files: 252000 , Total samples: 2322128
Total files: 253000 , Total samples: 2326767
Total files: 254000 , Total samples: 2339492
Total files: 255000 , Total samples: 2345899
Total file

Total files: 417000 , Total samples: 4243956
Total files: 418000 , Total samples: 4256647
Total files: 419000 , Total samples: 4271755
Total files: 420000 , Total samples: 4286797
Total files: 421000 , Total samples: 4306884
Total files: 422000 , Total samples: 4328107
Total files: 423000 , Total samples: 4334137
Total files: 424000 , Total samples: 4338834
Total files: 425000 , Total samples: 4342820
Total files: 426000 , Total samples: 4349098
Total files: 427000 , Total samples: 4355124
Total files: 428000 , Total samples: 4364935
Total files: 429000 , Total samples: 4378144
Total files: 430000 , Total samples: 4381830
Total files: 431000 , Total samples: 4388190
Total files: 432000 , Total samples: 4401253
Total files: 433000 , Total samples: 4406427
Total files: 434000 , Total samples: 4410972
Total files: 435000 , Total samples: 4421012
Total files: 436000 , Total samples: 4422185
Total files: 437000 , Total samples: 4429459
Total files: 438000 , Total samples: 4436288
Total file

Exception ignored on calling ctypes callback functionException ignored in sys.unraisablehook: <built-in function unraisablehook>
Traceback (most recent call last):
  File "/home/dipu/anaconda3/lib/python3.9/site-packages/ipykernel/iostream.py", line 563, in write
    self._schedule_flush()
  File "/home/dipu/anaconda3/lib/python3.9/site-packages/ipykernel/iostream.py", line 469, in _schedule_flush
    self.pub_thread.schedule(_schedule_in_thread)
  File "/home/dipu/anaconda3/lib/python3.9/site-packages/ipykernel/iostream.py", line 207, in schedule
    if self.thread.is_alive():
  File "/home/dipu/anaconda3/lib/python3.9/threading.py", line 1135, in is_alive
    if self._is_stopped or not self._started.is_set():
RecursionError: maximum recursion depth exceeded
Exception ignored on calling ctypes callback function: <function Cursor.get_children.<locals>.visitor at 0x7f27f750be50>
Traceback (most recent call last):
  File "/home/dipu/anaconda3/lib/python3.9/site-packages/clang/cindex.py",

Total files: 546000 , Total samples: 5347851
Total files: 547000 , Total samples: 5353336
Total files: 548000 , Total samples: 5357024
Total files: 549000 , Total samples: 5357892
Total files: 550000 , Total samples: 5357892
Total files: 551000 , Total samples: 5363615


terminate called recursively
libclang: crash detected during parsing: {
  'source_filename' : '/home/dipu/Documents/AI/MinorProject/c-corpus/cleaned/gcc/gcc/testsuite/gcc.dg/large-size-array-4.c'
  'command_line_args' : ['clang'],
  'unsaved_files' : [],
  'options' : 0,
}
terminate called recursively
libclang: crash detected during parsing: {
  'source_filename' : '/home/dipu/Documents/AI/MinorProject/c-corpus/cleaned/gcc/gcc/testsuite/gcc.dg/large-size-array-2.c'
  'command_line_args' : ['clang'],
  'unsaved_files' : [],
  'options' : 0,
}


Total files: 552000 , Total samples: 5367832
Total files: 553000 , Total samples: 5381749
Total files: 554000 , Total samples: 5386098
Total files: 555000 , Total samples: 5389198
Total files: 556000 , Total samples: 5395997
Total files: 557000 , Total samples: 5404142
Total files: 558000 , Total samples: 5421112
Total files: 559000 , Total samples: 5432336
Total files: 560000 , Total samples: 5437714
Total files: 561000 , Total samples: 5449873
Total files: 562000 , Total samples: 5453217
Total files: 563000 , Total samples: 5460585
Total files: 564000 , Total samples: 5475397
Total files: 565000 , Total samples: 5480068
Total files: 566000 , Total samples: 5488537
Total files: 567000 , Total samples: 5490275
Total files: 568000 , Total samples: 5510476
Total files: 569000 , Total samples: 5518247
Total files: 570000 , Total samples: 5526542
Total files: 571000 , Total samples: 5528715
Total files: 572000 , Total samples: 5530085
Total files: 573000 , Total samples: 5532614
Total file

-----
## Negative Sample Generation
-----

In [30]:
import numpy as np
import pandas as pd
import random

In [None]:
# error occurs while running line below due to bad data in some columns
# some cells after this will find those rows and remove them
df = pd.read_csv("wrong_binary_operator_dataset.csv", sep="\t")

In [4]:
with open("wrong_binary_operator_dataset_filtered.csv", "r") as f:
    content = f.read()
    content = content.split("\n")
    print("Total:", len(content))
    s = {}
    
    for line in content:
        total = len(line.split("\t"))
        
        if total in s.keys():
            s[total] += 1
        else:
            s[total] = 1
        
#         if(total == 1):
#             print(line.split("\t"))

    print(s)

Total: 1
{14: 1}


In [4]:
with open("wrong_binary_operator_dataset_filtered.csv", "w") as write_file:
    write_file.write("file\tleft\toperator\tright\ttype_left\ttype_right\tparent\tgrandparent\tcomplex_operand\tstart_line\tstart_column\tend_line\tend_column\tlabels\n")
    
    with open("wrong_binary_operator_dataset.csv", "r") as f:
        total, count = 0, 0
        content = f.readlines()

        for line in content:
            tokens = line.split("\t")

            if(len(tokens) == 14):
                if count >= 1:
                    write_file.write(line)
                count += 1
                
            total += 14
            
#             if total <= 15:
#                 print(line)
#             else:
#                 break
                
        print(total, count)

6855215 6854855


In [None]:
df = pd.read_csv("wrong_binary_operator_dataset_filtered.csv", sep="\t")
df

In [None]:
# with open("wrong_binary_operator_dataset_filtered.csv", "w") as write_file:
#     write_file.write("file\tfunction_name\targ1\targ2\targ_type\tparam1\tparam2\tlabels\n")
    
#     with open("function_binary_args_swap_dataset.csv", "r") as f:
#         total, count = 0, 0
#         content = f.readlines()

#         for line in content:
#             tokens = line.split("\t")

#             if(len(tokens) == 8 and tokens[-1] == "0\n"):
#                 write_file.write(line)
#                 count += 1
                
#             total += 1
                
#         print(total, count)

In [None]:
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# df

In [None]:
df.columns[1:-5]

In [None]:
df.drop_duplicates(subset=df.columns[1:-1], ignore_index=True, inplace=True)
df

In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
df_train

In [None]:
df_test

In [None]:
swappable_operand = {
    "==": ["<", ">", "<=", ">=", "!="],
    "<": ["==", ">", "<=", ">=", "!="],
    ">": ["==", "<", "<=", ">=", "!="],
    "<=": ["==", "<", ">", ">=", "!="],
    ">=": ["==", "<", ">", "<=", "!="],
    "!=": ["==", "<", ">", "<=", ">="]
}

In [None]:
def generate_random_swappable_operand(operand):
    possible_swaps = swappable_operand[operand]
    rd = random.randint(0, len(possible_swaps) - 1)
    return possible_swaps[rd]

In [None]:
print("Total:", len(df_train.index))

neg_train_array = []
count = 0

for index, row in df_train.iterrows():
    row["operand"] = generate_random_swappable_operand(row["operand"])
    
    neg_train_array.append(row.values)
    
    count += 1
    if count % 25000 == 0:
        print(count)

df_train_neg = pd.DataFrame(neg_train_array, columns=df.columns)
df_train = pd.concat([df_train, df_train_neg])
df_train

In [None]:
print("Total:", len(df_test.index))

neg_test_array = []
count = 0

for index, row in df_test.iterrows():
    row["operand"] = generate_random_swappable_operand(row["operand"])
    
    neg_test_array.append(row.values)
    
    count += 1
    if count % 25000 == 0:
        print(count)

df_test_neg = pd.DataFrame(neg_test_array, columns=df.columns)
df_test = pd.concat([df_test, df_test_neg])
df_test

In [29]:
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_train

3

In [None]:
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
df_test

In [None]:
df_train.to_csv("wrong_binary_operator_dataset_filtered_train.csv", sep="\t", index=False)
df_test.to_csv("wrong_binary_operator_dataset_filtered_test.csv", sep="\t", index=False)

In [None]:
pd.read_csv("wrong_binary_operator_dataset_filtered_train.csv", sep="\t")

In [None]:
pd.read_csv("wrong_binary_operator_dataset_filtered_train.csv", sep="\t")