In [None]:
!apt-get install -y -qq git
!git clone https://github.com/ConsenSys/python-solidity-parser.git
%ls
%cd python-solidity-parser
%ls
!pip install .
from solidity_parser import parser

import sys
import pprint

def parse_solidity_code(source_code):
    """Parse Solidity source text into an AST, or return None on failure.

    Any exception raised while calling ``parser.parse`` is reported on stdout
    and swallowed, so a single unparsable contract does not abort the caller.
    """
    try:
        return parser.parse(source_code)
    except Exception as parse_error:
        print(f"Error parsing Solidity code: {parse_error}")
    return None

import pandas as pd
from google.colab import drive

# Expose Google Drive under /content/drive so the dataset path below resolves.
drive.mount('/content/drive')

# Note: despite the variable name, the file loaded here is the validation split.
train_df = pd.read_csv(
    '/content/drive/MyDrive/Practicum/SascDatasets/validation_data_source_codes.csv'
)

# Solidity keywords and built-in identifiers that the serializer keeps verbatim;
# every other name is replaced by a placeholder.
# This is intentionally a list rather than a set: it is used with
# `value not in keywords`, and a list also tolerates unhashable values.
# Some entries appear in more than one group; that is harmless for membership tests.
solidity_keywords = [
    # Variable Types
    'address', 'bool', 'int', 'int8', 'int16', 'int32', 'int64', 'int128', 'int256', 'uint', 'uint8', 'uint16', 'uint32', 'uint64', 'uint128', 'uint256',
    'float', 'double', 'fixed', 'ufixed', 'byte', 'bytes', 'bytes1', 'bytes2', 'bytes4', 'bytes8', 'bytes16', 'bytes32', 'string', 'mapping', 'struct', 'enum',

    # Control Structures
    'if', 'else', 'while', 'do', 'for', 'switch', 'case', 'default', 'break', 'continue', 'return',
    'throw', 'require', 'revert', 'modifier',

    # Visibility Specifiers
    'public', 'external', 'internal', 'private',

    # Function Modifiers
    # The trailing comma after 'anonymous' matters: without it Python joins the
    # adjacent literals into the single bogus entry 'anonymousthis'.
    'pure', 'view', 'payable', 'constant', 'anonymous',

    # Special Keywords
    'this', 'super', 'selfdestruct', 'assembly',

    # Events
    'event', 'indexed',

    # Built-in Functions
    'msg', 'msg.sender', 'msg.value', 'sender', 'value', 'now', 'block', 'tx', 'origin', 'gasleft', 'assert', 'require', 'revert',
    'keccak256', 'sha256', 'ecrecover', 'addmod', 'mulmod', 'create', 'call', 'delegatecall', 'callcode', 'send',
    'staticcall', 'selfdestruct', 'balance', 'div', 'mod', 'exp', 'sqrt', 'gas', 'this', 'abs', 'min', 'max',
    'balance', 'transfer', 'block', 'number', 'blockHash', 'timestamp',

    # Other
    'constructor', 'fallback', 'receive', 'pragma', 'after', 'alias', 'apply', 'auto', 'case', 'copyof', 'default', 'defined', 'final', 'implements', 'in', 'inline', 'let',
    'macro', 'match', 'mutable', 'null', 'of', 'partial', 'promise', 'reference', 'relocatable', 'sealed', 'sizeof', 'static', 'supports', 'switch', 'typedef', 'typeof', 'var'
]

def get_function_nodes(ast):
    """Collect every function and modifier definition node in an AST.

    The tree is walked depth-first in key order. Only dict nodes are inspected;
    lists are searched when they are the direct value of a dict key, and any
    other value (including a non-dict root) is ignored.

    Returns:
        A list of the dict nodes whose ``type`` is ``FunctionDefinition`` or
        ``ModifierDefinition``, in the order they were encountered.
    """
    wanted_types = ('FunctionDefinition', 'ModifierDefinition')
    collected = []

    def visit(candidate):
        if not isinstance(candidate, dict):
            return
        if candidate.get('type') in wanted_types:
            collected.append(candidate)
        for child in candidate.values():
            if isinstance(child, dict):
                visit(child)
            elif isinstance(child, list):
                for element in child:
                    visit(element)

    visit(ast)
    return collected

def serialize_function_node(node, keywords):
    """Flatten one AST node (a dict) into a space-separated token string.

    Each key is emitted followed by its serialized value, in dict order:
      * ``name`` / ``namePath`` / ``memberName`` values that are not in
        ``keywords`` become ``XX``;
      * ``decl`` becomes ``parser_error`` and ``number`` becomes ``num_literal``;
      * the ``value`` of a string/hex literal or hex/decimal number node is
        replaced by a fixed placeholder, but only if the node's ``type`` key
        was seen before its ``value`` key;
      * nested dicts are serialized recursively, empty lists become
        ``empty_list``, and non-dict list items are skipped.
    """
    masked_name_keys = ('name', 'namePath', 'memberName')
    fixed_placeholders = {'decl': 'parser_error', 'number': 'num_literal'}
    literal_placeholders = (
        ('stringLiteral', 'string_literal'),
        ('hexLiteral', 'hex_literal'),
        ('HexNumber', 'hex_value'),
        ('DecimalNumber', 'dec_value'),
    )

    # Placeholder waiting to replace the next 'value' key, if any. A dict has a
    # single 'type' key, so at most one placeholder can be pending at a time.
    pending_literal = None
    tokens = []

    for key, value in node.items():
        if key in masked_name_keys:
            if value not in keywords:
                value = 'XX'
        elif key in fixed_placeholders:
            value = fixed_placeholders[key]
        elif key == 'type':
            for type_name, placeholder in literal_placeholders:
                if value == type_name:
                    pending_literal = placeholder
                    break
        elif key == 'value' and pending_literal is not None:
            value = pending_literal
            pending_literal = None

        if isinstance(value, dict):
            rendered = serialize_function_node(value, keywords)
        elif isinstance(value, list):
            if value:
                rendered = ' '.join(
                    serialize_function_node(element, keywords)
                    for element in value
                    if isinstance(element, dict)
                )
            else:
                rendered = 'empty_list'
        else:
            rendered = str(value)

        tokens.append(f'{key} {rendered}')

    return ' '.join(tokens)

def serialize_ast(ast, solidity_keywords):
    """Serialize all function/modifier definitions of an AST into one string.

    The definitions found by ``get_function_nodes`` are each flattened with
    ``serialize_function_node`` and joined with single spaces. An AST with no
    such definitions yields an empty string.
    """
    return ' '.join(
        serialize_function_node(definition, solidity_keywords)
        for definition in get_function_nodes(ast)
    )


def preprocessDataframe(df, startIndex, rowCount, output_path):
    """Serialize the ASTs of a slice of contracts and write them to a CSV.

    Each selected row's ``source_code`` is parsed with ``parse_solidity_code``
    and flattened with ``serialize_ast``. The resulting string is written next
    to the row's ``slither`` value, which is stored under the ``labels`` column.

    Args:
        df: DataFrame providing the ``source_code`` and ``slither`` columns.
        startIndex: Position of the first row to process.
        rowCount: Maximum number of rows to process. The window is clamped to
            the end of ``df``, so an oversized count processes the remaining
            rows instead of failing after all the work has been done.
        output_path: Destination CSV path; the file is written without the index.

    Notes:
        A row whose source cannot be parsed yields an empty sequence (the
        parser helper returns None and no definitions are found). A row whose
        serialization raises is stored as the literal ``"unserializable"``.
    """
    # Clamp so that an overshooting rowCount cannot raise midway and discard
    # every sequence computed so far (the CSV is only written at the end).
    endIndex = min(startIndex + rowCount, len(df))

    # Slice the two columns once instead of materializing a full row per
    # iteration with df.iloc[i].
    source_codes = df['source_code'].iloc[startIndex:endIndex]
    row_labels = df['slither'].iloc[startIndex:endIndex]

    processed_sequences = []
    for processed_count, source_code in enumerate(source_codes, start=1):
        # Step 1: Parse the source code
        ast = parse_solidity_code(source_code)

        # Step 2: Serialize the AST
        try:
            serialized_ast = serialize_ast(ast, solidity_keywords)
        except Exception as e:
            print("Error serializing AST: ", e)
            serialized_ast = "unserializable"

        processed_sequences.append(serialized_ast)

        # Print progress after every 500 rows
        if processed_count % 500 == 0:
            print(f"{processed_count} rows processed")

    # Create a new DataFrame with the processed sequences and 'slither' column
    processed_df = pd.DataFrame({'processed_sequence': processed_sequences,
                                 'labels': row_labels})

    # Save the new DataFrame to a CSV file
    processed_df.to_csv(output_path, index=False)

# Process the frame from its first row. rowCount is hard-coded; presumably it
# is the number of rows in train_df -- verify if the dataset changes.
startIndex, rowCount = 0, 10606
training_output_path = '/content/drive/MyDrive/Practicum/ASTDataChunked/validation_ast_sequences.csv'
preprocessDataframe(train_df, startIndex, rowCount, training_output_path)

Cloning into 'python-solidity-parser'...
remote: Enumerating objects: 198, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 198 (delta 55), reused 49 (delta 49), pack-reused 138[K
Receiving objects: 100% (198/198), 270.51 KiB | 3.14 MiB/s, done.
Resolving deltas: 100% (103/103), done.
[0m[01;34mdrive[0m/  [01;34mpython-solidity-parser[0m/  [01;34msample_data[0m/
/content/python-solidity-parser
README.md         [0m[01;34msamples[0m/  setup.py          [01;34msolidity_parser[0m/
requirements.txt  [01;34mscripts[0m/  [01;34msolidity-antlr4[0m/
Processing /content/python-solidity-parser
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting antlr4-python3-runtime==4.9.3 (from solidity-parser==0.1.1)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Prepa

line 1249:1 missing 'constant' at 'SExactGauge'
line 1249:13 mismatched input 'is' expecting '='
line 1249:30 mismatched input ',' expecting 'constant'
line 1249:45 mismatched input '{' expecting 'constant'
line 1252:19 mismatched input 'for' expecting 'constant'
line 1252:27 mismatched input ';' expecting 'constant'
line 1255:25 mismatched input 'for' expecting 'constant'
line 1255:36 mismatched input ';' expecting 'constant'
line 1261:12 extraneous input 'internal' expecting 'constant'
line 1264:12 extraneous input 'internal' expecting 'constant'
line 1267:12 extraneous input 'internal' expecting 'constant'
line 1270:12 extraneous input 'internal' expecting 'constant'
line 1273:12 extraneous input 'internal' expecting 'constant'
line 1279:12 mismatched input 'override' expecting 'constant'
line 1279:34 mismatched input ';' expecting 'constant'
line 1282:12 mismatched input 'override' expecting 'constant'
line 1282:37 mismatched input ';' expecting 'constant'
line 1285:12 mismatched i

Error parsing Solidity code: 'NoneType' object has no attribute 'getText'


line 1140:1 missing 'constant' at 'SExactGauge'
line 1140:13 mismatched input 'is' expecting '='
line 1140:30 mismatched input ',' expecting 'constant'
line 1140:45 mismatched input '{' expecting 'constant'
line 1143:19 mismatched input 'for' expecting 'constant'
line 1143:27 mismatched input ';' expecting 'constant'
line 1146:25 mismatched input 'for' expecting 'constant'
line 1146:36 mismatched input ';' expecting 'constant'
line 1152:12 extraneous input 'internal' expecting 'constant'
line 1155:12 extraneous input 'internal' expecting 'constant'
line 1158:12 extraneous input 'internal' expecting 'constant'
line 1161:12 extraneous input 'internal' expecting 'constant'
line 1167:12 mismatched input 'override' expecting 'constant'
line 1167:34 mismatched input ';' expecting 'constant'
line 1170:12 mismatched input 'override' expecting 'constant'
line 1170:37 mismatched input ';' expecting 'constant'
line 1173:12 mismatched input 'override' expecting 'constant'
line 1173:36 mismatched i

Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
500 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
1000 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
1500 rows processed


line 480:11 no viable alternative at input 'functionfallback'
line 487:4 extraneous input 'fallback' expecting {'~', 'from', '{', '}', '(', 'error', 'for', 'function', '[', 'address', 'mapping', 'calldata', 'if', 'try', 'while', 'unchecked', 'assembly', 'do', 'return', 'throw', 'emit', 'revert', 'var', 'bool', 'string', 'byte', '++', '--', 'new', '+', '-', 'after', 'delete', '!', 'callback', Int, Uint, Byte, Fixed, Ufixed, BooleanLiteral, DecimalNumber, HexNumber, HexLiteralFragment, 'break', 'continue', 'leave', 'payable', 'type', 'constructor', 'receive', Identifier, StringLiteralFragment}
line 780:11 no viable alternative at input 'functionfallback'


Error parsing Solidity code: 'NoneType' object is not subscriptable
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
2000 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
2500 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'


line 488:11 no viable alternative at input 'functionfallback'
line 496:4 extraneous input 'fallback' expecting {'~', 'from', '{', '}', '(', 'error', 'for', 'function', '[', 'address', 'mapping', 'calldata', 'if', 'try', 'while', 'unchecked', 'assembly', 'do', 'return', 'throw', 'emit', 'revert', 'var', 'bool', 'string', 'byte', '++', '--', 'new', '+', '-', 'after', 'delete', '!', 'callback', Int, Uint, Byte, Fixed, Ufixed, BooleanLiteral, DecimalNumber, HexNumber, HexLiteralFragment, 'break', 'continue', 'leave', 'payable', 'type', 'constructor', 'receive', Identifier, StringLiteralFragment}
line 782:11 no viable alternative at input 'functionfallback'


Error parsing Solidity code: 'NoneType' object is not subscriptable
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
3000 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
3500 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'


line 56:30 token recognition error at: '#'
line 56:29 extraneous input '&' expecting {'~', 'from', '(', 'error', '[', 'address', 'calldata', 'revert', 'var', 'bool', 'string', 'byte', '++', '--', 'new', '+', '-', 'after', 'delete', '!', 'callback', Int, Uint, Byte, Fixed, Ufixed, BooleanLiteral, DecimalNumber, HexNumber, HexLiteralFragment, 'leave', 'payable', 'type', 'constructor', 'receive', Identifier, StringLiteralFragment}
line 56:44 token recognition error at: '#'
line 56:43 mismatched input '&' expecting {'from', 'error', 'calldata', 'revert', 'callback', 'override', 'constant', 'immutable', 'leave', 'internal', 'payable', 'private', 'public', 'constructor', 'receive', Identifier}
line 57:26 token recognition error at: '#'
line 57:25 extraneous input '&' expecting {'~', 'from', '(', 'error', '[', 'address', 'calldata', 'revert', 'var', 'bool', 'string', 'byte', '++', '--', 'new', '+', '-', 'after', 'delete', '!', 'callback', Int, Uint, Byte, Fixed, Ufixed, BooleanLiteral, Decima

Error parsing Solidity code: 'NoneType' object has no attribute 'getText'
Error parsing Solidity code: maximum recursion depth exceeded in comparison
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
4000 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attrib

line 224:48 mismatched input '.' expecting {';', '{', 'returns'}
line 224:62 mismatched input '{' expecting {'from', 'error', 'calldata', 'revert', 'callback', 'override', 'constant', 'immutable', 'leave', 'internal', 'payable', 'private', 'public', 'constructor', 'receive', Identifier}
line 230:23 extraneous input '(' expecting {'from', 'error', 'calldata', 'revert', 'callback', 'override', 'constant', 'immutable', 'leave', 'internal', 'payable', 'private', 'public', 'constructor', 'receive', Identifier}
line 230:39 mismatched input '(' expecting {';', '='}
line 230:54 mismatched input ')' expecting {'from', 'error', 'calldata', 'revert', 'callback', 'override', 'constant', 'immutable', 'leave', 'internal', 'payable', 'private', 'public', 'constructor', 'receive', Identifier}
line 230:64 mismatched input '(' expecting {'from', 'error', 'calldata', 'revert', 'callback', 'override', 'constant', 'immutable', 'leave', 'internal', 'payable', 'private', 'public', 'constructor', 'receive', I

Error parsing Solidity code: 'NoneType' object has no attribute 'getText'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
4500 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'list' object has no attribute 'getText'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'i

line 420:11 no viable alternative at input 'functionfallback'
line 427:4 extraneous input 'fallback' expecting {'~', 'from', '{', '}', '(', 'error', 'for', 'function', '[', 'address', 'mapping', 'calldata', 'if', 'try', 'while', 'unchecked', 'assembly', 'do', 'return', 'throw', 'emit', 'revert', 'var', 'bool', 'string', 'byte', '++', '--', 'new', '+', '-', 'after', 'delete', '!', 'callback', Int, Uint, Byte, Fixed, Ufixed, BooleanLiteral, DecimalNumber, HexNumber, HexLiteralFragment, 'break', 'continue', 'leave', 'payable', 'type', 'constructor', 'receive', Identifier, StringLiteralFragment}
line 570:11 no viable alternative at input 'functionfallback'


Error parsing Solidity code: 'NoneType' object is not subscriptable
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error pa

line 4354:138 missing ';' at '('
line 4354:213 no viable alternative at input '[]'
line 4354:225 no viable alternative at input '[]'
line 4354:239 mismatched input ')' expecting {'from', 'error', 'memory', 'storage', 'calldata', 'revert', 'callback', 'leave', 'payable', 'constructor', 'receive', Identifier}


Error parsing Solidity code: 'NoneType' object has no attribute 'getText'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
7000 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attri

In [1]:
import pandas as pd
from google.colab import drive

# Mount Drive, then read back the serialized AST sequences written earlier
# and report the (rows, columns) shape as a sanity check.
drive.mount('/content/drive')
train_df = pd.read_csv(
    '/content/drive/MyDrive/Practicum/ASTDataChunked/validation_ast_sequences.csv'
)

print(train_df.shape)


Mounted at /content/drive
(10606, 2)


In [2]:
# Show the first two rows to eyeball the serialized sequences and their labels.
print(train_df.head(n=2))

                                  processed_sequence        labels
0  type FunctionDefinition name constructor param...           [4]
1  type ModifierDefinition name XX parameters typ...  [5, 3, 2, 1]


In [4]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/Practicum/ASTDataChunked/validation_ast_sequences.csv'
print(f"reading file as: {path}")
train_df = pd.read_csv(path)
print(f'shape: {train_df.shape}')

# Keep only rows whose sequence is an actual string; anything else (e.g. the
# NaN that pandas reads for an empty cell) is dropped.
train_df = train_df[
    train_df['processed_sequence'].apply(lambda cell: isinstance(cell, str))
]
print(f'shape after removing NaN values: {train_df.shape}')

import pickle

# Restore the previously pickled tokenizer (presumably fitted on the training
# split, judging by the file name).
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
tokenizer_filename = '/content/drive/MyDrive/Practicum/Utils/ast_tokenizer_train.pkl'
with open(tokenizer_filename, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

vocabulary_size = len(tokenizer.word_index)
print("Vocabulary size:", vocabulary_size)

# Map every serialized sequence to its list of integer token ids.
sequences = tokenizer.texts_to_sequences(train_df['processed_sequence'])

from keras.utils import pad_sequences

# Fixed sequence length: shorter sequences are padded at the end and longer
# ones are cut at the end.
MSL = 21500
padded_sequences = pad_sequences(
    sequences,
    maxlen=MSL,
    padding='post',
    truncating='post',
)

from sklearn.preprocessing import MultiLabelBinarizer
import ast
import numpy as np

# Label ids; each one becomes a column of the binary label matrix.
labels = [0, 1, 2, 3, 4, 5]

def convertLabelListToArray(labelList, classes=tuple(labels)):
    """Convert stringified label lists into a binary indicator matrix.

    Args:
        labelList: Iterable (e.g. a pandas Series) of strings such as
            ``"[5, 3, 2, 1]"``. An empty list (``"[]"``) is accepted and
            produces a row with no label set.
        classes: Label ids, one per output column. The default is a snapshot
            of ``labels`` taken when the function is defined, so rebinding the
            module-level name ``labels`` later does not change the columns.

    Returns:
        The result of ``MultiLabelBinarizer.fit_transform``: one row per input
        entry and one column per class.
    """
    def parse_label_ids(text):
        # Skip blank tokens so that "[]" parses to an empty list instead of
        # failing on int('').
        return [int(token) for token in text.strip('[]').split(',') if token.strip()]

    labelIds = [parse_label_ids(text) for text in labelList]
    mlb = MultiLabelBinarizer(classes=list(classes))
    binaryLabels = mlb.fit_transform(labelIds)
    return binaryLabels

# Wrap the padded token ids in a DataFrame (used below for the shape report).
sequence_pd = pd.DataFrame(padded_sequences)

# Binary label matrix for the filtered rows.
binary_labels = convertLabelListToArray(train_df['labels'])
# Note: this rebinds the name `labels`, which previously held the list of class ids.
labels = pd.DataFrame(binary_labels)

print(sequence_pd.shape)
print(labels.shape)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
reading file as: /content/drive/MyDrive/Practicum/ASTDataChunked/validation_ast_sequences.csv
shape: (10606, 2)
shape after removing NaN values: (10175, 2)
Vocabulary size: 1742
(10175, 21500)
(10175, 6)


In [5]:
import h5py

# Write the validation arrays to a fresh HDF5 file (mode 'w').
hdf5_path = '/content/drive/MyDrive/Practicum/HDF5DataAst/validation.h5'
with h5py.File(hdf5_path, 'w') as hdf5_file:
    # 'data' holds the padded token-id sequences.
    hdf5_file.create_dataset('data', data=padded_sequences)

    # 'labels' holds the binary label matrix.
    hdf5_file.create_dataset('labels', data=binary_labels)

In [6]:
# Reopen the validation file read-only and check the stored shapes.
# The names say "training", but the file opened here is validation.h5.
# The handle is left open; the dataset objects below are obtained from it.
hdf5_file_training = h5py.File(
    '/content/drive/MyDrive/Practicum/HDF5DataAst/validation.h5',
    'r',
)
training_data = hdf5_file_training['data']
training_labels = hdf5_file_training['labels']
print(training_data.shape)
print(training_labels.shape)

(10175, 21500)
(10175, 6)
