In [44]:
!apt-get install -y -qq git
!git clone https://github.com/ConsenSys/python-solidity-parser.git
%ls
%cd python-solidity-parser
%ls
!pip install .
from solidity_parser import parser

import sys
import pprint

def parse_solidity_code(source_code):
    try:
        ast = parser.parse(source_code)
        return ast
    except Exception as e:
        print(f"Error parsing Solidity code: {e}",)
        return None

import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

train_df = pd.read_csv('/content/drive/MyDrive/Practicum/SascDatasets/train_data_source_codes.csv')

# Set of Solidity keywords
solidity_keywords = [
    # Variable Types
    'address', 'bool', 'int', 'int8', 'int16', 'int32', 'int64', 'int128', 'int256', 'uint', 'uint8', 'uint16', 'uint32', 'uint64', 'uint128', 'uint256',
    'float', 'double', 'fixed', 'ufixed', 'byte', 'bytes', 'bytes1', 'bytes2', 'bytes4', 'bytes8', 'bytes16', 'bytes32', 'string', 'mapping', 'struct', 'enum',

    # Control Structures
    'if', 'else', 'while', 'do', 'for', 'switch', 'case', 'default', 'break', 'continue', 'return',
    'throw', 'require', 'revert', 'modifier',

    # Visibility Specifiers
    'public', 'external', 'internal', 'private',

    # Function Modifiers
    'pure', 'view', 'payable', 'constant', 'anonymous'

    # Special Keywords
    'this', 'super', 'selfdestruct', 'assembly',

    # Events
    'event', 'indexed',

    # Built-in Functions
    'msg', 'msg.sender', 'msg.value', 'sender', 'value', 'now', 'block', 'tx', 'origin', 'gasleft', 'assert', 'require', 'revert',
    'keccak256', 'sha256', 'ecrecover', 'addmod', 'mulmod', 'create', 'call', 'delegatecall', 'callcode', 'send',
    'staticcall', 'selfdestruct', 'balance', 'div', 'mod', 'exp', 'sqrt', 'gas', 'this', 'abs', 'min', 'max',
    'balance', 'transfer', 'block', 'number', 'blockHash', 'timestamp',

    # Other
    'constructor', 'fallback', 'receive', 'pragma', 'after', 'alias', 'apply', 'auto', 'case', 'copyof', 'default', 'defined', 'final', 'implements', 'in', 'inline', 'let',
    'macro', 'match', 'mutable', 'null', 'of', 'partial', 'promise', 'reference', 'relocatable', 'sealed', 'sizeof', 'static', 'supports', 'switch', 'typedef', 'typeof', 'var'
]

def get_function_nodes(ast):
    function_nodes = []

    def traverse(node):
        if isinstance(node, dict):
            if 'type' in node and node['type'] == 'FunctionDefinition':
                function_nodes.append(node)
            elif 'type' in node and node['type'] == 'ModifierDefinition':
                function_nodes.append(node)
            for key, value in node.items():
                if isinstance(value, list):
                    for item in value:
                        traverse(item)
                elif isinstance(value, dict):
                    traverse(value)

    traverse(ast)
    return function_nodes

def serialize_function_node(node, keywords):
    serialized_node = []
    has_string_literal_type = False
    has_hex_literal_type = False
    has_hex_number_type = False
    has_decimal_number_type = False

    for key, value in node.items():
        if key == 'name' and value not in keywords:
            value = 'XX'
        elif key == 'namePath' and value not in keywords:
            value = 'XX'
        elif key == 'memberName' and value not in keywords:
            value = 'XX'
        # elif key == 'functionName' and value not in keywords:
        #     value = 'assembly_func'
        elif key == 'decl':
            value = 'parser_error'
        elif key == 'number':
            value = 'num_literal'
        elif key == 'type' and value == 'stringLiteral':
            has_string_literal_type = True
        elif key == 'type' and value == 'hexLiteral':
            has_hex_literal_type = True
        elif key == 'type' and value == 'HexNumber':
            has_hex_number_type = True
        elif key == 'type' and value == 'DecimalNumber':
            has_decimal_number_type = True
        elif key == 'value' and has_string_literal_type:
            value = 'string_literal'
            has_string_literal_type = False
        elif key == 'value' and has_hex_literal_type:
            value = 'hex_literal'
            has_hex_literal_type = False
        elif key == 'value' and has_hex_number_type:
            value = 'hex_value'
            has_hex_number_type = False
        elif key == 'value' and has_decimal_number_type:
            value = 'dec_value'
            has_decimal_number_type = False

        if isinstance(value, list):
            if not value:
                serialized_value = 'empty_list'
            else:
                serialized_value = ' '.join(serialize_function_node(item, keywords) for item in value if isinstance(item, dict))
        elif isinstance(value, dict):
            serialized_value = serialize_function_node(value, keywords)
        else:
            serialized_value = str(value)
        serialized_node.append(f'{key} {serialized_value}')
    return ' '.join(serialized_node)

def serialize_ast(ast, solidity_keywords):

    function_nodes = get_function_nodes(ast)
    serialized_ast = [serialize_function_node(node, solidity_keywords) for node in function_nodes]
    return ' '.join(serialized_ast)


def preprocessDataframe(df, startIndex, rowCount, output_path):
    processed_sequences = []
    for i in range(startIndex, startIndex+rowCount):
        # Step 1: Parse the source code
        ast = parse_solidity_code(df.iloc[i]['source_code'])

        # Step 2: Serialize the AST
        try:
            serialized_ast = serialize_ast(ast, solidity_keywords)
        except Exception as e:
            print("Error serializing AST: ", e)
            serialized_ast = "unserializable"

        processed_sequences.append(serialized_ast)

        # Print progress after every 1000 rows
        if (i - startIndex + 1) % 500 == 0:
            print(f"{i - startIndex + 1} rows processed")

    # Create a new DataFrame with the processed sequences and 'slither' column
    processed_df = pd.DataFrame({'processed_sequence': processed_sequences,
                                 'labels': df['slither'].iloc[startIndex:startIndex + rowCount]})

    # Save the new DataFrame to a CSV file
    processed_df.to_csv(output_path, index=False)

csvIndex = 3
startIndex = 45000
rowCount = 15000
training_output_path = f'/content/drive/MyDrive/Practicum/ASTDataChunked/training_ast_sequences_{csvIndex}.csv'
preprocessDataframe(train_df, startIndex, rowCount, training_output_path)

Cloning into 'python-solidity-parser'...
remote: Enumerating objects: 198, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 198 (delta 55), reused 49 (delta 49), pack-reused 138[K
Receiving objects: 100% (198/198), 270.51 KiB | 14.24 MiB/s, done.
Resolving deltas: 100% (103/103), done.
[0m[01;34mbuild[0m/                   [01;34msamples[0m/          [01;34msolidity_parser[0m/
[01;34mpython-solidity-parser[0m/  [01;34mscripts[0m/          [01;34msolidity_parser.egg-info[0m/
README.md                setup.py
requirements.txt         [01;34msolidity-antlr4[0m/
/content/python-solidity-parser/python-solidity-parser/python-solidity-parser
README.md         [0m[01;34msamples[0m/  setup.py          [01;34msolidity_parser[0m/
requirements.txt  [01;34mscripts[0m/  [01;34msolidity-antlr4[0m/
Processing /content/python-solidity-parser/python-solidity-parser/python-solidity-parser
  Preparing metad

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity c

line 1:20 mismatched input '.' expecting ';'


Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
1500 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no at

line 2664:14 extraneous input ',' expecting {'from', '{', '}', '(', 'error', 'for', 'function', 'address', 'calldata', 'if', 'assembly', 'return', 'revert', 'byte', 'let', '=:', 'switch', 'callback', DecimalNumber, HexNumber, HexLiteralFragment, 'break', 'continue', 'leave', 'payable', 'constructor', 'receive', Identifier, StringLiteralFragment}
line 2664:18 extraneous input ',' expecting {<EOF>, 'pragma', '~', 'import', 'from', '{', '}', 'abstract', 'contract', 'interface', 'library', '(', 'error', 'using', 'for', 'struct', 'modifier', 'function', 'event', 'enum', '[', 'address', 'mapping', 'calldata', 'if', 'try', 'while', 'unchecked', 'assembly', 'do', 'return', 'throw', 'emit', 'revert', 'var', 'bool', 'string', 'byte', '++', '--', 'new', '+', '-', 'after', 'delete', '!', 'let', '=:', 'switch', 'callback', Int, Uint, Byte, Fixed, Ufixed, BooleanLiteral, DecimalNumber, HexNumber, HexLiteralFragment, 'break', 'continue', 'leave', 'payable', 'type', 'constructor', 'fallback', 'receive

Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'

line 128:24 extraneous input ',' expecting {<EOF>, 'pragma', '~', 'import', 'from', '{', '}', 'abstract', 'contract', 'interface', 'library', '(', 'error', 'using', 'for', 'struct', 'modifier', 'function', 'event', 'enum', '[', 'address', 'mapping', 'calldata', 'if', 'try', 'while', 'unchecked', 'assembly', 'do', 'return', 'throw', 'emit', 'revert', 'var', 'bool', 'string', 'byte', '++', '--', 'new', '+', '-', 'after', 'delete', '!', 'let', '=:', 'switch', 'callback', Int, Uint, Byte, Fixed, Ufixed, BooleanLiteral, DecimalNumber, HexNumber, HexLiteralFragment, 'break', 'continue', 'leave', 'payable', 'type', 'constructor', 'fallback', 'receive', Identifier, StringLiteralFragment}
line 130:25 extraneous input ',' expecting {<EOF>, 'pragma', '~', 'import', 'from', '{', '}', 'abstract', 'contract', 'interface', 'library', '(', 'error', 'using', 'for', 'struct', 'modifier', 'function', 'event', 'enum', '[', 'address', 'mapping', 'calldata', 'if', 'try', 'while', 'unchecked', 'assembly', 'd

Error parsing Solidity code: 'NoneType' object has no attribute 'assemblyIdentifierList'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'list' object has no attribute 'getText'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
4000 rows processed
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has 

line 157:51 missing ';' at '('
line 157:77 no viable alternative at input '(uint256[][]'
line 157:78 mismatched input ')' expecting {'from', 'error', 'memory', 'storage', 'calldata', 'revert', 'callback', 'leave', 'payable', 'constructor', 'receive', Identifier}
line 157:79 extraneous input ')' expecting {';', '='}


Error parsing Solidity code: object of type 'NoneType' has no len()
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error pa

line 665:24 extraneous input ',' expecting {<EOF>, 'pragma', '~', 'import', 'from', '{', '}', 'abstract', 'contract', 'interface', 'library', '(', 'error', 'using', 'for', 'struct', 'modifier', 'function', 'event', 'enum', '[', 'address', 'mapping', 'calldata', 'if', 'try', 'while', 'unchecked', 'assembly', 'do', 'return', 'throw', 'emit', 'revert', 'var', 'bool', 'string', 'byte', '++', '--', 'new', '+', '-', 'after', 'delete', '!', 'let', '=:', 'switch', 'callback', Int, Uint, Byte, Fixed, Ufixed, BooleanLiteral, DecimalNumber, HexNumber, HexLiteralFragment, 'break', 'continue', 'leave', 'payable', 'type', 'constructor', 'fallback', 'receive', Identifier, StringLiteralFragment}
line 667:25 extraneous input ',' expecting {<EOF>, 'pragma', '~', 'import', 'from', '{', '}', 'abstract', 'contract', 'interface', 'library', '(', 'error', 'using', 'for', 'struct', 'modifier', 'function', 'event', 'enum', '[', 'address', 'mapping', 'calldata', 'if', 'try', 'while', 'unchecked', 'assembly', 'd

Error parsing Solidity code: 'NoneType' object has no attribute 'assemblyIdentifierList'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
Error parsing Solidity code: 'NoneType' object has no attribute 'identifier'
7500 rows processed
Error parsing Solidity code: 'NoneType' obje

In [43]:
train_df = pd.read_csv('/content/drive/MyDrive/Practicum/SascDatasets/train_data_source_codes.csv')
print(train_df.shape)

(74991, 3)


In [4]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/Practicum/ASTDataChunked/training_ast_sequences_3.csv')
print(df.shape)
for i in range(180, 200):
    print(df.iloc[i][0])



Mounted at /content/drive
(15000, 2)
type FunctionDefinition name XX parameters type ParameterList parameters type Parameter typeName type ElementaryTypeName name address name XX storageLocation None isStateVar False isIndexed False returnParameters type ParameterList parameters type Parameter typeName type ElementaryTypeName name bool name XX storageLocation None isStateVar False isIndexed False body type Block statements type VariableDeclarationStatement variables type VariableDeclaration typeName type ElementaryTypeName name uint256 name XX storageLocation None initialValue None type InLineAssemblyStatement language None body type AssemblyBlock operations type AssemblyAssignment names type Identifier name XX expression type AssemblyExpression functionName extcodesize arguments type AssemblyExpression functionName account arguments empty_list type BinaryOperation operator > left type Identifier name XX right type NumberLiteral number num_literal subdenomination None visibility intern

In [5]:
df = df[df['processed_sequence'].apply(lambda x: isinstance(x, str))]
print(len(df))

14080


In [34]:
ctr = 0
for i in range(1871):
    s = df.iloc[i]['processed_sequence']
    if "684" in s:
        print(i)
        ctr += 1;

print(ctr)

1227
1788
2


In [35]:
print(df.iloc[1227]['processed_sequence'])

type FunctionDefinition name XX parameters type ParameterList parameters type Parameter typeName type ArrayTypeName baseTypeName type ElementaryTypeName name address length type NumberLiteral number num_literal subdenomination None name XX storageLocation None isStateVar False isIndexed False returnParameters type ParameterList parameters type Parameter typeName type ElementaryTypeName name address name XX storageLocation None isStateVar False isIndexed False body type Block statements type IndexAccess base type Identifier name XX index type NumberLiteral number num_literal subdenomination None visibility internal modifiers empty_list isConstructor False isFallback False isReceive False stateMutability pure type FunctionDefinition name XX parameters type ParameterList parameters type Parameter typeName type ArrayTypeName baseTypeName type ElementaryTypeName name address length type NumberLiteral number num_literal subdenomination None name XX storageLocation None isStateVar False isInd

In [6]:
from keras.utils import pad_sequences
import pandas as pd
from keras.preprocessing.text import Tokenizer

# from google.colab import drive

# # Mount Google Drive
# drive.mount('/content/drive')

# Tokenize the opcode sequences
tokenizer = Tokenizer()
print(df.shape)
tokenizer.fit_on_texts(df['processed_sequence'])

(14080, 2)


In [7]:
vocabulary_size = len(tokenizer.word_index)
print(vocabulary_size)

1078


In [42]:
vocab = tokenizer.word_index
count = 0

for word, index in vocab.items():
    print(f'{word}: {index}')
    count += 1
    if count == 615:
        break

type: 1
name: 2
xx: 3
false: 4
identifier: 5
expression: 6
empty: 7
list: 8
none: 9
elementarytypename: 10
typename: 11
storagelocation: 12
parameters: 13
isstatevar: 14
isindexed: 15
parameter: 16
arguments: 17
names: 18
functioncall: 19
parameterlist: 20
operator: 21
binaryoperation: 22
left: 23
right: 24
expressionstatement: 25
body: 26
literal: 27
block: 28
statements: 29
statemutability: 30
uint256: 31
returnparameters: 32
visibility: 33
functiondefinition: 34
modifiers: 35
isconstructor: 36
isfallback: 37
isreceive: 38
memberaccess: 39
membername: 40
address: 41
value: 42
number: 43
numberliteral: 44
num: 45
subdenomination: 46
string: 47
indexaccess: 48
base: 49
index: 50
stringliteral: 51
variabledeclaration: 52
memory: 53
view: 54
variabledeclarationstatement: 55
variables: 56
initialvalue: 57
internal: 58
require: 59
external: 60
assemblyexpression: 61
functionname: 62
public: 63
bool: 64
uint: 65
sender: 66
condition: 67
true: 68
ifstatement: 69
truebody: 70
falsebody: 71
by

In [8]:
val_df = pd.read_csv('/content/drive/MyDrive/Practicum/SascDatasets/validation_data.csv')
print(val_df.shape)

(10861, 2)


In [36]:
code = train_df.iloc[69227]['source_code']
ast = parse_solidity_code(code)
# sequence = serialize_ast(ast, solidity_keywords)
pprint.pprint(ast, width=10, sort_dicts=False)
# print(code)


{'type': 'SourceUnit',
 'children': [{'type': 'PragmaDirective',
               'name': 'solidity',
               'value': '^0.4.24'},
              {'type': 'ContractDefinition',
               'name': 'SafeMath',
               'baseContracts': [],
               'subNodes': [{'type': 'FunctionDefinition',
                             'name': 'mul',
                             'parameters': {'type': 'ParameterList',
                                            'parameters': [{'type': 'Parameter',
                                                            'typeName': {'type': 'ElementaryTypeName',
                                                                         'name': 'uint256'},
                                                            'name': 'a',
                                                            'storageLocation': None,
                                                            'isStateVar': False,
                                                            

In [16]:
code = train_df.iloc[68003]['source_code']
ast = parse_solidity_code(code)
sequence =
print(code)

pragma solidity 0.4.26;

/**
* Get % profit every month with a Fortune 333 contract!
* GitHub https://github.com/fortune333/fortune333
* Site https://fortune333.online/
*
* - OBTAINING 99.9% PER 1 MONTH. (percentages are charged in equal parts every 1 sec)
* 3.33%        per 1 day
* 0.13875%     per 1 hour
* 0.0023125%   per 1 minute
* 0.000038515% per 1 sec
* - lifetime payments
* - unprecedentedly reliable
* - bring luck
* - first minimum contribution from 0.01 eth, all next from 0.01 eth.
* - Currency and Payment - ETH
* - Contribution allocation schemes:
* - 100% of payments - 6% percent for support and 12% percent referral system.
* Unique referral system!
* 3.33% is paid to the referral (inviting) wallet - right there! Instantly!
* 3.33% is added to the first contribution of the referral (new investor).
* For example: Your first contribution is 1 Ether.
* The one who invited you gets 0.033 Ethers on his wallet, that is, a wallet that the investor will indicate when they first inv