<a href="https://colab.research.google.com/github/jerry-zsj/colab-files/blob/main/finetune_demo_reduced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Transformers installation
! pip install transformers datasets evaluate
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

# Update versions
! pip install -U accelerate
! pip install -U transformers

# ! pip freeze

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from huggingface_hub import notebook_login

# Prompt for a Hugging Face access token. The notebook later calls
# `trainer.push_to_hub()`, which uploads the trained model to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Prepare Dataset and Tokenizer

In [None]:
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

import csv
import sys
# Raise the csv module's maximum field size to sys.maxsize so that very long
# fields do not abort parsing (presumably needed for the long serialized
# transaction texts in the data files — confirm).
csv.field_size_limit(sys.maxsize)

def read_csv_as_tuples(filename):
    """Read an Athena-downloaded CSV file and return its data rows without the header.

    Args:
        filename: Path to a comma-delimited CSV file whose first row is a header.

    Returns:
        A list with one entry per data row, in file order. Each entry is the
        list of string fields produced by ``csv.reader`` (despite the function
        name, rows are lists, not tuples). An empty file yields ``[]``.
    """
    # newline='' is what the csv module asks for: it hands line breaks embedded
    # in quoted fields to the parser untouched instead of translating them.
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # Discard the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        return list(reader)

# Input files, resolved relative to the notebook's working directory.
# The vocab file supplies the custom tokens (first column of each row); the
# train and test files hold the examples used for fine-tuning and evaluation.
vocab_csv_filename = "large_vocab.csv"
train_data_csv_filename = "large_data_train_balanced.csv"
test_data_csv_filename = "large_data_test.csv"


# Load both CSV files into one dataset object with a "train" and a "test" split.
# usage: https://huggingface.co/docs/datasets/en/loading#csv
data_files = {"train": train_data_csv_filename, "test": test_data_csv_filename}
dataset = load_dataset("csv", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Start from the pretrained DistilBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
# add custom tokens: the first column of every data row in the vocab CSV becomes
# a new vocabulary entry. The model's embedding matrix is resized to match when
# the model is created in the training cell.
new_tokens = [r[0] for r in read_csv_as_tuples(vocab_csv_filename)]
new_tokens_added = tokenizer.add_tokens(new_tokens)
print(f"Added {new_tokens_added} new tokens to the vocabulary.")

# add special tokens representing unknown words; these are the placeholders that
# the preprocessing step substitutes for out-of-vocabulary tokens.
special_tokens_dict = {'additional_special_tokens': ['[DPT]', '[CAL]', '[ADR]', '[SIG]', '[VAL]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Handy for inspecting the result interactively:
# tokenizer.SPECIAL_TOKENS_ATTRIBUTES
# tokenizer.additional_special_tokens

# Collator handed to the Trainer; it pads the examples when a batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Added 33306 new tokens to the vocabulary.


## Process Datasets

In [None]:
# Snapshot of the tokenizer's vocabulary (after the custom and special tokens
# were added); used below only for "is this token known?" membership tests.
vocab = tokenizer.get_vocab()
# Placeholder substituted for an out-of-vocabulary token, keyed by the token's
# position modulo 6. The keys mirror the six space-separated fields that the
# payload SQL in this notebook emits per trace: depth, call type, from address,
# to address, function signature, value.
special_token_map = {
    0: '[DPT]', # unknown trace depth
    1: '[CAL]', # unknown call type
    2: '[ADR]', # unknown from/to address
    3: '[ADR]', # unknown from/to address
    4: '[SIG]', # unknown func signature
    5: '[VAL]', # unknown value
}
# def preprocess_function(examples):
#     text = examples["text"]
#     return tokenizer(text, truncation=True)
def preprocess_function(examples):
    """Mask out-of-vocabulary tokens in a batch of texts, then tokenize the batch.

    Every text in ``examples["text"]`` is split on whitespace. A token found in
    ``vocab`` is kept as is; any other token is replaced by the placeholder that
    ``special_token_map`` assigns to its position modulo 6. The masked texts are
    then tokenized with truncation and padding to the maximum length.
    """
    masked_texts = []
    for raw_text in examples["text"]:
        masked_tokens = []
        for position, token in enumerate(raw_text.split()):
            if token in vocab:
                masked_tokens.append(token)
            else:
                masked_tokens.append(special_token_map[position % 6])
        masked_texts.append(' '.join(masked_tokens))
    return tokenizer(masked_texts, truncation=True, padding="max_length")

# Run the masking + tokenization over the dataset; batched=True hands
# preprocess_function a batch of rows per call rather than a single row.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1822 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [None]:
# Preview tokenized dataset: show the first example of the test split as a
# quick sanity check of the preprocessing output.
tokenized_dataset['test'][0]

## Prepare Trainer

In [None]:
# Metric implementations come from the Hugging Face `evaluate` library:
# https://huggingface.co/evaluate-metric
# Definitions (TP/TN/FP/FN = true/false positives/negatives):
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
# Recall = TP / (TP + FN)
# Precision = TP / (TP + FP)
# Only accuracy and recall are loaded here; precision is listed for reference.
accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and recall.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class of
    each row is the argmax of ``predictions`` along axis 1. The return value is
    one dict holding the entries of the accuracy result followed by the entries
    of the recall result.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {
        **accuracy.compute(predictions=predicted_classes, references=labels),
        **recall.compute(predictions=predicted_classes, references=labels),
    }

# Class-index <-> class-name mappings for the two-label classifier. The reverse
# mapping is derived from the forward one so the two always agree.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

## Start Training

In [None]:
# Load pretrained DistilBERT with a fresh two-class classification head and the
# label names defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)
# The tokenizer gained custom and special tokens, so the embedding matrix must
# grow to match before training.
# https://huggingface.co/transformers/v2.11.0/_modules/transformers/tokenization_utils.html#PreTrainedTokenizer.add_tokens
model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.

# Uncomment the two lines below (and comment the two after them) to train and
# evaluate on a 100-example subsample for a quick smoke test.
# small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(100))
# small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(100))
# NOTE: despite the "small_" prefix these currently hold the FULL splits.
small_train_dataset = tokenized_dataset["train"]
small_eval_dataset = tokenized_dataset["test"]

# Evaluate and checkpoint once per epoch; with load_best_model_at_end=True the
# Trainer reloads the best checkpoint when training finishes.
# NOTE(review): no metric_for_best_model is set, so "best" is decided by the
# Trainer's default criterion — confirm that is the intended one.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall
1,No log,0.090392,0.975496,0.0
2,No log,0.05774,0.988331,0.52381
3,No log,0.058633,0.989498,0.571429
4,No log,0.052035,0.988331,0.52381
5,0.113800,0.044011,0.989498,0.571429


TrainOutput(global_step=570, training_loss=0.1060052332125212, metrics={'train_runtime': 559.3757, 'train_samples_per_second': 16.286, 'train_steps_per_second': 1.019, 'total_flos': 1206778001756160.0, 'train_loss': 0.1060052332125212, 'epoch': 5.0})

In [None]:
# Upload the trained model to the Hugging Face Hub. TrainingArguments sets
# push_to_hub=False, so this explicit call is the only upload.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/370M [00:00<?, ?B/s]

events.out.tfevents.1718764722.cf625cff3154.717.1:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

events.out.tfevents.1718764016.cf625cff3154.717.0:   0%|          | 0.00/7.03k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ruiminszb/my_awesome_model/commit/0af94e2cd8a5021741ab1786030ab2e2403b4a8a', commit_message='End of training', commit_description='', oid='0af94e2cd8a5021741ab1786030ab2e2403b4a8a', pr_url=None, pr_revision=None, pr_num=None)

## Inference

In [None]:
# Wrap the fine-tuned model and its extended tokenizer in an inference pipeline.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)


# Evaluate on test dataset.
# Each CSV row is read as (label, text). Labels come out of the CSV as strings,
# so convert them to ints to match the integer class ids derived from the
# pipeline output before handing both to the metrics.
test_data = read_csv_as_tuples(test_data_csv_filename)
labels = [int(r[0]) for r in test_data]
raw_texts = [r[1] for r in test_data]
# Apply the same out-of-vocabulary masking that preprocess_function applies.
texts = [' '.join([token if token in vocab else special_token_map[idx%6] for idx, token in enumerate(t.split())]) for t in raw_texts]

inf_results = classifier(texts, truncation=True)  # Set truncation=True to truncate longer input to 512 tokens to avoid tensor size overflow
# Index label2id directly: an unexpected label name should fail loudly here
# rather than turn into None and surface later as a confusing KeyError.
inf_labels = [label2id[r['label']] for r in inf_results]
# Confusion counts keyed "<true label>_<predicted label>".
results = {'0_0': 0, '0_1': 0, '1_0': 0, '1_1': 0} # true negative, false positive, false negative, true positive
for label, inf_label in zip(labels, inf_labels):
    results[f"{label}_{inf_label}"] += 1

print(results)
print(f"Accuracy: {accuracy.compute(predictions=inf_labels, references=labels)}")
print(f"Recall: {recall.compute(predictions=inf_labels, references=labels)}")
# Output recorded from an earlier run, kept for comparison:
# {'0_0': 833, '0_1': 3, '1_0': 7, '1_1': 14}
# Accuracy: {'accuracy': 0.9883313885647608}
# Recall: {'recall': 0.6666666666666666}

{'0_0': 836, '0_1': 0, '1_0': 9, '1_1': 12}
Accuracy: {'accuracy': 0.9894982497082847}
Recall: {'recall': 0.5714285714285714}


In [None]:
# Compare with old model: score the same masked test texts with an earlier
# revision of the pushed model, loaded together with that revision's tokenizer.
# NOTE(review): `texts` was masked against the current tokenizer's vocabulary,
# which may differ from this revision's — confirm the comparison is like-for-like.
model_name = "ruiminszb/my_awesome_model"
revision = "5ffb6bdb347d9ac6e7089bd7a18d4572a91f38de"
model_cmp = AutoModelForSequenceClassification.from_pretrained(model_name, revision=revision)
tokenizer_cmp = AutoTokenizer.from_pretrained(model_name, revision=revision)
pipe = pipeline("text-classification", model=model_cmp, tokenizer=tokenizer_cmp)

# truncation=True truncates longer input to 512 tokens to avoid tensor size overflow.
inf_results = pipe(texts, truncation=True)
inf_labels = [label2id.get(prediction['label']) for prediction in inf_results]
# Confusion counts keyed "<true label>_<predicted label>":
# true negative, false positive, false negative, true positive.
results = dict.fromkeys(('0_0', '0_1', '1_0', '1_1'), 0)
for i in range(len(labels)):
    results[f"{labels[i]}_{inf_labels[i]}"] += 1

print(results)
accuracy_cmp = accuracy.compute(predictions=inf_labels, references=labels)
print(f"Accuracy: {accuracy_cmp}")
recall_cmp = recall.compute(predictions=inf_labels, references=labels)
print(f"Recall: {recall_cmp}")

model.safetensors:   0%|          | 0.00/370M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'0_0': 640, '0_1': 156, '1_0': 3, '1_1': 12}
Accuracy: {'accuracy': 0.8039457459926017}
Recall: {'recall': 0.8}


In [None]:
# Random Tx Inference
import requests

# SQL template that serializes one transaction's traces into a single text payload.
# The `{}` placeholder is filled with the transaction hash via str.format.
# For every trace it emits six space-separated fields — trace_depth, call_type,
# from_address, to_address, sig (first 10 characters of the input) and
# "<value_sig>_<value_scale>" — and then joins all traces of the transaction,
# ordered by trace_index, with spaces.
transaction_payload_postgres_sql_template = """
WITH traces as (
    select block_time, transaction_hash, trace_index, trace_address, trace_type, call_type, from_address, to_address, substr(coalesce(input, 'NULL'), 1, 10) sig,
        CASE WHEN value >= 100 THEN
            ROUND(value, 2-CAST(LENGTH(CAST(value AS VARCHAR)) AS INT))
        ELSE value END as value_rounded
    from ethereum_mainnet.traces tr
    WHERE transaction_hash = '{}'
    order by block_time desc, transaction_hash, trace_index
),
preprocessed as (
    -- pretty print
    select block_time, transaction_hash, trace_index,
        concat('depth-', array_to_string(array_prepend(0, trace_address), '-')) as trace_depth,
        upper(coalesce(call_type, trace_type)) as call_type,
        from_address,
        to_address,
        sig,
        substr(cast(value_rounded as varchar), 1, 2) as value_sig,
        length(cast(value_rounded as varchar)) as value_scale
    from traces
    order by block_time desc, transaction_hash, trace_index
),
serialized_traces as (
    select block_time, transaction_hash, trace_index,
        array_to_string(ARRAY[
            trace_depth,
            call_type,
            from_address,
            to_address,
            sig,
            concat(cast(value_sig as varchar), '_', cast(value_scale as varchar))
        ], ' ') as trace_payload
    from preprocessed
),
serialized_txns as (
    select block_time, transaction_hash,
        array_to_string(array_agg(trace_payload order by trace_index), ' ') as transaction_payload
    from serialized_traces
    group by block_time, transaction_hash
    order by block_time desc, transaction_hash
)
select transaction_payload from serialized_txns;
"""

def must_fetch_database_execution_result(sql: str) -> list[dict]:
    """Run a SQL query through the ZettaBlock API and return its rows.

    The query is first registered (POST to the ``queries`` endpoint) and then
    executed by the returned id. The CSV response is parsed with the header row
    supplying the dict keys.

    The API key is read from the ``ZETTABLOCK_API_KEY`` environment variable so
    that no credential lives in the notebook. The key that used to be embedded
    here has been exposed and should be rotated.

    Args:
        sql: The SQL text to execute.

    Returns:
        One dict per data row, mapping column name to the (string) field value.
        Blank lines are skipped and an empty response body yields ``[]``.

    Raises:
        RuntimeError: If ``ZETTABLOCK_API_KEY`` is not set.
        Exception: If either API call answers with a non-200 status.
    """
    import io
    import os

    api_key = os.environ.get("ZETTABLOCK_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the ZETTABLOCK_API_KEY environment variable before querying ZettaBlock."
        )

    # Generous upper bound so a stalled connection cannot hang the notebook forever.
    timeout_seconds = 120
    url = "https://api.zettablock.com/api/v1/databases/realtimeEvmDB/queries"
    payload = {"query": sql}
    headers = {
        "accept": "application/json",
        "X-API-KEY": api_key,
        "content-type": "application/json"
    }
    response = requests.post(url, json=payload, headers=headers, timeout=timeout_seconds)
    if not response.status_code == 200:
        raise Exception(f"Error: [{response.status_code}] {response.text}")
    query_id = response.json()['id']
    execute_url = f"https://api.zettablock.com/api/v1/queries/{query_id}/execute?includeColumnName=true&includeMetadata=false"
    response = requests.post(execute_url, headers=headers, timeout=timeout_seconds)
    if not response.status_code == 200:
        raise Exception(f"Error: [{response.status_code}] {response.text}")

    # Parse with the csv module rather than str.split(',') so quoted fields that
    # contain commas or line breaks stay in one piece.
    reader = csv.reader(io.StringIO(response.text, newline=''))
    field_names = next(reader, None)
    if field_names is None:
        return []
    return [dict(zip(field_names, row)) for row in reader if row]

def get_payload(txhash):
    """Fetch the serialized trace payload of a single transaction.

    Args:
        txhash: Transaction hash as a ``0x``-prefixed string of 64 hex digits.

    Returns:
        The ``transaction_payload`` field of the first row returned by the query.

    Raises:
        ValueError: If ``txhash`` is not a well-formed transaction hash.
        Exception: If the query returns no rows.
    """
    import re

    # The hash is interpolated straight into the SQL text, so accept nothing but
    # a strict hex hash; this prevents SQL injection through `txhash`.
    if not isinstance(txhash, str) or not re.fullmatch(r"0x[0-9a-fA-F]{64}", txhash):
        raise ValueError(f"Invalid transaction hash: {txhash!r}")
    sql = transaction_payload_postgres_sql_template.format(txhash)
    res = must_fetch_database_execution_result(sql)
    if not res:
        raise Exception(f"No results returned for {txhash}")
    return res[0]['transaction_payload']

def infer_tx(pipe, txhash):
    """Classify one transaction, identified by its hash, with the given pipeline.

    The transaction's payload is fetched with ``get_payload`` and split on
    whitespace. Tokens missing from ``vocab`` are replaced by the placeholder
    that ``special_token_map`` assigns to their position modulo 6, and the
    re-joined text is passed to ``pipe`` with truncation enabled. Returns
    whatever ``pipe`` returns.
    """
    masked_tokens = []
    for position, token in enumerate(get_payload(txhash).split()):
        is_known = token in vocab
        masked_tokens.append(token if is_known else special_token_map[position % 6])
    return pipe(' '.join(masked_tokens), truncation=True)

# Classify a few individual transactions with the fine-tuned model.
sample_tx_hashes = [
    '0x76d158b9d5ca1193f925a274b5e9613ca9877edc90f75414c2c4473a1b41034d',
    '0x38a03aed2d21ed131a54d3baabb7ccfd325b7eaf222307c554f34e0a00e7e971',
    '0x2ceb422046d45a5a69d25de65e54f1ef5ef1d31ee42d74cc669dd8788bbddf51',
]
results = [infer_tx(classifier, tx_hash) for tx_hash in sample_tx_hashes]
# Print with a plain loop: a list comprehension used only for its side effects
# builds a throwaway list, and assigning it to `_` rebinds the name IPython
# uses for the last cell output.
for prediction in results:
    print(prediction)

[{'label': 'NEGATIVE', 'score': 0.8765076994895935}]
[{'label': 'NEGATIVE', 'score': 0.5488406419754028}]
[{'label': 'NEGATIVE', 'score': 0.9933870434761047}]
