In [None]:
# Environment setup: show the available GPU, install the Python dependencies,
# and clone the repository that provides the data files read further down.
! nvidia-smi
# NOTE(review): these installs are unpinned, so a fresh run may pull newer
# releases than the ones that produced the recorded outputs — consider pinning.
! pip install datasets transformers
! pip install sentencepiece
# The cells below read dash/features.json and dash/data/v1.0/*.json from this clone.
! git clone https://github.com/kaisunresearch/dash.git

Mon Oct 24 01:27:20 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import transformers
import numpy as np
import torch
import random
import math
import json
import os
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    """Compute the evaluation metric reported to the Trainer.

    Args:
        eval_pred: A pair ``(predictions, labels)``.

    Returns:
        A dict with the single key ``"msle"`` holding the mean squared error
        between labels and predictions. The labels built in ``work`` are
        log-prices, so this is a squared error in log space.
    """
    preds, targets = eval_pred
    return {"msle": mean_squared_error(targets, preds)}

class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with regression labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a sequence with one target per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position ``idx`` as a
        # tensor, plus the matching target under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Per-device batch size; work() uses it for both training and evaluation.
batch_size = 64
# Size of the tail of the training split that work() keeps for its second stage.
T = 3000
# Name of the pretrained checkpoint loaded for the tokenizer and the model.
LM = "bert-base-multilingual-uncased"
# Separator string that work() inserts between the parts of each input text.
septoken = '[SEP]'

# Load the "tldcnt" table from the cloned dash repository. work() looks it up
# by the part of each asset name that precedes the first delimiter.
with open("dash/features.json", "r", encoding='utf8') as f:
    tldcnt = json.load(f)["tldcnt"]

In [None]:
def _find_single_checkpoint(output_dir):
    """Return the path of the only ``checkpoint*`` directory directly inside ``output_dir``.

    Only immediate children are inspected, so nested directories inside a
    checkpoint cannot be mistaken for additional checkpoints.

    Raises:
        AssertionError: if there is not exactly one such directory (including
            when ``output_dir`` does not exist).
    """
    checkpoints = []
    if os.path.isdir(output_dir):
        checkpoints = [
            entry.path
            for entry in os.scandir(output_dir)
            if entry.is_dir() and entry.name.startswith("checkpoint")
        ]
    assert len(checkpoints) == 1, \
        "expected exactly one checkpoint in %s, found %d" % (output_dir, len(checkpoints))
    return checkpoints[0]


def _report_msle(trainer, encoded_split, examples, tag):
    """Predict on ``encoded_split`` and print the prediction count and mean squared error.

    ``examples`` is the list of ``{"label", "sentence"}`` dicts the encoded
    split was built from. The error is accumulated element by element, exactly
    as before, so the printed value keeps its shape and precision.
    """
    predictions = trainer.predict(encoded_split)[0]
    print(len(predictions))
    msle = 0
    for i in range(len(predictions)):
        msle += (predictions[i] - examples[i]["label"]) ** 2
    msle /= len(predictions)
    print(tag, msle)


def work(subset = "dn", m = "mbert+", evaldev = True, evaltest = True, SEED = 42):
    """Fine-tune the regression model on one DASH subset and print its errors.

    The labels are the natural log of each asset's price, so the reported
    error is a mean squared error in log space.

    Args:
        subset: Which data file to use: "dn", "ea" or "nft". "ea" names are
            split on "@"; the other two on ".".
        m: "mbert" trains a single stage for 3 epochs. "mbert+" additionally
            appends the ``tldcnt`` entry to each input text and trains two
            stages: 1 epoch on the whole training split, then 3 epochs on the
            last ``T`` training examples, starting from the stage-1 checkpoint.
        evaldev: If true, print the error on the validation split.
        evaltest: If true, print the error on the test split.
        SEED: Seed for ``random``, ``numpy``, ``torch`` and the Trainer.

    Raises:
        AssertionError: if ``subset`` or ``m`` is not one of the values above,
            or if stage 2 cannot find exactly one stage-1 checkpoint.
    """
    assert(subset in ["dn", "ea", "nft"])
    assert(m in ["mbert", "mbert+"])
    random.seed(SEED)
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Delimiter and data file for each subset.
    sources = {
        "dn": (".", "dash/data/v1.0/dash_dn.json"),
        "ea": ("@", "dash/data/v1.0/dash_ea.json"),
        "nft": (".", "dash/data/v1.0/dash_nft.json"),
    }
    delimiter, data_path = sources[subset]
    with open(data_path, "r", encoding='utf8') as f:
        dataraw = json.load(f)

    dataraw = [[x["asset"], x["price"], x["date"]] for x in dataraw["data"]]

    # The cut-off dates are taken from the records sitting at the train and
    # validation boundaries. The index arithmetic is kept exactly as it was so
    # that the resulting splits do not move.
    auto_split = [0.05, 0.05]
    split_dates = [dataraw[int(len(dataraw)*(1-auto_split[0]-auto_split[1]))-1][2],
                   dataraw[int(len(dataraw)*(1-auto_split[1]))-1][2]]

    data = []
    splitindex = [None, None]
    for i in range(len(dataraw)):
        asset, price, date = dataraw[i]
        data.append({"label": math.log(float(price)), "sentence": asset.lower()})
        # Each boundary ends up just past the last record dated on or before
        # its cut-off date.
        if date <= split_dates[0]:
            splitindex[0] = i + 1
        if date <= split_dates[1]:
            splitindex[1] = i + 1

    stages = [1, 2] if m == "mbert+" else [1]

    task = subset + "-" + m

    # One output directory, used both for saving and for locating the stage-1
    # checkpoint, so the two can never disagree.
    model_name = LM.split("/")[-1]
    output_dir = f"{model_name}-finetuned-{task}"

    # The tokenizer always comes from the base model, so it is loaded once.
    tokenizer = AutoTokenizer.from_pretrained(LM, use_fast=True)

    base_splits = {
        "train": data[:splitindex[0]],
        "validation": data[splitindex[0]:splitindex[1]],
        "test": data[splitindex[1]:],
    }

    for stage in stages:
        model_checkpoint = LM

        dataset = dict(base_splits)
        if stage == 2:
            # Stage 2 trains only on the tail of the training split.
            dataset["train"] = dataset["train"][-T:]

        encoded_dataset = {}
        for split_name, examples in dataset.items():
            texts = []
            labels = []
            for example in examples:
                name = example["sentence"].split(delimiter)
                # Text layout: <first part>[SEP]<rest>, plus [SEP]<tldcnt entry> for "mbert+".
                text = name[0] + septoken + delimiter.join(name[1:])
                if m == "mbert+":
                    text += septoken + str(tldcnt[name[0]])
                texts.append(text)
                labels.append(example["label"])
            encodings = tokenizer(texts, truncation=True, padding=True)
            encoded_dataset[split_name] = MyDataset(encodings, labels)

        if stage == 2:
            # Continue from the checkpoint that stage 1 left behind.
            model_checkpoint = _find_single_checkpoint(output_dir)

        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=1,
            problem_type = "regression")

        args = TrainingArguments(
            output_dir,
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=1 if stage == 1 and m == "mbert+" else 3,
            gradient_accumulation_steps=1,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="msle",
            greater_is_better=False,
            push_to_hub=False,
            save_total_limit=1,
            seed=SEED
        )

        trainer = Trainer(
            model,
            args,
            train_dataset=encoded_dataset["train"],
            eval_dataset=encoded_dataset["validation"],
            compute_metrics=compute_metrics
        )

        trainer.train()

    # Evaluation uses the trainer and the encoded splits from the last stage.
    if evaldev:
        _report_msle(trainer, encoded_dataset["validation"], dataset["validation"], "dev")

    if evaltest:
        _report_msle(trainer, encoded_dataset["test"], dataset["test"], "test")

# Run with the defaults: the "dn" subset, the two-stage "mbert+" setup, and
# both the dev and test errors printed.
work()

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Epoch,Training Loss,Validation Loss,Msle
1,1.8149,2.377084,2.377084


***** Running Evaluation *****
  Num examples = 7973
  Batch size = 64
Saving model checkpoint to bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-2237
Configuration saved in bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-2237/config.json
Model weights saved in bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-2237/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-2237 (score: 2.3770840167999268).
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-uncased/snapshots/800c34f3d5aa174fe531f560b44b8d14592225b7/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_a

Epoch,Training Loss,Validation Loss,Msle
1,No log,2.093937,2.093936
2,No log,2.131336,2.131336
3,No log,2.234218,2.234218


***** Running Evaluation *****
  Num examples = 7973
  Batch size = 64
Saving model checkpoint to bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-47
Configuration saved in bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-47/config.json
Model weights saved in bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-47/pytorch_model.bin
Deleting older checkpoint [bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-2237] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 7973
  Batch size = 64
Saving model checkpoint to bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-94
Configuration saved in bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-94/config.json
Model weights saved in bert-base-multilingual-uncased-finetuned-dn-mbert+/checkpoint-94/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 7973
  Batch size = 64
Saving model checkpoint to bert-base-multilingual-uncased-finetuned

***** Running Prediction *****
  Num examples = 7927
  Batch size = 64


7973
dev [2.0939384]


7927
test [1.8489485]
