### Install Necessary Packages

In [None]:
# Install the Hugging Face libraries used below (Colab shell commands).
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip install transformers
!pip install datasets
!pip install accelerate -U

# Restart the session by sending signal 9 to the current kernel process,
# presumably so the freshly installed packages are picked up on the next import.
# NOTE: this kills the kernel, so the cells below have to be run again afterwards.
import os
os.kill(os.getpid(), 9)

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-a

### Mount Drive into Colab

In [None]:
# Mount Google Drive at /content/drive; the CSV paths read later point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import Libraries

In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW, TrainerCallback
from tqdm.notebook import tqdm  # Use notebook version of tqdm for better compatibility with Jupyter
from datetime import timedelta
import plotly.graph_objects as go
from datasets import Dataset
from torch.utils.data import DataLoader, Dataset as torchDS
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
from functools import partial
from scipy.special import softmax
import torch
import torch.nn as nn

In [None]:
# Load the tweet table from Drive. Only the `date` and `text_split` columns are read;
# `text_split` is exposed as `text` and the parsed dates become the index.
# TODO: generate the dataset if it doesn't exist
text_db_path = '/content/drive/MyDrive/combined_tweets_2020_labeled.csv'

text_df = (
    pd.read_csv(text_db_path, usecols=["date", "text_split"])
    .rename(columns={"text_split": "text"})
    .set_index('date')
)
text_df.index = pd.to_datetime(text_df.index)

In [None]:
# Load the daily price table from Drive and index it by timestamp.
# The `timestamp` column is parsed as seconds since the Unix epoch (unit='s').
price_db_path = '/content/drive/MyDrive/daily-2020.csv'

price_df = (
    pd.read_csv(price_db_path, usecols=["timestamp", "close", "open", "high", "low", "volume"])
    .set_index('timestamp')
)
price_df.index = pd.to_datetime(price_df.index, unit='s')

In [None]:
# (rows, columns) of the tweet table
text_df.shape

(64310, 1)

In [None]:
# Preview the tweet table (one row per text snippet, indexed by date)
text_df

Unnamed: 0_level_0,text
date,Unnamed: 1_level_1
2020-01-01,is the year of bitcoin bitcoin is up in decade...
2020-01-01,activity and bases when can print they need go...
2020-01-01,address related to his announcement why they b...
2020-01-01,xrp btc btc price action is similar to bitcoin...
2020-01-01,about cryptocurrency and will be the beginning...
...,...
2020-12-31,like few ago it great way to talk about bitcoi...
2020-12-31,to to review the year in crypto take look at w...
2020-12-31,mas dinheiro bitcoin em coin green nature love...
2020-12-31,they re not the best of us it not so much abou...


In [None]:
# (rows, columns) of the daily price table
price_df.shape

(366, 5)

In [None]:
# Preview the daily price table (one row per day, indexed by timestamp)
price_df

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,7189.43,7260.43,7170.15,7197.57,56801.329
2020-01-02,7197.57,7209.59,6922.00,6962.04,115295.677
2020-01-03,6962.34,7407.28,6863.44,7341.72,208493.458
2020-01-04,7341.60,7400.00,7269.21,7350.71,92586.033
2020-01-05,7350.54,7495.00,7303.00,7354.36,117765.972
...,...,...,...,...,...
2020-12-27,26508.84,28459.84,25850.00,26305.64,540264.148
2020-12-28,26301.76,27538.82,26117.10,27102.66,267563.468
2020-12-29,27101.45,27441.73,25913.01,27402.83,260759.449
2020-12-30,27402.83,29063.72,27401.00,28906.99,374737.655


In [None]:
# Shift the price rows up by one position (shift(-1)) so that each date carries the
# NEXT day's open/high/low/close/volume. The last date has no following row and
# therefore ends up with NaN values.
price_df_shifted = price_df.shift(-1)
price_df_shifted

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,7197.57,7209.59,6922.00,6962.04,115295.677
2020-01-02,6962.34,7407.28,6863.44,7341.72,208493.458
2020-01-03,7341.60,7400.00,7269.21,7350.71,92586.033
2020-01-04,7350.54,7495.00,7303.00,7354.36,117765.972
2020-01-05,7354.36,7808.65,7345.00,7757.39,168150.317
...,...,...,...,...,...
2020-12-27,26301.76,27538.82,26117.10,27102.66,267563.468
2020-12-28,27101.45,27441.73,25913.01,27402.83,260759.449
2020-12-29,27402.83,29063.72,27401.00,28906.99,374737.655
2020-12-30,28906.99,29376.70,27848.00,28951.68,313648.835


In [None]:
class Labeler:
    """Abstract base for objects that turn raw data into labels.

    Concrete labelers override :meth:`fit` and :meth:`transform`; the base
    implementations only raise ``NotImplementedError``.
    """

    def __init__(self, name):
        """
        Store a human-readable identifier for this labeler.

        Args:
        name (str): The name of the labeler.
        """
        self.name = name

    def fit(self, data):
        """
        Learn whatever the labeler needs from ``data``.

        Args:
        data (any): The data to fit the labeler to.

        Raises:
        NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

    def transform(self, data):
        """
        Produce labels for ``data``.

        Args:
        data (any): The data to transform into labels.

        Returns:
        any: The labels (defined by the subclass).

        Raises:
        NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

In [None]:
class TrueRangeLabeler(Labeler):
    """Label each row of an OHLC frame with its min-max normalised True Range.

    The True Range of a row is the largest of ``high - low``,
    ``|high - previous close|`` and ``|low - previous close|``.
    """

    def __init__(self):
        """
        Initialize the labeler with no fitted data.
        """
        super().__init__("true range labeler")
        self.data = None

    def fit(self, data):
        """
        Remember a private copy of the price data to label.

        Args:
        data (pd.DataFrame): Frame with at least ``high``, ``low`` and ``close`` columns.

        Returns:
        TrueRangeLabeler: ``self``, so calls can be chained.
        """
        self.data = data.copy()
        return self

    def transform(self, data=None):
        """
        Compute the True Range and its normalised ``label`` column.

        Args:
        data (pd.DataFrame, optional): Frame to label. When omitted, the frame
            passed to :meth:`fit` is used (this keeps ``transform()`` working
            while also matching the base-class ``transform(data)`` signature).

        Returns:
        pd.DataFrame: A new frame with the input columns plus ``high_low``,
            ``high_prev_close``, ``low_prev_close``, ``true_range`` and ``label``.
            The fitted/input frame itself is left unmodified.

        Raises:
        RuntimeError: If no ``data`` is given and :meth:`fit` was never called.
        """
        if data is not None:
            frame = data.copy()
        elif getattr(self, "data", None) is not None:
            frame = self.data.copy()
        else:
            raise RuntimeError("TrueRangeLabeler.transform() called before fit().")

        # Calculate the True Range; the first row has no previous close, so only
        # its high-low spread contributes (max skips NaN).
        prev_close = frame['close'].shift()
        frame['high_low'] = frame['high'] - frame['low']
        frame['high_prev_close'] = (frame['high'] - prev_close).abs()
        frame['low_prev_close'] = (frame['low'] - prev_close).abs()
        frame['true_range'] = frame[['high_low', 'high_prev_close', 'low_prev_close']].max(axis=1)

        # Normalize the True Range to be between 0 and 1 (min-max over this frame).
        tr_min = frame['true_range'].min()
        tr_span = frame['true_range'].max() - tr_min
        if tr_span == 0:
            # Constant true range: avoid 0/0 and map every valid row to 0.0
            # (rows whose true range is NaN stay NaN).
            frame['label'] = frame['true_range'] * 0.0
        else:
            frame['label'] = (frame['true_range'] - tr_min) / tr_span

        return frame

In [None]:
# Build the regression targets: fit the labeler on the shifted (next-day) prices and
# compute the normalised true range for every date.
db_labler = TrueRangeLabeler()
db_labler.fit(price_df_shifted)
true_range_db = db_labler.transform()

In [None]:
# Preview the labelled price table (intermediate true-range columns plus `label`)
true_range_db

Unnamed: 0_level_0,open,high,low,close,volume,high_low,high_prev_close,low_prev_close,true_range,label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-01,7197.57,7209.59,6922.00,6962.04,115295.677,287.59,,,287.59,0.056472
2020-01-02,6962.34,7407.28,6863.44,7341.72,208493.458,543.84,445.24,98.60,543.84,0.128982
2020-01-03,7341.60,7400.00,7269.21,7350.71,92586.033,130.79,58.28,72.51,130.79,0.012103
2020-01-04,7350.54,7495.00,7303.00,7354.36,117765.972,192.00,144.29,47.71,192.00,0.029423
2020-01-05,7354.36,7808.65,7345.00,7757.39,168150.317,463.65,454.29,9.36,463.65,0.106291
...,...,...,...,...,...,...,...,...,...,...
2020-12-27,26301.76,27538.82,26117.10,27102.66,267563.468,1421.72,1233.18,188.54,1421.72,0.377393
2020-12-28,27101.45,27441.73,25913.01,27402.83,260759.449,1528.72,339.07,1189.65,1528.72,0.407671
2020-12-29,27402.83,29063.72,27401.00,28906.99,374737.655,1662.72,1660.89,1.83,1662.72,0.445588
2020-12-30,28906.99,29376.70,27848.00,28951.68,313648.835,1528.70,469.71,1058.99,1528.70,0.407665


In [None]:
# Attach the per-date label to every tweet with a left join on the date index;
# tweets whose date has no matching label row get NaN.
labeled_df = text_df.merge(true_range_db[['label']], left_index=True, right_index=True, how='left')

In [None]:
# Preview the tweets with their merged labels
labeled_df

Unnamed: 0,text,label
2020-01-01,is the year of bitcoin bitcoin is up in decade...,0.056472
2020-01-01,activity and bases when can print they need go...,0.056472
2020-01-01,address related to his announcement why they b...,0.056472
2020-01-01,xrp btc btc price action is similar to bitcoin...,0.056472
2020-01-01,about cryptocurrency and will be the beginning...,0.056472
...,...,...
2020-12-31,like few ago it great way to talk about bitcoi...,
2020-12-31,to to review the year in crypto take look at w...,
2020-12-31,mas dinheiro bitcoin em coin green nature love...,
2020-12-31,they re not the best of us it not so much abou...,


In [None]:
# Number of tweets per distinct label value (all tweets of a date share that date's label)
labeled_df.label.value_counts()

label
0.026027    521
0.004774    343
0.147703    296
0.056862    296
0.028118    275
           ... 
0.131574    140
0.067621    137
0.102177    137
0.280633    136
0.113461    134
Name: count, Length: 363, dtype: int64

In [None]:
# Drop every row that contains a NaN in any column (text or label).
# In the preview above the NaN labels belong to the last date, 2020-12-31.
labeled_df = labeled_df.dropna()

In [None]:
# Random 80/20 split with a fixed seed. `stratify` is given the continuous label
# column, so the split is balanced per distinct label value.
# NOTE(review): all tweets of one date share the same label, so a random split puts
# same-day tweets into both train and test — possible leakage; confirm this is
# acceptable or switch to a date-based split.
train_df, test_df = train_test_split(labeled_df, test_size=0.2, random_state=42, stratify=labeled_df['label'])
train_df

Unnamed: 0,text,label
2020-02-26,price analysis small with strong futuristic bi...,0.100759
2020-03-23,bitcoin cash kraken kraken bitcoin may be uniq...,0.108371
2020-01-21,over higher than bitcoin bitcoin btc de usar b...,0.041350
2020-03-16,en de ser las de un se btc con bitcoin en de s...,0.149540
2020-10-06,up for bitcoin para la de bitcoin en tu local ...,0.013212
...,...,...
2020-11-20,throughout the world with million that each co...,0.168620
2020-12-24,the first bitcoin card and get for my referral...,0.364920
2020-11-13,works with bitcoin square good day do you know...,0.152463
2020-05-14,very promising project so make hay while the s...,0.185479


In [None]:
# Preview of the training split (same frame as displayed by the previous cell)
train_df

Unnamed: 0,text,label
2020-02-26,price analysis small with strong futuristic bi...,0.100759
2020-03-23,bitcoin cash kraken kraken bitcoin may be uniq...,0.108371
2020-01-21,over higher than bitcoin bitcoin btc de usar b...,0.041350
2020-03-16,en de ser las de un se btc con bitcoin en de s...,0.149540
2020-10-06,up for bitcoin para la de bitcoin en tu local ...,0.013212
...,...,...
2020-11-20,throughout the world with million that each co...,0.168620
2020-12-24,the first bitcoin card and get for my referral...,0.364920
2020-11-13,works with bitcoin square good day do you know...,0.152463
2020-05-14,very promising project so make hay while the s...,0.185479


In [None]:
# Create Dataset objects from the split dataframes.
# preserve_index=False keeps the date index out of the datasets; otherwise it is
# carried along as an extra `__index_level_0__` column that serves no purpose here.
train_dataset = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['text', 'label']], preserve_index=False)

In [None]:
# Tokenize the text field in the dataset
def tokenize_function(tokenizer, examples, max_length=512):
    """Tokenize a batch of examples to a fixed length.

    Args:
        tokenizer (callable): Tokenizer called on the list of texts; must return a
            mapping containing ``input_ids`` and ``attention_mask``.
        examples (dict): Batch with a ``text`` and a ``label`` entry.
        max_length (int): Length every sequence is padded AND truncated to.

    Returns:
        dict: ``input_ids``, ``attention_mask`` and the untouched ``label`` values.
    """
    # truncation=True is required: padding alone never shortens inputs, so texts
    # longer than max_length would otherwise stay over-length.
    encoded = tokenizer(
        examples["text"],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # Return only the necessary fields
    return {"input_ids": encoded["input_ids"], "attention_mask": encoded["attention_mask"], "label": examples["label"]}

In [None]:
# Load the tokenizer published with the ElKulako/cryptobert checkpoint and bind it as
# the first argument of tokenize_function, so the result can be passed to Dataset.map
# (which is called with the batch only).
tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")
partial_tokenize_function = partial(tokenize_function, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [None]:
# Tokenize the text in the datasets in batches; this adds `input_ids` and
# `attention_mask` columns next to the existing ones.
tokenized_train_dataset = train_dataset.map(partial_tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(partial_tokenize_function, batched=True)

Map:   0%|          | 0/51273 [00:00<?, ? examples/s]

Map:   0%|          | 0/12819 [00:00<?, ? examples/s]

In [None]:
# Keep only the first 2000 training and first 1000 test examples
# (presumably to keep the runtime manageable — the full splits are ~51k / ~13k rows).
tokenized_train_dataset = tokenized_train_dataset.select(range(2000))
tokenized_test_dataset = tokenized_test_dataset.select(range(1000))

### Customized CryptoBERT Model

In [None]:
class Model:
    """Abstract interface shared by the models in this notebook.

    Every method except the constructor raises ``NotImplementedError`` and is
    expected to be overridden by a concrete subclass.
    """

    def __init__(self, name):
        """
        Store a human-readable identifier for this model.

        Args:
        name (str): The name of the model.
        """
        self.name = name

    def train(self, data, labels):
        """
        Fit the model to ``data`` / ``labels``.

        Args:
        data (any): The data to train the model on.
        labels (any): The labels for the data.

        Raises:
        NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

    def predict(self, data):
        """
        Produce predictions for ``data``.

        Args:
        data (any): The data to make predictions on.

        Returns:
        any: The predictions (defined by the subclass).

        Raises:
        NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

    def evaluate(self, data, labels):
        """
        Score the model on ``data`` against ``labels``.

        Args:
        data (any): The data to evaluate the model on.
        labels (any): The labels for the data.

        Returns:
        any: The evaluation results (defined by the subclass).

        Raises:
        NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

    def compute_metrics(self, predictions, labels):
        """
        Derive metrics from predictions and their true labels.

        Args:
        predictions (any): The model's predictions.
        labels (any): The true labels.

        Returns:
        any: The computed metrics (defined by the subclass).

        Raises:
        NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

In [None]:
# Prefix for artifact paths; it is interpolated into the default `save_path` of
# CryptoBERT.__init__ at the moment the class is defined.
# NOTE(review): with an empty prefix that default is '/artifact/fine_tuned_model.pth',
# i.e. a path under the filesystem root — confirm this is intended.
base_address = ''

In [None]:
class CryptoBERT(Model):
    """Wrapper around a Hugging Face sequence-classification checkpoint.

    Supports two tasks, selected with ``input_task``:

    * ``"classification"`` - the checkpoint's own classification head is used.
    * ``"regression"`` - the head is replaced by a single-output head and the
      model is configured for regression.

    The model can be trained/evaluated either with the hand-written loops
    (:meth:`train`, :meth:`evaluate`) or through a ``transformers.Trainer``
    built by :meth:`get_trainer`.
    """

    _SUPPORTED_TASKS = ("classification", "regression")

    def __init__(self, model_addr="ElKulako/cryptobert", save_path=f'{base_address}/artifact/fine_tuned_model.pth', load_path=None, load_state_dict=False, input_task="classification"):
        """
        Load the checkpoint and adapt its head to the requested task.

        Args:
        model_addr (str): Hub id or local path of the checkpoint.
        save_path (str): Where artifacts are written; also used as the default
            Trainer ``output_dir``. The default is built from ``base_address``
            once, when the class body is executed.
        load_path (str, optional): State-dict file to restore when
            ``load_state_dict`` is True.
        load_state_dict (bool): Restore weights from ``load_path`` after loading
            the checkpoint.
        input_task (str): ``"classification"`` or ``"regression"``.

        Raises:
        ValueError: For an unsupported ``input_task`` or when ``load_state_dict``
            is requested without a ``load_path``.
        """
        super().__init__("huggingface ElKulako/cryptobert")
        if input_task not in self._SUPPORTED_TASKS:
            raise ValueError(
                f"Unsupported input_task {input_task!r}; expected one of {self._SUPPORTED_TASKS}."
            )
        if load_state_dict and load_path is None:
            raise ValueError("load_state_dict=True requires load_path to be set.")

        self.model_addr = model_addr
        self.save_path = save_path
        self.load_path = load_path
        self.input_task = input_task

        # Load configuration
        config = AutoConfig.from_pretrained(model_addr)

        # Adjust configuration for regression task
        if input_task == "regression":
            config.num_labels = 1
            # Without this the model can still pick its classification
            # (cross-entropy) loss when labels are passed, e.g. by Trainer, which
            # fails on float targets with an "nll_loss ... not implemented for
            # 'Float'" error. Declaring the problem type selects the MSE loss.
            config.problem_type = "regression"

        # Load model with modified configuration. ignore_mismatched_sizes lets the
        # resized head be freshly initialised instead of raising on the shape
        # mismatch with the checkpoint; it is needed on both code paths.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_addr, config=config, ignore_mismatched_sizes=True
        )
        if load_state_dict:
            # Load onto CPU first so a GPU-saved state dict also restores on
            # CPU-only machines; train()/evaluate() move the model to the device.
            self.model.load_state_dict(torch.load(self.load_path, map_location="cpu"))

    def train(self, dataloader, device, learning_rate=1e-5, epochs=5):
        """
        Fine-tune the model with a plain PyTorch loop.

        Args:
        dataloader (iterable): Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device (torch.device): Device the model and batches are moved to.
        learning_rate (float): AdamW learning rate.
        epochs (int): Number of passes over ``dataloader``.

        Returns:
        dict: ``{epoch_index: metrics}`` where ``metrics`` is the dict returned by
            the task's ``compute_metrics_*`` method plus the mean training
            ``"loss"`` of that epoch.

        Raises:
        ValueError: If ``dataloader`` yields no batches.
        """
        results = {}
        is_classification = self.input_task == "classification"
        # Move the model to the device
        self.model.to(device)
        # Set up the optimizer
        # NOTE(review): this is the AdamW imported from transformers, which is
        # deprecated in favour of torch.optim.AdamW — consider migrating.
        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        for epoch in tqdm(range(epochs)):  # Number of epochs
            # from_pretrained returns the model in eval mode; switch to train mode
            # explicitly so dropout is active while fine-tuning.
            self.model.train()
            all_labels = []
            all_preds = []
            all_probs = []  # Class probabilities (classification only)
            losses = []
            for batch in tqdm(dataloader):
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                if is_classification:
                    outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                else:
                    outputs = self.model(input_ids, attention_mask=attention_mask)
                    # squeeze(-1) drops only the output dimension, so a batch of
                    # size 1 keeps its batch axis.
                    loss = nn.MSELoss()(outputs.logits.squeeze(-1), labels.float())
                loss.backward()
                optimizer.step()
                losses.append(loss.item())

                # Store labels and predictions for metrics calculation
                logits = outputs.logits.detach()
                if is_classification:
                    probs = torch.nn.functional.softmax(logits, dim=-1)
                    all_probs.append(probs.cpu().numpy())
                    batch_preds = torch.argmax(probs, dim=-1)
                else:
                    batch_preds = logits.squeeze(-1)  # For regression, use logits directly
                all_preds.append(batch_preds.cpu().numpy())
                all_labels.append(labels.detach().cpu().numpy())

            if not all_labels:
                raise ValueError("dataloader yielded no batches; nothing to train on.")

            # Calculate and log metrics after each epoch
            all_labels = np.concatenate(all_labels)
            all_preds = np.concatenate(all_preds)
            if is_classification:
                all_probs = np.concatenate(all_probs)
                epoch_metrics = self.compute_metrics_classification(all_labels, all_preds, all_probs)
            else:
                epoch_metrics = self.compute_metrics_regression(all_labels, all_preds)
            epoch_metrics["loss"] = float(np.mean(losses))
            results[epoch] = epoch_metrics

            # Save the model after each epoch
            # torch.save(self.model.state_dict(), self.save_path)

        # metrics for each epoch
        return results

    def evaluate(self, dataloader, device):
        """
        Evaluate the model with a plain PyTorch loop (no gradient tracking).

        Args:
        dataloader (iterable): Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device (torch.device): Device the model and batches are moved to.

        Returns:
        dict: Metrics from ``compute_metrics_classification`` or
            ``compute_metrics_regression``, depending on the task.

        Raises:
        ValueError: If ``dataloader`` yields no batches.
        """
        is_classification = self.input_task == "classification"
        self.model.to(device)
        # Make sure dropout is disabled, e.g. after a preceding train() call.
        self.model.eval()
        all_labels = []
        all_preds = []
        all_probs = []  # Class probabilities (classification only)
        with torch.no_grad():
            for batch in tqdm(dataloader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = self.model(input_ids, attention_mask=attention_mask)
                if is_classification:
                    # Get the predicted probabilities from the model's outputs
                    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
                    all_probs.append(probs.cpu().numpy())
                    # Convert the probabilities to class labels; these (not the
                    # probabilities) are what the metrics compare against labels.
                    batch_preds = torch.argmax(probs, dim=-1)
                else:
                    batch_preds = outputs.logits.squeeze(-1)  # For regression, use logits directly
                all_preds.append(batch_preds.cpu().numpy())
                all_labels.append(labels.cpu().numpy())

        if not all_labels:
            raise ValueError("dataloader yielded no batches; nothing to evaluate.")

        # Calculate metrics
        all_labels = np.concatenate(all_labels)
        all_preds = np.concatenate(all_preds)
        if is_classification:
            all_probs = np.concatenate(all_probs)
            return self.compute_metrics_classification(all_labels, all_preds, all_probs)
        return self.compute_metrics_regression(all_labels, all_preds)

    @staticmethod
    def compute_metrics_classification(labels, preds, probs):
        """
        Compute classification metrics based on the model's predictions and the true labels.

        Args:
        labels (array-like): The true class ids.
        preds (array-like): The predicted class ids.
        probs (array-like): The predicted class probabilities (accepted for
            interface compatibility; not used by the metrics below).

        Returns:
        dict: ``accuracy``, macro-averaged ``f1`` / ``precision`` / ``recall`` and
            the ``confusion_matrix``.
        """
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
        acc = accuracy_score(labels, preds)

        # Compute confusion matrix
        conf_matrix = confusion_matrix(labels, preds)

        return {
            "accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            'confusion_matrix': conf_matrix
        }

    @staticmethod
    def compute_metrics_regression(labels, preds):
        """
        Compute regression metrics based on the model's predictions and the true labels.

        Args:
        labels (array-like): The true target values.
        preds (array-like): The predicted values.

        Returns:
        dict: ``mean_absolute_error`` and ``mean_squared_error``.
        """
        return {
            "mean_absolute_error": mean_absolute_error(labels, preds),
            "mean_squared_error": mean_squared_error(labels, preds)
        }

    def get_trainer(self, eval_dataset, train_dataset=None, training_args=None):
        """
        Build a ``transformers.Trainer`` around the wrapped model.

        Args:
        eval_dataset: Dataset used by ``Trainer.evaluate``.
        train_dataset (optional): Dataset used by ``Trainer.train``.
        training_args (TrainingArguments, optional): Training configuration. When
            omitted, default arguments with ``output_dir=self.save_path`` are used.

        Returns:
        Trainer: Trainer with a task-specific ``compute_metrics`` function.
        """
        print(f'the input task: {self.input_task}')

        def compute_metrics_regression(pred):
            labels = pred.label_ids
            # reshape(-1) flattens (N, 1) predictions and, unlike squeeze(),
            # keeps a 1-D array even when there is a single example.
            preds = pred.predictions.reshape(-1)
            return {
                'mean_absolute_error': mean_absolute_error(labels, preds),
                'mean_squared_error': mean_squared_error(labels, preds)
            }

        def compute_metrics_classification(pred):
            labels = pred.label_ids
            preds = pred.predictions.argmax(-1)
            probs = softmax(pred.predictions, axis=1)
            precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
            return {
                'accuracy': accuracy_score(labels, preds),
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'roc_auc': roc_auc_score(labels, probs, multi_class='ovr'),
                # Plain nested list instead of an ndarray so the metrics stay
                # JSON-serialisable when the Trainer logs or saves its state.
                'confusion_matrix': confusion_matrix(labels, preds).tolist()
            }

        # Choose compute_metrics function based on the task type
        # (input_task was validated in __init__, so one branch always applies).
        if self.input_task == "classification":
            compute_metrics_func = compute_metrics_classification
        else:
            compute_metrics_func = compute_metrics_regression

        print(f'the compute metric fun: {compute_metrics_func}')
        # Define Trainer arguments unless the caller supplied them
        if training_args is None:
            training_args = TrainingArguments(
                output_dir=self.save_path,
            )

        # Create Trainer instance
        trainer = Trainer(
            model=self.model,                       # the wrapped model
            args=training_args,                     # training arguments
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,              # test dataset
            compute_metrics=compute_metrics_func,   # the compute_metrics function
            callbacks=[]
        )

        return trainer


### Base Model Evaluation

In [None]:
# 5. Evaluation of Base CryptoBERT Model
# The checkpoint is loaded with a single-output regression head. According to the
# load warning below, that head is newly initialised, so this "base" model has
# pretrained encoder weights but an untrained output layer.
base_model = CryptoBERT(input_task='regression')

config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ElKulako/cryptobert and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([1, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build a Trainer around the base model, using the tokenized test set for evaluation.
# NOTE(review): the output recorded for this cell is a RuntimeError raised by the loss
# (nll_loss on float labels); the custom evaluate() call further down is the path
# that produced the reported base-model metrics.
base_model_trainer = base_model.get_trainer(tokenized_test_dataset)
# Evaluate the base model
base_model_eval_result = base_model_trainer.evaluate()

the input task: regression
the compute metric fun: <function CryptoBERT.get_trainer.<locals>.compute_metrics_regression at 0x7ebc6a1153f0>


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'

In [None]:
# Inspect the columns and row counts of both tokenized datasets
tokenized_test_dataset, tokenized_train_dataset

(Dataset({
     features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
     num_rows: 1000
 }),
 Dataset({
     features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
     num_rows: 2000
 }))

In [None]:
# Column names of the tokenized training set
tokenized_train_dataset.column_names

['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask']

In [None]:
class TextDataset(torchDS):
    """Torch dataset view over an indexable collection of tokenized records.

    Each record must provide ``input_ids``, ``attention_mask`` and ``label``;
    items are returned as tensors with the label exposed under ``labels``.
    """

    def __init__(self, hf_dataset):
        """Keep a reference to the underlying tokenized dataset."""
        self.hf_dataset = hf_dataset

    def __len__(self):
        """Number of records in the underlying dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict of tensors."""
        record = self.hf_dataset[idx]
        sample = {
            key: torch.tensor(record[key])
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(record['label'])
        return sample

In [None]:
# Remove the '__index_level_0__' column from the training dataset, if present
# (the guard makes re-running this cell a no-op).
if '__index_level_0__' in tokenized_train_dataset.column_names:
  tokenized_train_dataset = tokenized_train_dataset.remove_columns('__index_level_0__')

In [None]:
# Remove the '__index_level_0__' column from the test dataset, if present.
# The membership test must be made against `.column_names`: testing against the
# Dataset object itself compares the string with each row and is never true, so the
# column would silently stay in place.
if '__index_level_0__' in tokenized_test_dataset.column_names:
    tokenized_test_dataset = tokenized_test_dataset.remove_columns('__index_level_0__')

In [None]:
# Confirm the remaining columns of the tokenized training set
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2000
})

In [None]:
# Wrap the tokenized datasets so a torch DataLoader receives tensor batches.
# NOTE: `train_dataset` / `test_dataset` are rebound here — from this cell on they are
# TextDataset wrappers, no longer the Hugging Face datasets created earlier.
train_dataset = TextDataset(tokenized_train_dataset)
test_dataset = TextDataset(tokenized_test_dataset)
test_dataset

<__main__.TextDataset at 0x7ebc5cdf5f90>

In [None]:
# Settings shared by the custom loops: batch size and the device to run on
# (GPU when available, otherwise CPU).
batch_size = 5
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Create DataLoader
eval_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Evaluate the model using the DataLoader
# (baseline numbers for the not-yet-fine-tuned model, via the custom evaluate loop)
base_model_eval_results = base_model.evaluate(dataloader=eval_dataloader, device=device)

# Print evaluation results
print(base_model_eval_results)

  0%|          | 0/200 [00:00<?, ?it/s]

{'mean_absolute_error': 0.2945863, 'mean_squared_error': 0.11513738}


### Fine-tuning Process

In [None]:
# Instantiate the CryptoBERT model for regression task.
# This is a second, independent instance, so fine-tuning does not modify `base_model`.
fine_tuned_model = CryptoBERT(input_task='regression')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ElKulako/cryptobert and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([1, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Use TrainingArgument and Trainer

In [None]:
# Define training arguments
# NOTE(review): `training_args` is created here but never handed to the Trainer —
# get_trainer() below is called without it, so these settings have no effect on
# the run as written.
training_args = TrainingArguments(
    output_dir="./regression_results",   # Output directory
    num_train_epochs=5,                  # Total number of epochs
    per_device_train_batch_size=32,      # Batch size per device during training
    save_steps=-1,                       # No saving of model checkpoints during training
    logging_steps=100,                   # Log training progress every 100 steps
    evaluation_strategy="epoch"          # Evaluate at the end of each epoch
)

# Create Trainer for fine-tuning
trainer = fine_tuned_model.get_trainer(train_dataset=tokenized_train_dataset, eval_dataset=tokenized_test_dataset)

# Fine-tune the model
# NOTE(review): the output recorded for this cell is a RuntimeError raised by the loss
# (nll_loss on float labels).
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


the input task: regression
the compute metric fun: <function CryptoBERT.get_trainer.<locals>.compute_metrics_regression at 0x7ebc1755e3b0>


RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'

In [None]:
# Evaluate the Trainer-based model on the tokenized test set.
# NOTE(review): no output is recorded for this cell; it depends on the Trainer run
# in the previous cell, whose recorded output is an error.
eval_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

# Print evaluation results
print(eval_results)

### Use Custom Train and Evaluate Functions

In [None]:
# Create DataLoader
# Shuffle the training examples every epoch so batches differ between epochs;
# the seeded generator keeps the shuffling order reproducible across runs.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

# Train the model
train_results = fine_tuned_model.train(
    dataloader=train_dataloader,
    device=device
)



  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

In [None]:
# Same batch size and device selection as in the base-model evaluation cell
# (re-declared here with identical values).
batch_size = 5
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Create DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Evaluate the model using the DataLoader
# (same test data as the base-model evaluation, so the two results are comparable)
fine_tuned_model_test_results = fine_tuned_model.evaluate(dataloader=test_dataloader, device=device)

# Print test results
print(fine_tuned_model_test_results)

  0%|          | 0/200 [00:00<?, ?it/s]

{'mean_absolute_error': 0.112910084, 'mean_squared_error': 0.026027165}
