In [1]:
import os
import sys
import datetime
import random
from datetime import timedelta
from typing import List, Callable
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold
from scipy.special import softmax
import torch
from torch import nn
from torch.utils.data import DataLoader
from datasets import Dataset, ClassLabel

from datasets import Dataset as HuggingfaceDataset
from torch.utils.data import DataLoader, Dataset as torchDS
from functools import partial
import torch
import re
import string
import emoji
import time
import neptune

from itertools import cycle
from sklearn.preprocessing import label_binarize
from transformers import AutoModelForSequenceClassification, AutoConfig, Trainer, TrainingArguments
from transformers.integrations import NeptuneCallback
from scipy.special import softmax
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR
from typing import Optional

In [2]:
class Model:
    """
    Abstract base class for models.

    Stores a display name and declares the four operations concrete models
    are expected to provide. Every operation raises ``NotImplementedError``
    here.
    """

    # Message shared by all un-implemented operations.
    _ABSTRACT_MSG = "Subclasses must implement this method."

    def __init__(self, name):
        """
        Create the model.

        Args:
        name (str): The name of the model.
        """
        self.name = name

    def train(self, data, labels):
        """
        Fit the model to ``data`` / ``labels``.

        Args:
        data (any): The data to train the model on.
        labels (any): The labels for the data.

        Raises:
        NotImplementedError: Always; subclasses provide the training logic.
        """
        raise NotImplementedError(Model._ABSTRACT_MSG)

    def predict(self, data):
        """
        Produce predictions for ``data``.

        Args:
        data (any): The data to make predictions on.

        Returns:
        any: The predictions (in subclasses).

        Raises:
        NotImplementedError: Always; subclasses provide the prediction logic.
        """
        raise NotImplementedError(Model._ABSTRACT_MSG)

    def evaluate(self, data, labels):
        """
        Score the model on ``data`` / ``labels``.

        Args:
        data (any): The data to evaluate the model on.
        labels (any): The labels for the data.

        Returns:
        any: The evaluation results (in subclasses).

        Raises:
        NotImplementedError: Always; subclasses provide the evaluation logic.
        """
        raise NotImplementedError(Model._ABSTRACT_MSG)

    def compute_metrics(self, predictions, labels):
        """
        Compare ``predictions`` with the true ``labels``.

        Args:
        predictions (any): The model's predictions.
        labels (any): The true labels.

        Returns:
        any: The computed metrics (in subclasses).

        Raises:
        NotImplementedError: Always; subclasses provide the metric logic.
        """
        raise NotImplementedError(Model._ABSTRACT_MSG)

class CryptoBERT(Model):
    """
    Wrapper around a Hugging Face sequence-classification model.

    Bundles a single-pass training loop, a single-pass evaluation loop,
    metric helpers and plotting utilities. The default checkpoint is
    ``"ElKulako/cryptobert"``.
    """

    def __init__(self, model_addr="ElKulako/cryptobert", save_path='./artifact/fine_tuned_model.pth', load_path=None, load_state_dict=False, input_task='classification'):
        """
        Load the configuration and the model weights.

        Args:
        model_addr (str): Hugging Face model id or local path passed to ``from_pretrained``.
        save_path (str): Path stored on the instance for later use by callers.
        load_path (str | None): Path of a ``state_dict`` file; required when ``load_state_dict`` is True.
        load_state_dict (bool): When True, the weights in ``load_path`` are loaded on top of the checkpoint.
        input_task (str): ``"classification"`` (3 labels) or ``"regression"`` (1 label).
            Any other value leaves the checkpoint's own ``num_labels`` untouched.

        Raises:
        ValueError: If ``load_state_dict`` is True but ``load_path`` is None.
        """
        super().__init__(f"huggingface {model_addr}")
        self.model_addr = model_addr
        self.save_path = save_path
        self.load_path = load_path
        self.input_task = input_task
        # Result containers; this class only initialises them, callers fill them.
        self.metrics = {}
        self.labels = {}
        self.preds = {}
        self.probs = {}
        self.eval_labels = {}
        self.eval_preds = {}
        self.eval_probs = {}

        # Fail before any download if the arguments are inconsistent.
        if load_state_dict and load_path is None:
            raise ValueError("load_path must be provided when load_state_dict=True")

        config = AutoConfig.from_pretrained(model_addr)
        if input_task == "regression":
            config.num_labels = 1
        elif input_task == "classification":
            config.num_labels = 3

        # ignore_mismatched_sizes lets the head be re-initialised when the
        # checkpoint's head size differs from config.num_labels. Both loading
        # paths need it, otherwise the state-dict path fails before the saved
        # weights can be applied.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_addr, config=config, ignore_mismatched_sizes=True
        )
        if load_state_dict:
            # map_location="cpu" allows loading weights saved from a GPU on a
            # CPU-only machine; load_state_dict copies them into the model.
            self.model.load_state_dict(torch.load(self.load_path, map_location="cpu"))

    def predict(self, data):
        """Not implemented for this class; always raises ``NotImplementedError``."""
        raise NotImplementedError("Subclasses must implement this method.")

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the parent directory of ``path`` if the path has one."""
        parent = os.path.dirname(path)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def save_model(self, path):
        """Write the model's ``state_dict`` to ``path``, creating the directory if needed."""
        self._ensure_parent_dir(path)
        torch.save(self.model.state_dict(), path)

    def load_model(self, path):
        """
        Load a ``state_dict`` from ``path`` into the wrapped model.

        Returns:
        The wrapped model with the loaded weights.
        """
        model_state = torch.load(path, map_location="cpu")
        self.model.load_state_dict(model_state)
        return self.model

    def _log_running_metrics(self, neptune_run, model_name, all_labels, all_preds, all_probs, loss_value):
        """Append running metrics (over all batches seen so far) and the batch loss to Neptune."""
        neptune_metrics = ["accuracy", "precision", "f1", "recall"]
        metrics = self.compute_metrics_classification(np.concatenate(all_labels), np.concatenate(all_preds), np.concatenate(all_probs), neptune_metrics)
        for metric_name in neptune_metrics:
            neptune_run[f"{model_name}/{metric_name}"].append(metrics.get(metric_name))
        neptune_run[f"{model_name}/loss"].append(loss_value)

    def train(self, dataloader, device, optimizer, scheduler, learning_rate=2e-5, model_name="train", neptune_run=None):
        """
        Run one optimisation pass over ``dataloader``.

        The model is switched to training mode. The caller is responsible for
        moving the model to ``device`` and for looping over epochs.

        Args:
        dataloader (DataLoader): Yields dicts with 'input_ids', 'attention_mask' and 'labels' tensors.
        device (torch.device): The device the batch tensors are moved to.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        learning_rate (float): Unused; kept for backward compatibility (the optimizer carries the rate).
        model_name (str): Prefix of the Neptune series names.
        neptune_run: Optional Neptune run; when truthy, running metrics are logged after every batch.

        Returns:
        Tuple[List, List, List, List]: The labels, predictions, probabilities, and losses for each batch.
        """
        all_labels = []
        all_preds = []
        all_probs = []
        all_losses = []

        # Models returned by from_pretrained start in eval mode; enable dropout etc.
        self.model.train()

        for batch in tqdm(dataloader, desc=f"Training Progress...", leave=False, dynamic_ncols=True):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Store labels, predictions and probabilities for metrics calculation
            preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
            class_preds = torch.argmax(preds, dim=-1)

            all_probs.append(preds.detach().cpu().numpy())
            all_preds.append(class_preds.cpu().detach().numpy())
            all_labels.append(labels.cpu().detach().numpy())
            all_losses.append(loss.item())

            if neptune_run:
                self._log_running_metrics(neptune_run, model_name, all_labels, all_preds, all_probs, loss.item())

        return all_labels, all_preds, all_probs, all_losses


    def evaluate(self, dataloader, device, model_name="base", neptune_run=None):
        """
        Run one forward-only pass over ``dataloader``.

        The model is switched to evaluation mode and gradients are disabled.

        Args:
        dataloader (DataLoader): Yields dicts with 'input_ids', 'attention_mask' and 'labels' tensors.
        device (torch.device): The device the batch tensors are moved to.
        model_name (str): Prefix of the Neptune series names.
        neptune_run: Optional Neptune run; when truthy, running metrics are logged after every batch.

        Returns:
        Tuple[List, List, List, list]: The labels, predictions, probabilities, losses for each batch.
        """
        all_labels = []
        all_preds = []
        all_probs = []
        all_losses = []

        # Disable dropout so results are deterministic even after a train() pass.
        self.model.eval()

        for batch in tqdm(dataloader, desc="Evaluating Progress...", leave=False, dynamic_ncols=True):
            with torch.no_grad():
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

                # Probabilities from logits, then the most likely class per row
                preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
                class_preds = torch.argmax(preds, dim=-1)

                all_probs.append(preds.cpu().numpy())
                all_preds.append(class_preds.cpu().numpy())
                all_labels.append(labels.cpu().numpy())
                all_losses.append(loss.item())

            if neptune_run:
                self._log_running_metrics(neptune_run, model_name, all_labels, all_preds, all_probs, loss.item())

        return all_labels, all_preds, all_probs, all_losses

    @staticmethod
    def compute_metrics_classification(labels, preds, probs, metrics_to_return=None):
        """
        Compute classification metrics based on the model's predictions and the true labels.

        Args:
        labels (any): The true labels.
        preds (any): The model's predictions.
        probs (any): The model's probabilities (only used for "roc_score").
        metrics_to_return (list): Names of the metrics to compute. Defaults to
            accuracy, f1, precision, recall, roc_score and confusion_matrix.

        Returns:
        dict: The requested metrics keyed by name. Precision, recall and f1 are macro-averaged.
        """
        if metrics_to_return is None:
            metrics_to_return = ["accuracy", "f1", "precision", "recall", "roc_score", "confusion_matrix"]

        metrics = {}

        if "precision" in metrics_to_return or "recall" in metrics_to_return or "f1" in metrics_to_return:
            precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
            if "precision" in metrics_to_return:
                metrics["precision"] = precision
            if "recall" in metrics_to_return:
                metrics["recall"] = recall
            if "f1" in metrics_to_return:
                metrics["f1"] = f1

        if "accuracy" in metrics_to_return:
            metrics["accuracy"] = accuracy_score(labels, preds)

        if "roc_score" in metrics_to_return:
            metrics["roc_score"] = roc_auc_score(labels, probs, multi_class='ovr')

        if "confusion_matrix" in metrics_to_return:
            metrics["confusion_matrix"] = confusion_matrix(labels, preds)

        return metrics


    @staticmethod
    def compute_metrics_regression(labels, preds):
        """
        Compute regression metrics based on the model's predictions and the true labels.

        Declared static so it can be called on the class or on an instance;
        without the decorator an instance call would pass ``self`` as ``labels``.

        Args:
        labels (any): The true labels.
        preds (any): The model's predictions.

        Returns:
        dict: ``{"mean_absolute_error": ..., "mean_squared_error": ...}``.
        """
        mae = mean_absolute_error(labels, preds)
        mse = mean_squared_error(labels, preds)

        metrics = {
            "mean_absolute_error": mae,
            "mean_squared_error": mse
        }

        return metrics

    @staticmethod
    def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
        """
        Build a ``LambdaLR`` whose multiplier rises linearly from 0 to 1 over
        ``num_warmup_steps`` and then decays linearly to 0 at ``num_training_steps``.
        """
        def lr_lambda(current_step):
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
        return LambdaLR(optimizer, lr_lambda)


    @staticmethod
    def plot_confusion_matrix(path, labels, preds):
        """
        Plot the confusion matrix for the given labels and predictions and save it.

        The display labels are fixed to 'Down', 'Neutral', 'Up', so the data is
        expected to contain exactly three classes.

        Args:
        path (str): File path of the saved figure; its directory is created if needed.
        labels (list): The true labels.
        preds (list): The predicted labels.

        Returns:
        None
        """
        CryptoBERT._ensure_parent_dir(path)

        conf_matrix = confusion_matrix(labels, preds)
        disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['Down', 'Neutral', 'Up'])
        fig, ax = plt.subplots(figsize=(10, 10))
        disp.plot(ax=ax, cmap='Blues', values_format='d')
        plt.title('Confusion Matrix')
        plt.savefig(path)
        plt.close()

    @staticmethod
    def plot_roc_curve(path, labels, probs):
        """
        Plot one-vs-rest ROC curves (one per class) and save the figure.

        Args:
        path (str): File path of the saved figure; its directory is created if needed.
        labels (list): The true labels.
        probs (array-like): Predicted probabilities, one column per class.

        Returns:
        None
        """
        CryptoBERT._ensure_parent_dir(path)

        # Column indexing below needs an array, not a nested list.
        probs = np.asarray(probs)

        plt.figure()
        # Binarize the labels for multi-class ROC AUC.
        # NOTE(review): with only two distinct labels label_binarize returns a
        # single column, which would not line up with a 3-column probs array.
        labels = label_binarize(labels, classes=np.unique(labels))
        n_classes = labels.shape[1]

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(labels[:, i], probs[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Plot all ROC curves
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=2,
                     label='ROC curve of class {0} (area = {1:0.2f})'
                     ''.format(i, roc_auc[i]))

        # Chance diagonal
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")

        plt.savefig(path)
        plt.close()

In [3]:
class HFDataset(HuggingfaceDataset):
    """
    Hugging Face ``Dataset`` subclass bundling tweet-cleaning and
    tokenisation helpers. All helpers except ``preprocess`` are static.
    """

    # Characters deleted by remove_punctuations: everything in
    # string.punctuation except ! ? % $ &
    _STRIPPED_PUNCTUATION = frozenset(string.punctuation) - frozenset("!?%$&")

    # Emoticon patterns. Regex metacharacters (* ^ | ( ) are escaped so each
    # pattern matches the literal emoticon; unescaped, several of them could
    # never match and two matched the empty string at the start of every text.
    _EMOTICON_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r":-?\)+",                   # :) or :-)  smiling face
            r":-?\(+",                   # :( or :-(  frowning face
            r";-?\)+",                   # ;) or ;-)  wink
            r"(?<!\w):-?[dD]+(?!\w)",    # :D or :-D  grin (either case: text is lowercased first)
            r"(?<!\w):-?[pP]+(?!\w)",    # :P or :-P  tongue out (either case)
            r":=\)",                     # :=)
            r"(?<!\()/:",                # /:   (not when part of "(/:")
            r"\*-\*",                    # *-*
            r":=(?!\))",                 # :=   (not when part of ":=)")
            r":\"",                      # :"
            r"\*_\*",                    # *_*
            r"\(/:",                     # (/:
            r"\|\|_",                    # ||_  NOTE(review): literal reading of the original pattern - confirm
            r"\^_\^",                    # ^_^
            r":\*-\^",                   # :*-^
            r"\^-\^",                    # ^-^
            r":\*_\^",                   # :*_^
            r"\^_\*",                    # ^_*
            r"\^-\*",                    # ^-*
        )
    )

    def preprocess(self, return_emojis = False, return_hashtags = False):
        """
        Clean the ``text`` column with a batched ``map``.

        Each text is lowercased, stripped of URLs, user mentions and most
        punctuation, and has Bitcoin spellings normalised to ``BTC``.
        Advertisement detection (``is_ads``) is not applied here.

        Args:
        return_emojis (bool): Also return an ``emojis`` column holding, per row,
            the emojis and emoticons found in that row's text.
        return_hashtags (bool): Also return a ``hashtags`` column holding, per
            row, the hashtags found in that row's text.

        Returns:
        The dataset returned by ``self.map``.
        """
        def run_all_preprocess_functions(item):
            cleaned_texts = []
            emojis_per_row = []
            hashtags_per_row = []
            for text in item['text']:
                text = HFDataset.lowercase_tweet(text)
                text = HFDataset.remove_URL(text)
                # Extract before punctuation is stripped. One list per row so
                # every returned column has the same length as the batch.
                emojis_per_row.append(HFDataset.extract_emojis_and_emoticons(text))
                hashtags_per_row.append(HFDataset.extract_hashtags(text))
                text = HFDataset.remove_user_ids(text)
                text = HFDataset.remove_punctuations(text)
                cleaned_texts.append(HFDataset.replace_with_BTC(text))

            result = {"text" : cleaned_texts}
            if return_emojis:
                result["emojis"] = emojis_per_row
            if return_hashtags:
                result["hashtags"] = hashtags_per_row
            return result

        return self.map(run_all_preprocess_functions, batched = True)

    @staticmethod
    def tokenize(tokenizer, dataset):
        """
        Tokenize the ``text`` column of ``dataset``.

        Every sequence is padded and truncated to exactly 512 tokens so all
        rows have the same length.

        Args:
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'.
        dataset: Dataset with 'text' and 'label' columns.

        Returns:
        The mapped dataset with 'input_ids', 'attention_mask' and 'label'.
        """
        def tokenize_function(tokenizer, item):
            # truncation=True caps over-long texts at max_length; padding alone
            # would leave them longer than 512 tokens.
            encoded = tokenizer(item["text"], padding="max_length", truncation=True, max_length=512)
            return {"input_ids": encoded["input_ids"], "attention_mask": encoded["attention_mask"], "label": item["label"]}

        partial_tokenize_function = partial(tokenize_function, tokenizer)
        tokenized_datasets = dataset.map(partial_tokenize_function, batched=True)

        return tokenized_datasets

    @staticmethod
    def remove_URL(text):
        """Delete substrings that start with http://, https:// or www. and contain a dot."""
        return re.sub(r"(?:https?://|www\.)\S+\.\S+", "", text)

    @staticmethod
    def lowercase_tweet(text):
        """Return ``text`` in lower case."""
        return text.lower()

    @staticmethod
    def remove_punctuations(text):
        """Delete every ``string.punctuation`` character except ! ? % $ &."""
        return ''.join(ch for ch in text if ch not in HFDataset._STRIPPED_PUNCTUATION)

    @staticmethod
    def extract_hashtags(text):
        """Return the hashtags in ``text`` without the leading '#'."""
        return re.findall(r"#(\w+)", text)

    @staticmethod
    def replace_with_BTC(text):
        """Replace the spellings Bitcoin, bitcoin, btc and BitCoin with 'BTC'."""
        return re.sub(r"Bitcoin|bitcoin|btc|BitCoin", "BTC", text)

    @staticmethod
    def remove_user_ids(text):
        """Delete @mentions ('@' followed by word characters)."""
        return re.sub(r"@\w+", "", text)

    @staticmethod
    def extract_emojis_and_emoticons(text):
        """
        Return the distinct emojis in ``text`` followed by every emoticon
        occurrence matched by the class-level emoticon patterns.
        """
        emojis = emoji.distinct_emoji_list(text)
        emoticons = []
        for pattern in HFDataset._EMOTICON_PATTERNS:
            emoticons.extend(pattern.findall(text))
        return emojis + emoticons

    @staticmethod
    def is_ads(text, ads_keywords):
        """Return True if any of ``ads_keywords`` occurs as a substring of ``text``."""
        return any(ads_keyword in text for ads_keyword in ads_keywords)


class TextDataset(torchDS):
    """
    Torch ``Dataset`` adapter over an indexable collection of tokenised rows.

    Each row must expose 'input_ids', 'attention_mask' and 'label'; items are
    returned as tensors under 'input_ids', 'attention_mask' and 'labels'.
    """

    # (output key, source key) pairs, in output order.
    _FIELDS = (
        ('input_ids', 'input_ids'),
        ('attention_mask', 'attention_mask'),
        ('labels', 'label'),
    )

    def __init__(self, hf_dataset):
        """Keep a reference to the wrapped dataset."""
        self.hf_dataset = hf_dataset

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of tensors."""
        row = self.hf_dataset[idx]
        return {out_key: torch.tensor(row[src_key]) for out_key, src_key in self._FIELDS}

In [4]:
def pandas_data_loader(addr: str, columns: List[str], *transforms: Callable[[pd.DataFrame], pd.DataFrame]) -> pd.DataFrame:
    """
    Read selected columns of a CSV file and pipe the frame through transforms.

    Args:
        addr: Location of the CSV, passed to ``pd.read_csv``.
        columns: Columns to read (``usecols``).
        *transforms: Callables applied left to right; each receives the frame
            returned by the previous one and must return a frame.

    Returns:
        The frame returned by the last transform (or the raw frame if none).
    """
    result = pd.read_csv(addr, usecols=columns)
    for step in transforms:
        result = step(result)
    return result

# Transform index to datetime
def index_to_datetime(df, unit="s"):
    """
    Convert the index of ``df`` to datetimes, in place.

    Args:
        df: Frame whose index is replaced (the caller's object is modified).
        unit: Unit passed to ``pd.to_datetime`` (default ``"s"``).

    Returns:
        The same ``df`` object, for use in transform pipelines.
    """
    converted_index = pd.to_datetime(df.index, unit=unit)
    df.index = converted_index
    return df
# Transform col to index
def to_index(col, df):
    """Return ``df`` with column ``col`` set as its index (a new frame)."""
    return df.set_index(col)


# Rename a column (used below with "text_split" -> "text")
def rename(original, new, df):
    """Return ``df`` with column ``original`` renamed to ``new`` (a new frame)."""
    return df.rename(columns={original: new})

In [6]:
# Loading the price data
# Only the listed columns are read. `timestamp` becomes the index and is then
# converted to datetime with index_to_datetime's default unit ("s").
columns = ["timestamp", "close", "open", "high", "low", "volume"]
price_df = pandas_data_loader("../raw/daily-2020.csv", columns, partial(to_index, "timestamp"), index_to_datetime)

In [7]:
# Loading the labeled tweets: `date` becomes the index and is converted to
# datetime with unit "ns".
# NOTE(review): `columns` is reassigned here, replacing the price-column list.
# NOTE(review): "text_split" is not among the columns read, so the final rename
# step has nothing to rename (DataFrame.rename ignores missing labels by default).
columns = ["text", "sentiment_label", "date"]
text_df = pandas_data_loader("../raw/labeled_tweets.csv", columns, partial(to_index, "date"), partial(index_to_datetime, unit='ns'), partial(rename, "text_split", "text"))

In [8]:
# Keep only rows whose index label lies between 2020-01-01 and 2021-01-01.
# NOTE(review): overwrites text_df, so the unfiltered frame is only recoverable
# by re-running the loading cell; assumes the datetime index is sorted - confirm.
text_df = text_df["2020-01-01":"2021-01-01"]

In [9]:
class Labeler:
    """
    Abstract base class for labelers.

    Stores a display name and declares the ``fit`` / ``transform`` pair that
    concrete labelers are expected to provide. Both raise
    ``NotImplementedError`` here.
    """

    # Message shared by all un-implemented operations.
    _ABSTRACT_MSG = "Subclasses must implement this method."

    def __init__(self, name):
        """
        Create the labeler.

        Args:
        name (str): The name of the labeler.
        """
        self.name = name

    def fit(self, data):
        """
        Prepare the labeler from ``data``.

        Args:
        data (any): The data to fit the labeler to.

        Raises:
        NotImplementedError: Always; subclasses provide the fitting logic.
        """
        raise NotImplementedError(Labeler._ABSTRACT_MSG)

    def transform(self, data):
        """
        Turn ``data`` into labels.

        Args:
        data (any): The data to transform into labels.

        Returns:
        any: The labels (in subclasses).

        Raises:
        NotImplementedError: Always; subclasses provide the transformation logic.
        """
        raise NotImplementedError(Labeler._ABSTRACT_MSG)

In [10]:
class TripleBarrierLabeler(Labeler):
    # Labels price rows as up / neutral / down using an upper barrier, a lower
    # barrier and a vertical (time) barrier. Usage: fit(price_frame), then
    # transform() with no arguments.
    def __init__(self, volatility_period=7, upper_barrier_factor=1, lower_barrier_factor=1, vertical_barrier=7, min_trend_days=2, barrier_type='volatility', touch_type="HL", up_label=2, neutral_label=1, down_label=0):
        """
        Initialize the labeler.

        Args:
            volatility_period (int): Passed to get_daily_vol as the EWM span.
            upper_barrier_factor (float): Multiplier of the barrier width above the close.
            lower_barrier_factor (float): Multiplier of the barrier width below the close.
            vertical_barrier (int): Window length, in rows, used by transform.
            min_trend_days (int): A touch only produces a label when the row is more
                than this many rows past the window origin.
            barrier_type (str): 'volatility' or 'returns'; selects the column that
                sets the barrier width.
            touch_type (str): "HL" compares high/low with the barriers; any other
                value compares the close.
            up_label, neutral_label, down_label: Values written to the 'label' column.
        """
        super().__init__(name="triple barrier labeling")
        self.volatility_period = volatility_period
        self.upper_barrier_factor = upper_barrier_factor
        self.lower_barrier_factor = lower_barrier_factor
        self.vertical_barrier = vertical_barrier
        self.min_trend_days = min_trend_days
        self.barrier_type = barrier_type
        self.touch_type = touch_type
        self.up_label = up_label
        self.down_label = down_label
        self.neutral_label = neutral_label

    def calculate_barriers(self, df, i, window):
        """Write the upper and lower barriers for the window that starts at row i.

        The barrier width is the close at row i times the 'volatility' or
        'daily_returns' value at row i (chosen by barrier_type) times the
        respective factor.

        Args:
            df (pd.DataFrame): Data; modified in place and returned.
            i (int): Index label of the first row of the window.
            window (int): window size

        Returns:
            df: Data including barriers for the forward window

        Raises:
            ValueError: If barrier_type is neither 'volatility' nor 'returns'.
        """
        end_window = min(i+window, len(df)-1)  # Ensure the window does not exceed the dataframe

        # Read the row-i value that scales the barrier width
        if self.barrier_type == 'volatility':
            current_value = df.loc[i, 'volatility']
        elif self.barrier_type == 'returns':
            current_value = df.loc[i, 'daily_returns']
        else:
            raise ValueError("Invalid barrier_type. Choose either 'volatility' or 'returns'")

        # .loc slicing is label-based and inclusive: rows i..end_window all receive the barrier
        df.loc[i:end_window, 'upper_barrier'] = df.loc[i, 'close'] + (df.loc[i, 'close'] * current_value * self.upper_barrier_factor)
        df.loc[i:end_window, 'lower_barrier'] = df.loc[i, 'close'] - (df.loc[i, 'close'] * current_value * self.lower_barrier_factor)
        return df

    def label_observations(self, df, origin, i, label):
        """Set 'label' to `label` on rows origin..i+1 and return df.

        NOTE(review): .loc slicing is inclusive, so row i+1 (the next window's
        origin) is written as well - confirm this is intended.
        """
        df.loc[origin:i+1, 'label'] = label
        return df

    def get_daily_vol(self, close, span0=100):
        """
        Calculate daily returns and their EWM standard deviation.

        Parameters:
        - close: A pandas Series of closing prices with a datetime index.
        - span0: The span for the EWM standard deviation.

        Returns:
        - A tuple (daily_returns, daily_vol) of pandas Series.
        """
        # For each timestamp, locate the position of (timestamp - 1 day) in the index
        prev_day_start = close.index.searchsorted(close.index - pd.Timedelta(days=1))
        # Drop the leading entries that have no earlier observation
        prev_day_start = prev_day_start[prev_day_start > 0]

        # Map each remaining timestamp to the timestamp one position before the located one
        prev_day_start = pd.Series(close.index[prev_day_start - 1], index=close.index[close.shape[0] - prev_day_start.shape[0]:])

        # Calculate daily returns
        daily_returns = close.loc[prev_day_start.index] / close.loc[prev_day_start.values].values - 1

        # Calculate EWM standard deviation of daily returns
        daily_vol = daily_returns.ewm(span=span0).std()

        return daily_returns, daily_vol

    def fit(self, sdf):
        """Prepare the working frame from a copy of `sdf` and store it in self.data.

        Adds 'daily_returns' and 'volatility', resets the index to integer
        positions, and initialises 'label' (neutral) and 'window_start' (False).
        The input frame is not modified. Returns None.
        """
        df = sdf.copy()
        # Calculate daily returns and volatility
        df['daily_returns'], df['volatility'] = self.get_daily_vol(df.close, self.volatility_period)

        # transform() addresses rows by integer label and later restores the
        # "timestamp" column as index, so the input index must be named "timestamp"
        df = df.reset_index()
        # Initialize label and window start
        df['label'] = self.neutral_label
        df['window_start'] = False

        self.data = df

    def transform(self):
        """
        Label the frame prepared by fit().

        Takes no data argument (unlike the base class). On return, self.data
        has been re-indexed by "timestamp".
        NOTE(review): because of that re-indexing, a second call without a new
        fit() would run against a non-integer index.

        Returns:
        pd.DataFrame: The working frame including 'label', 'window_start',
        'upper_barrier' and 'lower_barrier', indexed by "timestamp".
        """
        window = self.vertical_barrier
        origin = 0
        touch_upper = lambda high, barrier: high >= barrier
        touch_lower = lambda low, barrier: low <= barrier
        # For each observation
        for i in range(0, len(self.data)):
            # Define your barriers at the beginning of each window
            if i == origin:
                self.data = self.calculate_barriers(self.data, i, window)
                self.data.loc[i, 'window_start'] = True  # Mark the start of the window

            # Upper barrier touched (high for touch_type 'HL', otherwise close)
            # NOTE(review): when a barrier is touched but the trend is too short,
            # nothing is labeled and the window counter is not decremented.
            if touch_upper(self.data.loc[i, "high" if self.touch_type == 'HL' else 'close'], self.data.loc[i, "upper_barrier"]):
                if (i - origin > self.min_trend_days):
                    # label the observations
                    self.data = self.label_observations(self.data, origin, i, self.up_label)
                    # set new origin
                    origin = i + 1 if i + 1 < len(self.data) else i  # Check if i + 1 is within the DataFrame's index
                    # reset window
                    window = self.vertical_barrier
            # Lower barrier touched (low for touch_type 'HL', otherwise close)
            elif touch_lower(self.data.loc[i, "low" if self.touch_type == 'HL' else 'close'], self.data.loc[i, "lower_barrier"]):
                if (i - origin > self.min_trend_days):
                    # label the observations
                    self.data = self.label_observations(self.data, origin, i, self.down_label)
                    # set new origin
                    origin = i + 1 if i + 1 < len(self.data) else i  # Check if i + 1 is within the DataFrame's index
                    # reset window
                    window = self.vertical_barrier

            # none of the conditions were met
            else:
                if window > 0:
                    # reduce window size by one
                    window = window - 1
                else:
                    # reset window
                    window = self.vertical_barrier
                    # label neutral from origin to origin + window
                    self.data.loc[origin:min(origin+window, len(self.data)-1), 'label'] = self.neutral_label  # Ensure the window does not exceed the dataframe
                    # set origin to the next id
                    origin = i + 1 if i + 1 < len(self.data) else i  # Check if i + 1 is within the DataFrame's index

        self.data = self.data.set_index("timestamp")
        return self.data

In [11]:
# Configure the labeler: EWM span 8, 5-row vertical barrier, a slightly wider
# lower barrier (1.1 vs 1), and barrier width taken from the 'volatility' column.
labeler = TripleBarrierLabeler(volatility_period=8, upper_barrier_factor=1, lower_barrier_factor=1.1, vertical_barrier=5, min_trend_days=2, barrier_type='volatility')

In [12]:
# fit() prepares a working copy of price_df inside the labeler; transform()
# takes no arguments and returns that frame with the labels, indexed by timestamp.
labeler.fit(price_df)
labeled_df = labeler.transform()

In [13]:
# Class balance of the triple-barrier labels (defaults: 2 = up, 1 = neutral, 0 = down)
labeled_df.label.value_counts()

label
2    172
0    105
1     89
Name: count, dtype: int64

In [14]:
def calculate_rsi(close_series, length, threshold = 60):
    """
    Compute a rolling-mean RSI and a categorical description of it.

    Args:
        close_series (pd.Series): Closing prices.
        length (int): Rolling window used to average gains and losses.
        threshold (float): RSI above this value is tagged 'bearish'; RSI below
            ``100 - threshold`` is tagged 'bullish'; values in between
            (inclusive) are tagged 'neutral'.

    Returns:
        tuple: ``(rsi, description)``, both Series on the index of
        ``close_series``. ``description`` is NaN where the RSI is NaN.
    """
    price_change = close_series.diff()
    # Non-positive (and NaN) changes count as zero gain, non-negative as zero loss.
    upward_moves = price_change.where(price_change > 0, 0)
    downward_moves = -price_change.where(price_change < 0, 0)
    avg_gain = upward_moves.rolling(window=length).mean()
    avg_loss = downward_moves.rolling(window=length).mean()
    relative_strength = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + relative_strength))

    # Descriptive values
    lower_threshold = 100 - threshold
    description = pd.Series(index=close_series.index, dtype='object')
    description.loc[rsi > threshold] = 'bearish'
    description.loc[rsi < lower_threshold] = 'bullish'
    description.loc[(rsi <= threshold) & (rsi >= lower_threshold)] = 'neutral'

    return rsi, description

In [15]:
def calculate_returns(close_series, length, std_effect = 0.5):
    """
    Compute period-over-period returns and a categorical description of them.

    A return is 'bullish' when it exceeds ``std_effect`` times the rolling
    standard deviation of returns, 'bearish' when it is below the negative of
    that band, and 'neutral' when it lies inside the band (inclusive).

    Args:
        close_series (pd.Series): Closing prices.
        length (int): Rolling window for the standard deviation.
        std_effect (float): Fraction of the rolling std that defines the band.

    Returns:
        tuple: ``(returns, description)``, both Series on the index of
        ``close_series``. ``description`` is NaN where the band is NaN.
    """
    returns = close_series.pct_change()
    band = returns.rolling(window=length).std() * std_effect

    above_band = returns > band
    below_band = returns < -band
    inside_band = (returns <= band) & (returns >= -band)

    # Descriptive values
    description = pd.Series(index=close_series.index, dtype='object')
    description.loc[above_band] = 'bullish'
    description.loc[below_band] = 'bearish'
    description.loc[inside_band] = 'neutral'

    return returns, description

In [16]:
# Store the categorical RSI description (8-row window) as column "RSI";
# the numeric RSI series is discarded.
_, labeled_df["RSI"] = calculate_rsi(labeled_df.close, 8)

In [17]:
# Store the categorical return description (8-row window) as column "ROC";
# the numeric returns series is discarded.
_, labeled_df["ROC"] = calculate_returns(labeled_df.close, 8)

In [18]:
# Distribution of the RSI descriptions (rows without a description are not counted)
labeled_df["RSI"].value_counts()

RSI
bearish    180
neutral    102
bullish     77
Name: count, dtype: int64

In [19]:
# Distribution of the return descriptions (rows without a description are not counted)
labeled_df["ROC"].value_counts()

ROC
neutral    155
bullish    119
bearish     84
Name: count, dtype: int64

In [20]:
# The triple-barrier label of the current row becomes 'previous_label'; the
# prediction target ('next_day_label') is derived from it in the next cell.
# NOTE(review): in-place rename - earlier cells that read labeled_df.label
# will fail if re-run after this one.
labeled_df.rename(columns={'label': 'previous_label'}, inplace=True)

In [21]:
# Shift the labels such that for each day, the label is set to the next day's label
# (the last row has no successor, so both shifted columns are NaN there)
labeled_df["next_day_label"] = labeled_df.previous_label.shift(-1)
labeled_df["next_day_window_start"] = labeled_df.window_start.shift(-1)
# Force the very first row to count as a window start
labeled_df.loc[labeled_df.iloc[0].name, 'next_day_window_start'] = True

In [22]:
# Attach the targets and indicator columns to every tweet (left join on the index).
# NOTE(review): the join matches on exact index equality - assumes tweet `date`
# values coincide with the price timestamps; confirm.
merged_df = text_df.merge(
    labeled_df[["next_day_label", 'next_day_window_start', 'previous_label', 'ROC', "RSI"]], left_index=True, right_index=True, how="left"
)

In [30]:
# NOTE(review): this cell rebuilds merged_df with only the two target columns,
# replacing the wider merge from the preceding cell, and its execution count is
# out of sequence. Keep only one of the two merge cells so a fresh
# Restart & Run All produces the intended columns.
merged_df = text_df.merge(
    labeled_df[["next_day_label", 'next_day_window_start']], left_index=True, right_index=True, how="left"
)

In [23]:
# Drop tweets with any missing value, including those that found no matching
# row in labeled_df during the left join. Mutates merged_df in place.
merged_df.dropna(inplace=True)

In [24]:
# Class balance of the tweet-level target (values are floats because of the shift)
merged_df.next_day_label.value_counts()

next_day_label
2.0    31674
0.0    17247
1.0    13839
Name: count, dtype: int64

In [25]:
def undersample_tweets(df, random_state=None):
    """
    Balance the classes of ``next_day_label`` by random undersampling.

    Every class is reduced to the size of the smallest class. Classes are
    emitted in order of first appearance in ``df``.

    Args:
        df (pd.DataFrame): Data with a ``next_day_label`` column.
        random_state: Optional seed (or generator) forwarded to
            ``DataFrame.sample`` so the selection is reproducible. The default
            ``None`` gives a different sample on every call.

    Returns:
        pd.DataFrame: The undersampled rows. An input without any labelled
        row is returned as a copy.
    """
    label_col = 'next_day_label'
    trend_counts = df[label_col].value_counts()

    # Nothing to balance (empty frame or only missing labels)
    if trend_counts.empty:
        return df.copy()

    minority_count = trend_counts.min()

    # Collect one piece per class and concatenate once at the end
    pieces = []
    for trend in df[label_col].unique():
        group = df[df[label_col] == trend]
        # Only classes larger than the minority are sampled down; a missing
        # (NaN) label yields an empty group and is passed through harmlessly.
        if len(group) > minority_count:
            group = group.sample(minority_count, random_state=random_state)
        pieces.append(group)

    return pd.concat(pieces)

def sample_event_days(data, event_dates, window=1):
    """
    Sample data based on event dates with a specified window around each event.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the data with a datetime index.
    - event_dates (list): List of event dates (as strings or pd.Timestamp).
    - window (int): Number of days before and after the event to include in the sample.

    Returns:
    - pd.DataFrame: The rows of ``data`` (in their original order) whose index
      falls inside at least one ``[event - window, event + window]`` interval.
      Each row appears once even when several event windows overlap. The result
      is empty (with the columns of ``data``) when no event date is given.
    """
    span = pd.Timedelta(days=window)
    # Mark rows by index instead of concatenating slices and calling
    # drop_duplicates(): that compares column values only, so it would also drop
    # distinct rows that merely share the same content.
    in_window = np.zeros(len(data), dtype=bool)

    for event_date in event_dates:
        event_date = pd.to_datetime(event_date)
        in_window |= (data.index >= event_date - span) & (data.index <= event_date + span)

    return data.loc[in_window]

def extract_windows(df, max_windows=None):
    """
    Split the days covered by ``df`` into consecutive windows.

    The frame is reduced to one row per calendar day (the first row of each
    day). Every day whose 'next_day_window_start' flag is truthy opens a new
    window that runs up to the day before the next flagged day; the last
    window runs to the end of the data. Days before the first flagged day are
    not part of any window.

    Parameters:
    - df (pd.DataFrame): Data with a datetime index and a boolean
      'next_day_window_start' column.
    - max_windows (int | None): Upper bound on the number of windows returned;
      None returns all of them.

    Returns:
    - list[pd.DataFrame]: One per-day frame per window, in chronological
      order. Empty when no day is flagged as a window start.
    """
    days = df.groupby(df.index.date).first()
    window_origins = days[days['next_day_window_start']].index

    # Without any flagged day there is no window (previously an IndexError).
    if len(window_origins) == 0:
        return []
    if max_windows is not None and max_windows <= 0:
        return []

    # Positions of the window starts in the per-day frame; each window ends
    # right before the next start, the last one at the end of the data.
    starts = [days.index.get_loc(origin) for origin in window_origins]
    stops = starts[1:] + [len(days)]
    if max_windows is not None:
        starts = starts[:max_windows]

    return [days.iloc[start:stop] for start, stop in zip(starts, stops)]

def extract_tweets(windows, df, max_tweet_packs=None):
    """
    Build "tweet packs" for every window.

    A tweet pack holds one tweet per day of a window: pack ``i`` is made of the
    ``i``-th tweet of each day. The number of packs per window is the smallest
    per-day tweet count in that window, optionally capped by ``max_tweet_packs``.

    Parameters:
    - windows (list[pd.DataFrame]): Windows whose index holds the days they cover.
    - df (pd.DataFrame): Tweets with a datetime index and 'text' and
      'next_day_label' columns.
    - max_tweet_packs (int | None): Maximum number of packs per window.

    Returns:
    - list[list[list[pd.Series]]]: For each window, a list of packs; each pack is
      a list of rows (with 'text' and 'next_day_label') ordered by day.

    Raises:
    - KeyError: If a window contains a day for which ``df`` has no tweets.
    """
    # Group the tweets by calendar day once. Looking days up with
    # df.loc['YYYY-MM-DD'] inside the loops repeats the same scan for every
    # pack, and it returns a single row (not a frame) for a day with exactly one
    # tweet, in which case .iloc[i] yields a scalar instead of a tweet row.
    tweets_by_day = {
        day: rows for day, rows in df[["text", "next_day_label"]].groupby(df.index.date)
    }

    extracted_tweets = []
    # Add a progress bar for the outer loop
    for window in tqdm(windows, desc="Processing windows"):
        # Tweets of every day in the window, in window order
        day_rows = [tweets_by_day[pd.Timestamp(date).date()] for date in window.index]
        # Every pack needs one tweet from each day, so the day with the fewest tweets sets the limit
        pack_count = min((len(rows) for rows in day_rows), default=0)
        # Limit the number of tweet packs to extract if max_tweet_packs is specified
        if max_tweet_packs is not None:
            pack_count = min(pack_count, max_tweet_packs)
        # Pack i collects the i-th tweet of each day
        window_tweet_packs = [
            [rows.iloc[i] for rows in day_rows] for i in range(pack_count)
        ]
        extracted_tweets.append(window_tweet_packs)
    return extracted_tweets

def shuffle_tweet_packs(tweet_packs, seed=None):
    """
    Return a shuffled copy of ``tweet_packs``; the input itself is left untouched.

    When ``seed`` is given, the module-level ``random`` generator is re-seeded
    with it before shuffling, which makes the resulting order repeatable.
    """
    reseed_requested = seed is not None
    if reseed_requested:
        random.seed(seed)
    # Work on a copy so the caller's sequence keeps its order
    reordered = tweet_packs.copy()
    random.shuffle(reordered)
    return reordered

def tweet_packs_to_df(tweet_packs):
    """Flatten a list of tweet packs into one DataFrame with one row per tweet, in pack order."""
    rows = []
    for pack in tweet_packs:
        rows.extend(pack)
    return pd.DataFrame(rows)

In [26]:
def generate_tweet_prompts(df):
    """
    Add a 'prompt' column that serialises each tweet together with its context.

    For every row the prompt combines the previous label, the ROC and RSI
    values, the sentiment label and the tweet text. Label ids are spelled out
    as 2 -> "bullish", 1 -> "neutral", anything else -> "bearish".

    The column is written onto ``df`` itself, which is also returned.
    """
    def label_name(label_id):
        if label_id == 2:
            return "bullish"
        if label_id == 1:
            return "neutral"
        return "bearish"

    def build_prompt(record):
        tweet = record['text']
        previous_label = record['previous_label']
        roc = record['ROC']
        rsi = record['RSI']
        sentiment = record['sentiment_label']
        return (
            f" previous label: {label_name(previous_label)}"
            f" ROC: {roc:}"
            f" RSI: {rsi}"
            f" sentiment: {label_name(sentiment)}"
            f" tweet: {tweet}"
        )

    df['prompt'] = [build_prompt(record) for _, record in df.iterrows()]
    return df

In [77]:
# Balance the three next-day classes by undersampling to the size of the smallest one.
# NOTE(review): the sampling is random and not seeded here, so the selected rows differ between runs.
balanced_df = undersample_tweets(merged_df)

In [78]:
# Add the 'prompt' column; generate_tweet_prompts writes it onto the frame it is given and returns that frame.
balanced_df = generate_tweet_prompts(balanced_df)

In [79]:
balanced_df.sample(frac=1).iloc[20:24]

Unnamed: 0,text,sentiment_label,next_day_label,next_day_window_start,previous_label,ROC,RSI,prompt
2020-09-17,the only international and don mix you can ear...,2,0.0,False,0.0,neutral,bearish,previous label: bearish ROC: neutral RSI: bea...
2020-09-24,order your secure and eth hardware wallet only...,2,1.0,False,1.0,bullish,neutral,previous label: neutral ROC: bullish RSI: neu...
2020-12-04,cryptocurrency bitcoin follow the stock to flo...,1,1.0,False,1.0,bearish,bearish,previous label: neutral ROC: bearish RSI: bea...
2020-09-28,unknow wallet date bitcoin symbol btc detail c...,2,0.0,True,1.0,neutral,neutral,previous label: neutral ROC: neutral RSI: neu...


In [80]:
balanced_df.next_day_label.value_counts()

next_day_label
0.0    13839
2.0    13839
1.0    13839
Name: count, dtype: int64

In [81]:
# Build the modelling table: cut the balanced tweets into windows, take at most
# 3 tweet packs per window, shuffle whole packs, and flatten them into a DataFrame.
windows = extract_windows(balanced_df)
tweets = extract_tweets(windows, balanced_df, 3)
# Flatten window -> packs into one list so that packs (not single tweets) are shuffled.
flattened_tweet_packs = [tweet_pack for window in tweets for tweet_pack in window]
# NOTE(review): seed=True is forwarded to random.seed(); params["SEED"] looks like the intended value — confirm.
shuffled_tweet_packs = shuffle_tweet_packs(flattened_tweet_packs, seed=True)
shuffled_df = tweet_packs_to_df(shuffled_tweet_packs)
# Remove the intermediates from the kernel; this cell cannot be re-run without rebuilding balanced_df first.
del(balanced_df, windows, flattened_tweet_packs, shuffled_tweet_packs)

Processing windows:   0%|          | 0/77 [00:00<?, ?it/s]

In [82]:
# Calendar day of every row, taken from the datetime index; used as the grouping key
# for GroupKFold so that rows of one day are never split across train and test.
shuffled_df["day"] = shuffled_df.index.date
# `groups` follows the row order of shuffled_df at this point.
groups = shuffled_df.day

In [89]:
# Day-grouped 5-fold split of shuffled_df: `groups` is passed as the grouping key.
group_kfold = GroupKFold(n_splits=5)
train_folds = []
test_folds = []
# Split the dataset into training and evaluation sets
for train_index, test_index in group_kfold.split(shuffled_df, shuffled_df["next_day_label"], groups):
    # split() yields positional indices, hence iloc
    train_folds.append(shuffled_df.iloc[train_index])
    test_folds.append(shuffled_df.iloc[test_index])

In [97]:
# Leakage check on the first fold: the days shared by its train and test parts
# are displayed; an empty set means no day appears on both sides.
train_days = set(train_folds[0]["day"])
test_days = set(test_folds[0]["day"])
train_days.intersection(test_days)

set()

In [59]:
# Experiment configuration; the whole dict is also sent to Neptune as the run parameters.
params = {
    # Number of rows in shuffled_df at the time this cell runs
    "samples": shuffled_df.shape[0],
    # NOTE(review): SEED and TRAIN_TEST_SPLIT are not read anywhere in the visible cells — confirm they are still needed.
    "SEED":42,
    "TRAIN_TEST_SPLIT":0.2,
    # Batch size for both the training and the evaluation DataLoaders
    "TRAINING_BATCH_SIZE":7,
    "EPOCHS":2,
    "LEARNING_RATE":1e-5,
    # Used by the StratifiedKFold cell and to pre-build the `results` dict;
    # NOTE(review): the GroupKFold cells hard-code n_splits=5 instead — confirm which fold count is intended.
    "FOLDS": 3
}

In [60]:
# Build the tokenised dataset used for training: 'next_day_label' becomes the 'label' column.
shuffled_df["label"] = shuffled_df.next_day_label
# NOTE(review): HFDataset is defined elsewhere in the notebook; from_pandas/preprocess/tokenize are assumed to return a Hugging Face dataset — confirm.
labeled_ds = HFDataset.from_pandas(shuffled_df[["text", "label"]])
labeled_ds = HFDataset.preprocess(labeled_ds)
# Turn the label column into a ClassLabel feature (see the "Casting to class labels" progress output).
labeled_ds = labeled_ds.class_encode_column('label')
tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")
labeled_ds = HFDataset.tokenize(
    tokenizer, labeled_ds
)

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/356 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/356 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

In [39]:
# Shuffle the rows of the dataset (no seed is given, so the order changes between runs).
# NOTE(review): after this call the row order no longer matches shuffled_df, so any
# per-row array derived from shuffled_df (such as `groups`) is no longer aligned with labeled_ds.
labeled_ds = labeled_ds.shuffle()

In [40]:
labeled_ds

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 44019
})

In [65]:
groups

2020-11-12    2020-11-12
2020-11-13    2020-11-13
2020-11-14    2020-11-14
2020-11-15    2020-11-15
2020-11-16    2020-11-16
                 ...    
2020-12-13    2020-12-13
2020-03-25    2020-03-25
2020-03-26    2020-03-26
2020-03-27    2020-03-27
2020-03-28    2020-03-28
Name: day, Length: 356, dtype: object

In [66]:
# Day-grouped 5-fold split of the tokenised dataset.
group_kfold = GroupKFold(n_splits=5)
train_folds = []
test_folds = []
# Take each row's day from the dataset itself ('__index_level_0__' holds the
# original datetime index) instead of from `groups`: that Series follows the row
# order of shuffled_df, which stops matching labeled_ds once the dataset has been
# shuffled, and misaligned groups let tweets of one day end up in both train and test.
fold_groups = pd.to_datetime(list(labeled_ds["__index_level_0__"])).date
assert len(fold_groups) == len(labeled_ds), "one group entry is required per dataset row"
# Split the dataset into training and evaluation sets
for train_index, test_index in group_kfold.split(labeled_ds, labeled_ds["label"], fold_groups):
    train_folds.append(labeled_ds.select(train_index))
    test_folds.append(labeled_ds.select(test_index))

In [69]:
train_folds[0]

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 284
})

In [41]:
# Alternative split: stratified K-fold on the label with params["FOLDS"] splits (5 if the key is missing).
# NOTE(review): the folds are appended to the existing train_folds/test_folds lists (they are
# not reset here) and the split ignores the per-day groups — confirm whether this cell is
# still meant to run after the GroupKFold cell.
kf = StratifiedKFold(n_splits=params.get("FOLDS", 5))
for train_index, test_index in kf.split(labeled_ds, labeled_ds["label"]):
    train_folds.append(labeled_ds.select(train_index))
    test_folds.append(labeled_ds.select(test_index))

In [44]:
def init_neptune_run(name, description, params, tags=None):
    """
    Initialize a Neptune run, log the experiment parameters to it and return it.

    Args:
    name (str): Display name of the run.
    description (str): Free-text description of the run.
    params (dict): Parameters stored under the run's "parameters" namespace.
    tags (list[str] | None): Optional tags attached to the run.

    Returns:
    The initialized Neptune run object.

    The API token is read from the NEPTUNE_API_TOKEN environment variable.
    Credentials must never be committed to the notebook; any token that has
    been stored in source control should be revoked and re-issued.
    """
    run = neptune.init_run(
        project="Financial-NLP/market-aware-embedding",
        api_token=os.environ.get("NEPTUNE_API_TOKEN"),
        name=name,
        tags=tags,
        description=description,
    )

    run["parameters"] = params
    return run

In [45]:
# Container for all metrics of the experiment, pre-populated with empty slots:
#   results["base"]["fold_<k>"]                    -> metrics of the baseline evaluation
#   results["train"|"eval"]["fold_<k>"]["epoch_<e>"] -> metrics per fold and epoch
#   results["selected_epochs"]["fold_<k>"]         -> empty dict per fold, filled elsewhere (not in the visible cells)
# Only params["FOLDS"] folds and params["EPOCHS"] epochs get a slot (5 each if the key is missing).
results = {
    "params": params,
    "base": {f"fold_{fold + 1}": {} for fold in range(params.get("FOLDS", 5))},
    "train": {f"fold_{fold + 1}": {f"epoch_{epoch + 1}": {} for epoch in range(params.get("EPOCHS", 5))} for fold in range(params.get("FOLDS", 5))},
    "eval": {f"fold_{fold + 1}": {f"epoch_{epoch + 1}": {} for epoch in range(params.get("EPOCHS", 5))} for fold in range(params.get("FOLDS", 5))},
    "selected_epochs": {f"fold_{fold + 1}": {} for fold in range(params.get("FOLDS", 5))}
}

In [44]:
# Instantiate the CryptoBERT wrapper (defined elsewhere in the notebook); the progress
# output below shows config.json and pytorch_model.bin being downloaded on first use.
model = CryptoBERT()

config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
!ls -la ~/.cache/huggingface/hub

total 20
drwxr-xr-x 4 root root 4096 Aug 20 14:21 .
drwxr-xr-x 3 root root 4096 Aug 20 14:19 ..
drwxr-xr-x 3 root root 4096 Aug 20 14:21 .locks
drwxr-xr-x 6 root root 4096 Aug 20 14:21 models--ElKulako--cryptobert
-rw-r--r-- 1 root root    1 Aug 20 14:19 version.txt


In [46]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [47]:
device

device(type='cpu')

In [None]:
# Experiment name; used as the sub-directory of ./result/figure/ for the baseline evaluation figures.
exp_name="experiment_1.1"

In [None]:
# Baseline: evaluate the model as loaded (before any fine-tuning in this notebook)
# on every test fold, store the metrics in `results` and save/upload the figures.
# `tags` is passed explicitly because init_neptune_run() declares it as a
# parameter; leaving it out raises a TypeError.
# NOTE(review): the run is never stopped in this cell — consider neptune_run.stop() once it is no longer needed.
neptune_run = init_neptune_run(exp_name, "experiment 1", params, tags=None)
model_name = "base"
model.model.to(device)
# Wrap the folds (not the enumerate object) so the progress bar knows the total.
for index, test_fold in enumerate(tqdm(test_folds)):
    fold_num = index + 1
    fold_epoch_addr = f"fold_{fold_num}"
    test_dataset = TextDataset(test_fold)
    test_dataloader = DataLoader(test_dataset, batch_size=params["TRAINING_BATCH_SIZE"])
    labels, preds, probs, losses = model.evaluate(dataloader=test_dataloader, device=device, model_name=f"{model_name}_{fold_epoch_addr}", neptune_run=neptune_run)

    # Join the per-batch arrays once and reuse them for metrics and plots.
    y_true = np.concatenate(labels)
    y_pred = np.concatenate(preds)
    y_prob = np.concatenate(probs)
    results[model_name][fold_epoch_addr] = model.compute_metrics_classification(y_true, y_pred, y_prob)

    roc_path = f"./result/figure/{exp_name}/{fold_epoch_addr}/{model_name}_roc_curve.png"
    matrix_path = f"./result/figure/{exp_name}/{fold_epoch_addr}/{model_name}_matrix.png"
    model.plot_roc_curve(roc_path, y_true, y_prob)
    model.plot_confusion_matrix(matrix_path, y_true, y_pred)
    if neptune_run:
        neptune_run[f"{model_name}/{fold_epoch_addr}/roc_curve"].upload(roc_path)
        neptune_run[f"{model_name}/{fold_epoch_addr}/matrix"].upload(matrix_path)


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/Financial-NLP/market-aware-embedding/e/MAR-208


0it [00:00, ?it/s]

Evaluating Progress...:   0%|          | 0/87 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating Progress...:   0%|          | 0/87 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating Progress...:   0%|          | 0/87 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# model.model.classifier = FineTunedRobertaClassificationHead(model.model.classifier)
# Freeze the first 11 encoder layers: their parameters no longer receive gradients,
# so only the remaining encoder layers and every parameter outside `encoder.layer`
# stay trainable.
for param in model.model.roberta.encoder.layer[:11].parameters():
    param.requires_grad = False

In [None]:
# AdamW over all model parameters (frozen ones simply receive no gradient).
optimizer = torch.optim.AdamW(model.model.parameters(), lr=params["LEARNING_RATE"])
num_epochs = params["EPOCHS"]
# Schedule length: EPOCHS passes over the first training fold, in batches.
# NOTE(review): this is sized for train_folds[0] only and uses floor division, so a
# trailing partial batch per epoch is not counted — confirm this matches the training loop.
num_training_steps = (num_epochs * len(train_folds[0])) // params["TRAINING_BATCH_SIZE"]
num_warmup_steps = int(0.1 * num_training_steps)  # 10% of training steps
# Linear warm-up followed by linear decay over num_training_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

In [None]:
# Experiment name; used as the sub-directory of ./result/figure/ for the cross-validation figures.
# NOTE(review): the run started in the next cell is described as "experiment 1.1" — confirm which label is intended.
exp_name="experiment_1.2"

In [None]:
# Cross-validation: for every fold, fine-tune a fresh model for EPOCHS epochs and
# evaluate it on the matching test fold after each epoch.
# `tags` is passed explicitly because init_neptune_run() declares it as a
# parameter; leaving it out raises a TypeError.
# NOTE(review): the run is never stopped in this cell — consider neptune_run.stop() once it is no longer needed.
neptune_run = init_neptune_run(exp_name, "experiment 1.1", params, tags=None)
num_epochs = params.get("EPOCHS", 3)


def _log_split_results(split_name, fold_key, epoch_key, labels, preds, probs):
    """Store the metrics of one train/eval pass in `results` and save/upload its figures."""
    y_true = np.concatenate(labels)
    y_pred = np.concatenate(preds)
    y_prob = np.concatenate(probs)
    # setdefault: `results` only pre-creates params["FOLDS"] folds, there may be more folds than that.
    results[split_name].setdefault(fold_key, {})[epoch_key] = model.compute_metrics_classification(y_true, y_pred, y_prob)

    figure_dir = f"./result/figure/{exp_name}/{fold_key}/{epoch_key}"
    roc_path = f"{figure_dir}/{split_name}_roc_curve.png"
    matrix_path = f"{figure_dir}/{split_name}_matrix.png"
    model.plot_roc_curve(roc_path, y_true, y_prob)
    model.plot_confusion_matrix(matrix_path, y_true, y_pred)
    if neptune_run:
        neptune_run[f"{split_name}/{fold_key}/{epoch_key}/roc_curve"].upload(roc_path)
        neptune_run[f"{split_name}/{fold_key}/{epoch_key}/matrix"].upload(matrix_path)


for index, train_fold in enumerate(train_folds):
    fold_num = index + 1
    fold_key = f"fold_{fold_num}"

    # The data of a fold does not change between epochs, so build the loaders once per fold.
    train_dataset = TextDataset(train_fold)
    train_dataloader = DataLoader(train_dataset, batch_size=params["TRAINING_BATCH_SIZE"])
    test_dataset = TextDataset(test_folds[index])
    test_dataloader = DataLoader(test_dataset, batch_size=params["TRAINING_BATCH_SIZE"])

    # Every fold starts from a freshly loaded model with the first 11 encoder layers
    # frozen. Re-using one model across folds would evaluate on data it was trained
    # on in an earlier fold.
    model = CryptoBERT()
    for param in model.model.roberta.encoder.layer[:11].parameters():
        param.requires_grad = False
    model.model.to(device)

    # Fresh optimizer and schedule per fold: a single linear schedule sized for one
    # fold has decayed to a learning rate of 0 by the time the next fold starts.
    # NOTE(review): assumes model.train() steps the scheduler once per batch — confirm.
    optimizer = torch.optim.AdamW(model.model.parameters(), lr=params["LEARNING_RATE"])
    num_training_steps = num_epochs * len(train_dataloader)
    num_warmup_steps = int(0.1 * num_training_steps)  # 10% of training steps
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    for epoch in tqdm(range(num_epochs), desc="Epoch Progress..."):
        epoch_num = epoch + 1
        epoch_key = f"epoch_{epoch_num}"
        fold_epoch_addr = f"{fold_key}/{epoch_key}"

        model_name = "train"
        labels, preds, probs, losses = model.train(dataloader=train_dataloader, device=device, optimizer=optimizer, scheduler=scheduler, model_name=f"train_fold_{fold_num}", neptune_run=neptune_run)
        _log_split_results(model_name, fold_key, epoch_key, labels, preds, probs)

        model_name = "eval"
        labels, preds, probs, losses = model.evaluate(dataloader=test_dataloader, device=device, model_name=f"eval_fold_{fold_num}", neptune_run=neptune_run)
        _log_split_results(model_name, fold_key, epoch_key, labels, preds, probs)



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/Financial-NLP/market-aware-embedding/e/MAR-212


Epoch Progress...:   0%|          | 0/2 [00:00<?, ?it/s]

Training Progress...:   0%|          | 0/3809 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Evaluating Progress...:   0%|          | 0/1905 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Progress...:   0%|          | 0/3809 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating Progress...:   0%|          | 0/1905 [00:00<?, ?it/s]

Epoch Progress...:   0%|          | 0/2 [00:00<?, ?it/s]

Training Progress...:   0%|          | 0/3809 [00:00<?, ?it/s]

Evaluating Progress...:   0%|          | 0/1905 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Layer-freezing sweep on the first fold: for 11, 10, 9 and 8 frozen encoder layers,
# fine-tune a fresh model for one epoch, evaluate it, and log everything to its own Neptune run.
exp_name = "cua"
fold_num = 1   # the sweep only uses the first fold
epoch_num = 1
num_epochs = 1  # One epoch for each frozen layer configuration

# The data does not depend on the frozen-layer setting, so build the loaders once.
train_dataset = TextDataset(train_folds[fold_num - 1])
train_dataloader = DataLoader(train_dataset, batch_size=params["TRAINING_BATCH_SIZE"])
test_dataset = TextDataset(test_folds[fold_num - 1])
test_dataloader = DataLoader(test_dataset, batch_size=params["TRAINING_BATCH_SIZE"])

for frozen_layers in range(11, 7, -1):
    model = CryptoBERT()

    # Freeze the first `frozen_layers` encoder layers
    for param in model.model.roberta.encoder.layer[:frozen_layers].parameters():
        param.requires_grad = False
    model.model.to(device)

    optimizer = torch.optim.AdamW(model.model.parameters(), lr=params["LEARNING_RATE"])
    # One scheduler step per batch, including a trailing partial batch.
    # NOTE(review): assumes model.train() steps the scheduler once per batch — confirm.
    num_training_steps = num_epochs * len(train_dataloader)
    num_warmup_steps = int(0.1 * num_training_steps)  # 10% of training steps
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    # Initialize a separate Neptune run for each fold and frozen layer configuration
    neptune_run = init_neptune_run(name=f"Fold {fold_num} - Frozen Layers {frozen_layers}", description="", params=params, tags=["unaware", "cross-validation", f"fold-{fold_num}", f"frozen-{frozen_layers}"])

    fold_key = f"fold_{fold_num}"
    frozen_key = f"frozen_{frozen_layers}"
    epoch_key = f"epoch_{epoch_num}"
    fold_epoch_addr = f"{fold_key}/{frozen_key}/{epoch_key}"

    # Training
    model_name = "train"
    labels, preds, probs, losses = model.train(dataloader=train_dataloader, device=device, optimizer=optimizer, scheduler=scheduler, model_name=model_name, neptune_run=neptune_run)
    y_true, y_pred, y_prob = np.concatenate(labels), np.concatenate(preds), np.concatenate(probs)
    # `results` has no "frozen_<n>" level yet; create it on demand instead of failing with a KeyError.
    results[model_name].setdefault(fold_key, {}).setdefault(frozen_key, {})[epoch_key] = model.compute_metrics_classification(y_true, y_pred, y_prob)

    roc_path = f"./result/figure/{exp_name}/{fold_epoch_addr}/{model_name}_roc_curve.png"
    matrix_path = f"./result/figure/{exp_name}/{fold_epoch_addr}/{model_name}_matrix.png"
    model.plot_roc_curve(roc_path, y_true, y_prob)
    model.plot_confusion_matrix(matrix_path, y_true, y_pred)

    if neptune_run:
        neptune_run[f"{model_name}/{fold_epoch_addr}/roc_curve"].upload(roc_path)
        neptune_run[f"{model_name}/{fold_epoch_addr}/matrix"].upload(matrix_path)

    model.save_model(f"./model_{fold_num}_frozen_{frozen_layers}_{epoch_num}")

    # Evaluation
    model_name = "eval"
    labels, preds, probs, losses = model.evaluate(dataloader=test_dataloader, device=device, model_name=model_name, neptune_run=neptune_run)
    y_true, y_pred, y_prob = np.concatenate(labels), np.concatenate(preds), np.concatenate(probs)
    results[model_name].setdefault(fold_key, {}).setdefault(frozen_key, {})[epoch_key] = model.compute_metrics_classification(y_true, y_pred, y_prob)

    roc_path = f"./result/figure/{exp_name}/{fold_epoch_addr}/{model_name}_roc_curve.png"
    matrix_path = f"./result/figure/{exp_name}/{fold_epoch_addr}/{model_name}_matrix.png"
    model.plot_roc_curve(roc_path, y_true, y_prob)
    model.plot_confusion_matrix(matrix_path, y_true, y_pred)

    if neptune_run:
        neptune_run[f"{model_name}/{fold_epoch_addr}/roc_curve"].upload(roc_path)
        neptune_run[f"{model_name}/{fold_epoch_addr}/matrix"].upload(matrix_path)
        # Each configuration has its own run; close it so its pending data is
        # flushed before the next run is opened.
        neptune_run.stop()



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/Financial-NLP/market-aware-embedding/e/MAR-248


Training Progress...:   0%|          | 0/4193 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr