# Temporal and market aware embeddings
1. load price and textual datasets
2. prefix tweets with their time and price context
3. fine-tune the model on text with and without the context
4. compare the base model with the fine-tuned masking model on the new dataset

In [1]:
import numpy as np
import pandas as pd
from functools import partial
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset
from torch.utils.data import DataLoader, Dataset as torchDS
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from tqdm.notebook import tqdm  # Use notebook version of tqdm for better compatibility with Jupyter
from torch.optim import AdamW
import neptune
import torch

In [2]:
# Make the project root importable: the notebook is expected to run from a
# sub-directory, so the parent of the current working directory is appended
# to sys.path before importing the local `src` package.
import sys
import os
current_working_directory = os.getcwd()
sys.path.append(os.path.dirname(current_working_directory))
# NOTE(review): wildcard import; `triple_barrier_labeling` used further down
# presumably comes from here -- confirm and consider importing it explicitly.
from src.util import *

### load price and textual datasets

In [3]:
# Load the tweets, expose the pre-split text as "text", and index the frame
# by the tweet date parsed into datetimes.
text_df = (
    pd.read_csv(
        "../raw/combined_tweets_2020_labeled.csv",
        usecols=["date", "text_split"],
    )
    .rename(columns={"text_split": "text"})
    .set_index('date')
)
text_df.index = pd.to_datetime(text_df.index)

In [4]:
# Load the daily candles and index them by the timestamp column, which is
# interpreted as seconds since the Unix epoch.
price_columns = ["timestamp", "close", "open", "high", "low", "volume"]
price_df = pd.read_csv("../raw/daily-2020.csv", usecols=price_columns).set_index('timestamp')
price_df.index = pd.to_datetime(price_df.index, unit='s')

In [5]:
# Sanity check: (rows, columns) of the tweet frame and of the price frame.
print(text_df.shape, price_df.shape)

(64310, 1) (366, 5)


### prefix tweets with their time and price context
How do we define temporal and price context in this scenario?
- temporal context: the day, month and year of each tweet (e.g. 05,Mar,2020)
- price context: candidates were a moving average of the price, the trend, or the percent change; we settled on tagging the current trend using triple-barrier labeling

In [6]:
def extract_time_string(df):
    """
    Add a 'time' column holding the datetime index formatted as 'DD,Mon,YYYY'.

    The frame is modified in place and also returned, so the call can be
    used either as a statement or in an assignment.
    """
    formatted_dates = df.index.strftime('%d,%b,%Y')
    df['time'] = formatted_dates
    return df

In [7]:
def prefix_text_column(df, time_col, trend_col, text_col):
    """
    Build the context-aware version of every text.

    A new "context_aware" column is written to ``df`` with the layout
    ``time: <time> trend: <trend> text: <text>``. The time and trend values
    are converted to strings first; the text column is used as-is.

    Parameters:
    df (DataFrame): The input DataFrame (modified in place).
    time_col (str): The name of the time column.
    trend_col (str): The name of the trend column.
    text_col (str): The name of the text column.

    Returns:
    DataFrame: The same DataFrame, now carrying the "context_aware" column.
    """
    time_part = "time: " + df[time_col].astype(str)
    trend_part = " trend: " + df[trend_col].astype(str)
    text_part = " text: " + df[text_col]

    df["context_aware"] = time_part + trend_part + text_part
    return df

In [8]:
def select_equal_samples(df, n_samples, random_state=None):
    """
    Select equal numbers of tweets from each day in the dataset.

    Rows are grouped by identical index value (one group per unique date, in
    order of first appearance) and exactly ``n_samples`` rows are drawn from
    every group. Rows are drawn without replacement when the group holds at
    least ``n_samples`` rows; only smaller groups are up-sampled with
    replacement. This keeps avoidable duplicates out of the result, which
    matters when the result is later split randomly into train and test.

    Parameters:
    df (DataFrame): The input DataFrame, indexed by date.
    n_samples (int): The number of samples to select from each day.
    random_state (int | numpy.random.Generator | None): Optional seed or
        generator for reproducible sampling. ``None`` draws fresh entropy.

    Returns:
    DataFrame: The selected samples, ``n_samples`` rows per unique index
    value. An empty DataFrame when ``df`` has no rows.
    """
    # One generator shared by every group: seeding each group separately with
    # the same integer would select the same row positions in all of them.
    rng = np.random.default_rng(random_state)

    per_day_samples = []
    # groupby always yields DataFrames. Indexing with df.loc[date] would
    # collapse a day with a single tweet to a Series, and sampling from that
    # Series would draw column values instead of rows.
    for _, day_rows in df.groupby(level=0, sort=False):
        needs_replacement = len(day_rows) < n_samples
        per_day_samples.append(
            day_rows.sample(n_samples, replace=needs_replacement, random_state=rng)
        )

    if not per_day_samples:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_day_samples)

In [9]:
# Label every daily candle with the project helper (defined outside this
# notebook), then keep a human-readable copy of that same-day label.
# NOTE(review): the printed frame suggests the helper adds upper_barrier,
# lower_barrier, vertical_barrier and a numeric `label` column -- confirm
# against src.util.
price_df = triple_barrier_labeling(price_df)
price_df["text_label"] = price_df.label.map({0: 'bearish', 1: 'neutral', 2: 'bullish'})

In [10]:
# Move each row's numeric label one row up, so `label` now holds the label of
# the following row. `text_label` was derived before this shift and therefore
# still describes the row's own trend. The last row's label becomes NaN.
price_df["label"] = price_df.label.shift(-1)

In [11]:
# Drop every row that has a NaN in any column (this includes the last row,
# whose shifted label is NaN).
price_df.dropna(inplace=True)

In [12]:
# Display the labelled price frame as the cell output.
price_df

Unnamed: 0_level_0,open,high,low,close,volume,upper_barrier,lower_barrier,vertical_barrier,label,text_label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-04,7341.60,7400.00,7269.21,7350.71,92586.033,7532.141244,7169.278756,2020-01-11,2.0,neutral
2020-01-05,7350.54,7495.00,7303.00,7354.36,117765.972,7547.877907,7160.842093,2020-01-12,2.0,bullish
2020-01-06,7354.36,7808.65,7345.00,7757.39,168150.317,7961.689059,7553.090941,2020-01-13,1.0,bullish
2020-01-07,7757.74,8215.33,7733.00,8152.49,280809.162,8535.874250,7769.105750,2020-01-14,1.0,neutral
2020-01-08,8150.90,8468.42,7870.11,8059.84,321225.114,8419.621270,7700.058730,2020-01-15,2.0,neutral
...,...,...,...,...,...,...,...,...,...,...
2020-12-26,24729.99,26926.00,24507.24,26508.83,367265.555,27952.234709,25065.425291,2021-01-02,1.0,bullish
2020-12-27,26508.84,28459.84,25850.00,26305.64,540264.148,27626.927027,24984.352973,2021-01-03,0.0,neutral
2020-12-28,26301.76,27538.82,26117.10,27102.66,267563.468,28115.311395,26090.008605,2021-01-04,2.0,bearish
2020-12-29,27101.45,27441.73,25913.01,27402.83,260759.449,27912.921908,26892.738092,2021-01-05,1.0,bullish


In [13]:
# Add the formatted 'time' column (day,month,year) derived from the index.
text_df = extract_time_string(text_df)

In [14]:
# Attach the numeric label and the trend name to every tweet by matching
# index values. how='left' keeps tweets without a matching price row; their
# label columns are NaN.
labeled_df = text_df.merge(price_df[['label', 'text_label']], left_index=True, right_index=True, how='left')

In [15]:
# Build the "context_aware" column: time and trend prefixed to the tweet text.
labeled_df = prefix_text_column(labeled_df, 'time', 'text_label', 'text')

In [16]:
# Drop rows with a NaN in any column, e.g. tweets that found no price row in
# the left merge above.
labeled_df.dropna(inplace=True)

In [17]:
# Display the last rows of the labelled tweet frame as the cell output.
labeled_df.tail()

Unnamed: 0,text,time,label,text_label,context_aware
2020-12-30,want new york its first publicly available yen...,"30,Dec,2020",1.0,neutral,"time: 30,Dec,2020 trend: neutral text: want ne..."
2020-12-30,next decade of sustainable crypto innovation b...,"30,Dec,2020",1.0,neutral,"time: 30,Dec,2020 trend: neutral text: next de..."
2020-12-30,bitcoin too complete simple and earn up to tra...,"30,Dec,2020",1.0,neutral,"time: 30,Dec,2020 trend: neutral text: bitcoin..."
2020-12-30,no tie to the btc dollar ratio wonder if he wo...,"30,Dec,2020",1.0,neutral,"time: 30,Dec,2020 trend: neutral text: no tie ..."
2020-12-30,rich bitcoin cad bitcoin btc bitcoin everythin...,"30,Dec,2020",1.0,neutral,"time: 30,Dec,2020 trend: neutral text: rich bi..."


In [18]:
# Balance the dataset over time: draw the same number of tweets for every
# unique index value (date) of the labelled frame.
how_many_tweets_per_day = 100
sampled_df = select_equal_samples(labeled_df, how_many_tweets_per_day)

In [19]:
# Show how many sampled tweets carry each trend name.
sampled_df.text_label.value_counts()

text_label
bullish    15500
neutral    10600
bearish    10100
Name: count, dtype: int64

In [20]:
# Convert to a Hugging Face Dataset, keeping only the raw text, the
# context-prefixed text and the numeric label.
dataset = Dataset.from_pandas(sampled_df[['text', 'context_aware', 'label']])

In [21]:
# Split into "train" and "test"; the tokenization output below reports
# 28960 and 7240 examples, i.e. an 80/20 split.
# NOTE(review): no seed is passed, so the split presumably differs between
# runs -- confirm whether reproducibility is needed.
dataset = dataset.train_test_split(0.2)

### fine-tune the model on text with and without the context

In [22]:
def init_neptune_run(name, description, params):
    """
    Initialize a Neptune run, log the parameters to it and return it.

    Credentials are read from the environment instead of being stored in the
    notebook:

    - ``NEPTUNE_API_TOKEN``: the Neptune API token.
    - ``HTTP_PROXY`` / ``HTTPS_PROXY`` (or their lower-case forms): optional
      proxy URLs, including any proxy credentials.

    SECURITY: an API token and proxy credentials were previously committed in
    this cell. They must be treated as leaked and revoked/rotated.

    Parameters:
    name (str): Name of the run.
    description (str): Description of the run.
    params (dict): Values stored under the run's "parameters" namespace.

    Returns:
    The initialized Neptune run. The caller is responsible for stopping it.
    """
    import os  # local import keeps this cell runnable on its own

    proxies = {}
    for scheme, env_var in (("http", "HTTP_PROXY"), ("https", "HTTPS_PROXY")):
        proxy_url = os.environ.get(env_var) or os.environ.get(env_var.lower())
        if proxy_url:
            proxies[scheme] = proxy_url

    run = neptune.init_run(
        project="Financial-NLP/market-aware-embedding",
        # None lets Neptune fall back to its own credential lookup.
        api_token=os.environ.get("NEPTUNE_API_TOKEN"),
        proxies=proxies or None,
        name=name,
        description=description,
    )

    run["parameters"] = params
    return run

In [23]:
class TextDataset(torchDS):
    """
    Torch dataset wrapper around a tokenized, indexable dataset.

    Each item is a dict of tensors with the keys 'input_ids',
    'attention_mask' and 'labels'; 'labels' is read from the wrapped
    dataset's 'label' field.
    """

    def __init__(self, hf_dataset):
        self.hf_dataset = hf_dataset

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        record = self.hf_dataset[idx]
        # (output key, source field) pairs, in output order
        field_map = (
            ('input_ids', 'input_ids'),
            ('attention_mask', 'attention_mask'),
            ('labels', 'label'),
        )
        return {
            out_key: torch.tensor(record[source_key])
            for out_key, source_key in field_map
        }

In [45]:
def compute_metrics(labels, preds, probs):
    """
    Compute classification metrics from labels, predictions and probabilities.

    Parameters:
    labels (array-like): True class ids, one per sample.
    preds (array-like): Predicted class ids, one per sample.
    probs (array-like): Predicted class probabilities, one row per sample and
        one column per class.

    Returns:
    dict: 'accuracy', macro-averaged 'f1', 'precision' and 'recall',
    one-vs-rest 'roc_auc' (NaN when it is undefined for the given labels) and
    the 'conf_matrix' array.
    """
    # confusion_matrix is not part of the notebook's top-level imports.
    from sklearn.metrics import confusion_matrix

    # The arguments are used directly. The previous body overwrote them from
    # an undefined `pred` object (Trainer-style) and raised NameError.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    try:
        roc_auc = roc_auc_score(labels, probs, multi_class='ovr')
    except ValueError:
        # Raised when the labels seen so far do not cover every class, which
        # is normal for the first batches of a running evaluation.
        roc_auc = float('nan')

    conf_matrix = confusion_matrix(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'roc_auc': roc_auc,
        'conf_matrix': conf_matrix
    }

In [25]:
# Tokenize the text field in the dataset
def tokenize_function(tokenizer, examples, text_col="text", max_length=512):
    """
    Tokenize one batch of examples to a fixed length.

    Every text is padded to ``max_length`` and, if longer, truncated to it,
    so all rows have the same length and can be stacked into batches.

    Parameters:
    tokenizer: Callable tokenizer returning "input_ids" and "attention_mask".
    examples (dict): Batch of columns; must contain ``text_col`` and "label".
    text_col (str): Name of the column to tokenize.
    max_length (int): Fixed sequence length after padding/truncation.

    Returns:
    dict: "input_ids", "attention_mask" and the unchanged "label" column.
    """
    # truncation=True is required: padding alone never shortens a text that
    # is already longer than max_length.
    encoded = tokenizer(
        examples[text_col],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "label": examples["label"],
    }

In [48]:
def train_model(model, train_dataloader, epochs, learning_rate, neptune_run=None, device=None):
    """
    Fine-tune a sequence-classification model and track training metrics.

    Parameters:
    model: Model returning an output with ``loss`` and ``logits`` when called
        with input ids, an attention mask and labels.
    train_dataloader: Iterable of batches with 'input_ids', 'attention_mask'
        and 'labels' tensors.
    epochs (int): Number of passes over ``train_dataloader``.
    learning_rate (float): Learning rate for AdamW.
    neptune_run: Optional Neptune run. Running per-batch metrics are logged
        under "train/running_*" and end-of-epoch metrics under "train/*".
    device: Optional torch device; defaults to CUDA when available, else CPU.

    Returns:
    tuple: ``(model, epoch_results)``, where ``epoch_results`` maps the epoch
    index to the metrics dict computed over that epoch's training batches.
    """
    tracked_metrics = ("accuracy", "precision", "recall", "f1")
    epoch_results = {}

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in tqdm(range(epochs)):
        # The model may arrive in eval mode (freshly loaded, or left that way
        # by an evaluation pass); training mode re-enables dropout.
        model.train()

        all_labels = []
        all_preds = []
        all_probs = []
        for batch in tqdm(train_dataloader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].long().to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()

            # Keep labels, class predictions and probabilities for the metrics.
            probs = torch.nn.functional.softmax(outputs.logits.detach(), dim=-1)
            all_probs.append(probs.cpu().numpy())
            all_preds.append(torch.argmax(probs, dim=-1).cpu().numpy())
            all_labels.append(labels.cpu().numpy())

            # Running metrics over the batches seen so far in this epoch.
            # Computed only when they are logged, and kept in their own series
            # so they are not mistaken for evaluation results.
            if neptune_run is not None:
                running = compute_metrics(
                    np.concatenate(all_labels),
                    np.concatenate(all_preds),
                    np.concatenate(all_probs),
                )
                for metric in tracked_metrics:
                    neptune_run[f"train/running_{metric}"].log(running[metric])

        # Metrics over the whole epoch.
        results = compute_metrics(
            np.concatenate(all_labels),
            np.concatenate(all_preds),
            np.concatenate(all_probs),
        )
        if neptune_run is not None:
            for metric in tracked_metrics:
                neptune_run[f"train/{metric}"].log(results[metric])
        epoch_results[epoch] = results

    # Return both the trained model and the per-epoch results
    return model, epoch_results

In [28]:
def evaluate_model(model, eval_dataloader, neptune_run=None, device=None):
    """
    Evaluate a sequence-classification model on a dataloader.

    The model is switched to evaluation mode and left in it.

    Parameters:
    model: Model returning an output with ``logits`` when called with input
        ids, an attention mask and labels.
    eval_dataloader: Iterable of batches with 'input_ids', 'attention_mask'
        and 'labels' tensors.
    neptune_run: Optional Neptune run. Running per-batch metrics are logged
        under "eval/running_*"; the final metrics over the whole dataloader
        are logged once under "eval/*".
    device: Optional torch device; defaults to CUDA when available, else CPU.

    Returns:
    dict: The metrics computed over all batches of ``eval_dataloader``.
    """
    tracked_metrics = ("accuracy", "precision", "recall", "f1")

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()  # disable dropout and other training-only behaviour
    all_labels = []
    all_preds = []
    all_probs = []
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in tqdm(eval_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].long().to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # Keep labels, class predictions and probabilities for the metrics.
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            all_probs.append(probs.cpu().numpy())
            all_preds.append(torch.argmax(probs, dim=-1).cpu().numpy())
            all_labels.append(labels.cpu().numpy())

            # Running metrics over the batches seen so far. Logged to a
            # separate series so "eval/*" holds only the final result.
            if neptune_run is not None:
                running = compute_metrics(
                    np.concatenate(all_labels),
                    np.concatenate(all_preds),
                    np.concatenate(all_probs),
                )
                for metric in tracked_metrics:
                    neptune_run[f"eval/running_{metric}"].log(running[metric])

    # Final metrics over the complete dataloader.
    eval_results = compute_metrics(
        np.concatenate(all_labels),
        np.concatenate(all_preds),
        np.concatenate(all_probs),
    )
    if neptune_run is not None:
        for metric in tracked_metrics:
            neptune_run[f"eval/{metric}"].log(eval_results[metric])

    return eval_results

In [29]:
# Load the tokenizer of the pretrained checkpoint and pre-bind it into two
# batch tokenizers for Dataset.map: one reads the raw "text" column, the
# other the context-prefixed "context_aware" column.
tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")
partial_tokenize_function_text = partial(tokenize_function, tokenizer, text_col="text")
partial_tokenize_function_context = partial(tokenize_function, tokenizer, text_col="context_aware")

In [30]:
# Tokenize both splits twice: first on the raw text, then on the
# context-prefixed text (train before test in both cases).
tokenized_train_text, tokenized_test_text = (
    dataset[split].map(partial_tokenize_function_text, batched=True)
    for split in ("train", "test")
)
tokenized_train_context, tokenized_test_context = (
    dataset[split].map(partial_tokenize_function_context, batched=True)
    for split in ("train", "test")
)

Map:   0%|          | 0/28960 [00:00<?, ? examples/s]

Map:   0%|          | 0/7240 [00:00<?, ? examples/s]

Map:   0%|          | 0/28960 [00:00<?, ? examples/s]

Map:   0%|          | 0/7240 [00:00<?, ? examples/s]

In [31]:
# Wrap every tokenized split so that items come out as dicts of tensors.
(
    tokenized_train_text_dataset,
    tokenized_test_text_dataset,
    tokenized_train_context_dataset,
    tokenized_test_context_dataset,
) = (
    TextDataset(tokenized_split)
    for tokenized_split in (
        tokenized_train_text,
        tokenized_test_text,
        tokenized_train_context,
        tokenized_test_context,
    )
)

In [35]:
BATCH_SIZE = 10
# Training batches are reshuffled on every epoch so the model does not see
# the examples in the same order each time. Evaluation order is kept fixed:
# the final metrics do not depend on it, and the running metrics become
# comparable between models. (The flags were previously the other way round.)
tokenized_train_text_dataloader = DataLoader(tokenized_train_text_dataset, batch_size=BATCH_SIZE, shuffle=True)
tokenized_test_text_dataloader = DataLoader(tokenized_test_text_dataset, batch_size=BATCH_SIZE, shuffle=False)
tokenized_train_context_dataloader = DataLoader(tokenized_train_context_dataset, batch_size=BATCH_SIZE, shuffle=True)
tokenized_test_context_dataloader = DataLoader(tokenized_test_context_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [36]:
# Hyper-parameters shared by every fine-tuning run below.
learning_rate = 1e-5  # passed to AdamW in train_model
epochs = 3  # number of passes over the training dataloader

### comparing the base model with the fine-tuned masking model on the new dataset

In [37]:
# Parameters logged with every run. The batch entries are the number of
# batches per pass: a DataLoader object itself is not a loggable value.
params = {
    "learning_rate": learning_rate,
    "epochs": epochs,
    "batch_size": BATCH_SIZE,
    "train_batches": len(tokenized_train_text_dataloader),
    "test_batches": len(tokenized_test_text_dataloader),
}

In [None]:
# Experiment 1: fine-tune on the raw tweet text and evaluate on the raw test text.
run = init_neptune_run(name="base_text_model", description="base model fine-tuned on textual data without temporal or market context", params=params)
text_trained_model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert", num_labels=3)
try:
    text_trained_model, _ = train_model(text_trained_model, tokenized_train_text_dataloader, epochs, learning_rate, run)
    text_eval_results = evaluate_model(text_trained_model, tokenized_test_text_dataloader, neptune_run=run)
finally:
    # Always close the run, even when training fails; otherwise it stays
    # open until the kernel shuts down.
    run.stop()
text_eval_results

In [None]:
# Experiment 2: fine-tune on the context-prefixed text and evaluate on the
# context-prefixed test text.
run = init_neptune_run(name="temporal_context_model", description="base model fine-tuned on textual data with temporal and market context", params=params)
context_trained_model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert", num_labels=3)
try:
    context_trained_model, _ = train_model(context_trained_model, tokenized_train_context_dataloader, epochs, learning_rate, run)
    context_eval_results = evaluate_model(context_trained_model, tokenized_test_context_dataloader, neptune_run=run)
finally:
    # Always close the run, even when training fails; otherwise it stays
    # open until the kernel shuts down.
    run.stop()
context_eval_results

In [None]:
# Baseline: evaluate the pretrained checkpoint on the raw test text without
# any fine-tuning.
run = init_neptune_run(name="base_model", description="base model without fine-tuning", params=params)
base_model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert", num_labels=3)
try:
    base_eval_results = evaluate_model(base_model, tokenized_test_text_dataloader, neptune_run=run)
finally:
    # Always close the run, even when evaluation fails.
    run.stop()
base_eval_results