In [2]:
import os
import torch
from transformers import OPTForSequenceClassification, AutoTokenizer, AdamW
import wandb
import json
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# =============================================================================
# DIRECTORIES
# =============================================================================

# Base directories
try:
    # Script execution: anchor every path on the location of this file.
    program_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined (e.g. an interactive session), so fall back to
    # resolving the base directory relative to the current working directory.
    base_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
else:
    base_dir = os.path.abspath(os.path.join(program_dir, "../../../08 Models"))

# Script location for the OPT model and the data root that sits next to base_dir
opt_dir = os.path.join(
    base_dir,
    "01 Scripts",
    "07 OPT Model",
    "OPT",
)
files_dir = os.path.join(
    os.path.dirname(base_dir),
    "02 Data",
)

# Locations of the trainable data sets
train_data = os.path.join(
    files_dir,
    "06 Trainable data",
    "generic trainble data",
)
synthetic_dir = os.path.join(
    files_dir,
    "06 Trainable data",
    "Synthetic Instructions",
)
# Pre-tokenized examples, stored as one JSON object per line (JSONL)
opt_tokenized_path = os.path.join(
    files_dir,
    "06 Trainable data",
    "OPT tokenized data",
    "OPT_tokenized_data.jsonl",
)

# =============================================================================
# MODEL AND TRAINING PARAMETERS
# =============================================================================

# Weights & Biases project under which every run is recorded
WANDB_PROJECT = "India-stocks"

# Hugging Face checkpoint that fine-tuning starts from
MODEL_NAME = "facebook/opt-125m"

# Number of token positions per example (longer inputs are truncated,
# shorter ones are padded up to this length)
MAX_LENGTH = 2048

# Hyperparameter grid: one training run is launched for every combination
EPOCHS_LIST = [3, 4]  # number of passes over the training split
BATCH_SIZES = [8, 16]  # examples per optimisation step
LEARNING_RATES = [5e-5, 3e-5, 1e-5]  # optimizer step sizes to compare

# =============================================================================
# DATASET CLASS
# =============================================================================

class StockPredictionDataset(Dataset):
    """
    Map-style dataset over pre-tokenized examples stored as JSON Lines.

    Each non-empty line of the file should be a JSON object containing at least:
      - "tokens": a list of integer token ids
      - "return_label": the binary classification label

    Lines that are not valid JSON, or that do not provide both keys, are
    reported on stdout and skipped while loading, so a bad record cannot
    abort training in the middle of an epoch.

    Args:
        data_path: Path of the JSONL file to read.
        max_length: Fixed length every example is truncated or padded to.
        pad_token_id: Token id written into padded positions. Defaults to 1,
            the ``<pad>`` id of the OPT tokenizer. OPTForSequenceClassification
            uses the configured pad id to locate the last real token of each
            row, so the padding value must match the model's pad token.
    """

    def __init__(self, data_path, max_length, pad_token_id=1):
        self.max_length = max_length
        self.pad_token_id = pad_token_id
        self.data = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()  # Remove leading/trailing whitespace
                if not line:  # Skip empty lines
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Skipping malformed JSON: {line} - Error: {e}")
                    continue
                # Valid JSON is not enough: __getitem__ needs both keys.
                if not (isinstance(record, dict)
                        and "tokens" in record
                        and "return_label" in record):
                    print(f"Skipping line {line_no}: expected an object with "
                          f"'tokens' and 'return_label'")
                    continue
                self.data.append(record)

    def __len__(self):
        """Return the number of usable examples that were loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Build the fixed-length tensors for example ``idx``.

        Returns:
            A dict with "input_ids" and "attention_mask" (1-D long tensors of
            length ``max_length``; the mask is 1 on real tokens and 0 on
            padding) and "labels" (a scalar long tensor).
        """
        item = self.data[idx]
        tokens = item["tokens"][:self.max_length]  # Truncate if necessary
        num_real = len(tokens)

        # Start from an all-padding row and copy the real tokens to the front.
        input_ids = torch.full((self.max_length,), self.pad_token_id, dtype=torch.long)
        input_ids[:num_real] = torch.tensor(tokens, dtype=torch.long)

        attention_mask = torch.zeros(self.max_length, dtype=torch.long)
        attention_mask[:num_real] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": torch.tensor(item["return_label"], dtype=torch.long),
        }

# =============================================================================
# EVALUATION FUNCTION
# =============================================================================

def evaluate(model, dataloader, device):
    """
    Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        model: Sequence-classification model whose output exposes ``loss`` and
            ``logits`` when called with input ids, an attention mask and labels.
        dataloader: Iterable of batches, each a dict with "input_ids",
            "attention_mask" and "labels" tensors. It does not need to
            support ``len()``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``. ``avg_loss`` is
        the mean of the per-batch losses; precision, recall and F1 use binary
        averaging with undefined values reported as 0. If ``dataloader``
        yields no batches, the loss is NaN and all metrics are 0.0.

    Note:
        The model is always left in training mode on exit (also when an
        exception is raised), so a training loop can simply continue after
        calling this function.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    predictions_list = []
    labels_list = []
    try:
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                total_loss += outputs.loss.item()
                num_batches += 1

                preds = torch.argmax(outputs.logits, dim=1)
                predictions_list.extend(preds.tolist())
                labels_list.extend(labels.tolist())
    finally:
        # Hand the model back ready for further training.
        model.train()

    if num_batches == 0:
        # Nothing to score (e.g. an empty validation split): avoid dividing
        # by zero and report a loss that cannot be mistaken for a real one.
        return float("nan"), 0.0, 0.0, 0.0, 0.0

    avg_loss = total_loss / num_batches
    accuracy = accuracy_score(labels_list, predictions_list)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels_list, predictions_list, average='binary', zero_division=0
    )
    return avg_loss, accuracy, precision, recall, f1

# =============================================================================
# MAIN EXPERIMENTATION LOOP
# =============================================================================

# Fixed seed for the train/val/test split so that every hyperparameter
# combination is trained and scored on exactly the same examples.
_SPLIT_SEED = 42


def _train_one_epoch(model, loader, optimizer, device, desc):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    # from_pretrained() returns the model in evaluation mode, so training mode
    # must be enabled explicitly; otherwise the first epoch runs without dropout.
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(loader, desc=desc):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    return total_loss / max(num_batches, 1)


def _run_experiment(lr, batch_size, epochs, splits, device):
    """Train, validate, test and save one model for a single hyperparameter combination."""
    train_dataset, val_dataset, test_dataset = splits

    # Start a WandB run that records this combination's hyperparameters
    wandb.init(
        project=WANDB_PROJECT,
        config={
            "learning_rate": lr,
            "batch_size": batch_size,
            "epochs": epochs,
            "max_length": MAX_LENGTH,
            "model_name": MODEL_NAME,
        },
        name=f"lr_{lr}_bs_{batch_size}_epochs_{epochs}"
    )
    try:
        # Fresh tokenizer and model for every run so runs do not influence each other
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = OPTForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
        model.to(device)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # PyTorch's AdamW replaces the deprecated transformers.AdamW; eps and
        # weight_decay are set to the values the transformers class used by default.
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

        for epoch in range(epochs):
            avg_train_loss = _train_one_epoch(
                model, train_loader, optimizer, device, desc=f"Epoch {epoch+1}/{epochs}"
            )
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss}")

            # Validation evaluation
            val_loss, val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, val_loader, device)
            print(f"Epoch {epoch+1}/{epochs} - Val Loss: {val_loss}, Accuracy: {val_accuracy}, "
                  f"Precision: {val_precision}, Recall: {val_recall}, F1: {val_f1}")
            wandb.log({
                "train_loss": avg_train_loss,
                "val_loss": val_loss,
                "val_accuracy": val_accuracy,
                "val_precision": val_precision,
                "val_recall": val_recall,
                "val_f1": val_f1,
                "epoch": epoch+1
            })

        # Test evaluation after training
        test_loss, test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader, device)
        print(f"Test Loss: {test_loss}, Accuracy: {test_accuracy}, Precision: {test_precision}, "
              f"Recall: {test_recall}, F1: {test_f1}")
        wandb.log({
            "test_loss": test_loss,
            "test_accuracy": test_accuracy,
            "test_precision": test_precision,
            "test_recall": test_recall,
            "test_f1": test_f1
        })

        # Save model and tokenizer
        model_save_path = f"./opt_stock_model_lr_{lr}_bs_{batch_size}_epochs_{epochs}"
        model.save_pretrained(model_save_path)
        tokenizer.save_pretrained(model_save_path)
        wandb.save(f"{model_save_path}/*")
    finally:
        # Close the run even when training fails or is interrupted, so the
        # next wandb.init() does not collide with a run that is still open.
        wandb.finish()


def main():
    """
    Fine-tune the OPT classifier once per hyperparameter combination.

    The tokenized dataset is read a single time and split 60/20/20 into
    train/validation/test subsets with a fixed seed, so all combinations of
    LEARNING_RATES, BATCH_SIZES and EPOCHS_LIST are compared on the same data.
    Each combination is logged as its own WandB run and its model and
    tokenizer are saved to a directory named after the hyperparameters.

    Raises:
        ValueError: If the dataset is too small to produce a non-empty
            training split.
    """
    from itertools import product

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load and split the dataset once; it does not depend on the hyperparameters
    dataset = StockPredictionDataset(opt_tokenized_path, MAX_LENGTH)
    total_size = len(dataset)
    train_size = int(0.6 * total_size)
    val_size = int(0.2 * total_size)
    test_size = total_size - train_size - val_size
    if train_size == 0:
        raise ValueError(
            f"Dataset at {opt_tokenized_path} has {total_size} usable example(s); "
            f"too few to build a training split"
        )
    splits = random_split(
        dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(_SPLIT_SEED),
    )

    for lr, batch_size, epochs in product(LEARNING_RATES, BATCH_SIZES, EPOCHS_LIST):
        _run_experiment(lr, batch_size, epochs, splits, device)


if __name__ == "__main__":
    main()


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 741/741 [5:41:08<00:00, 27.62s/it]  


Epoch 1/3 - Train Loss: 0.7004794223829802


KeyboardInterrupt: 