In [None]:
import os
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import torch

# Disable the Weights & Biases integration: WANDB_DISABLED is set to "true" here.
# Keep this in sync with the `report_to` value handed to TrainingArguments.
os.environ["WANDB_DISABLED"] = "true"

def load_dataset_and_preprocess(dataset_name="zeroshot/twitter-financial-news-sentiment"):
    """Download a dataset and tokenize its ``text`` column.

    Args:
        dataset_name: Identifier passed straight to ``load_dataset``.

    Returns:
        A ``(tokenized_datasets, tokenizer)`` tuple: the result of mapping the
        tokenizer over the loaded dataset in batches, and the
        ``distilbert-base-uncased`` tokenizer that produced it.
    """
    raw_splits = load_dataset(dataset_name)
    # DistilBERT tokenizer, chosen for efficiency
    bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    # Every example is padded to the tokenizer's maximum length and truncated
    # if it is longer.
    encoded_splits = raw_splits.map(
        lambda batch: bert_tokenizer(batch["text"], padding="max_length", truncation=True),
        batched=True,
    )
    return encoded_splits, bert_tokenizer

In [2]:
def fine_tune_model(tokenized_datasets, tokenizer, model_name="distilbert-base-uncased", num_labels=3, report_to=None):
    """Fine-tunes a sequence-classification model on the tokenized dataset.

    Args:
        tokenized_datasets: Mapping of split name to tokenized dataset. The
            "train" split is used for training and "validation" for the
            per-epoch evaluation.
        tokenizer: Tokenizer handed to the ``Trainer``.
        model_name: Checkpoint name given to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels: Number of output classes for the classification head.
        report_to: Logging integration(s) forwarded to ``TrainingArguments``.
            When ``None`` (default) the value is derived from the environment:
            ``"none"`` if ``WANDB_DISABLED`` holds a truthy value, otherwise
            ``["wandb"]``.

    Returns:
        The model object after ``trainer.train()`` has completed.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    if report_to is None:
        # Requesting the wandb reporter while WANDB_DISABLED is set is
        # contradictory, so honour the environment switch instead of
        # hard-coding the reporter.
        wandb_disabled = os.environ.get("WANDB_DISABLED", "").upper() in {"1", "ON", "YES", "TRUE"}
        report_to = "none" if wandb_disabled else ["wandb"]

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="./results",
        run_name="financial_sentiment_experiment",  # Set unique run name
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,  # Show progress every 50 steps
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        report_to=report_to,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],  # this dataset ships "train"/"validation" splits
        tokenizer=tokenizer,
    )

    trainer.train()
    return model

def analyze_sentiment(model, tokenizer, texts, labels=None):
    """Analyzes sentiment for financial news texts using the fine-tuned model.

    Args:
        model: Sequence-classification model whose forward pass returns an
            object with a ``logits`` attribute.
        tokenizer: Callable that turns ``texts`` into a mapping of tensors
            when called with ``return_tensors="pt"``.
        texts: A string or a list of strings to classify.
        labels: Optional sequence mapping class index to label name. Defaults
            to ``("negative", "positive", "neutral")``, which follows the
            class ids of the zeroshot/twitter-financial-news-sentiment
            dataset (0 = Bearish, 1 = Bullish, 2 = Neutral).

    Returns:
        A list with one label name per input text (empty for empty input).
    """
    if labels is None:
        labels = ("negative", "positive", "neutral")
    if isinstance(texts, (list, tuple)) and not texts:
        # Nothing to classify; skip the tokenizer and the forward pass.
        return []

    # After training the model may live on an accelerator; the inputs must be
    # moved to the same device or the forward pass fails.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")

    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Run in eval mode so dropout does not perturb predictions, then restore
    # whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # argmax over logits equals argmax over softmax probabilities.
    return [labels[idx] for idx in logits.argmax(dim=-1).tolist()]

In [None]:
# Load and tokenize the dataset, then fine-tune the classifier on it
dataset_name = "zeroshot/twitter-financial-news-sentiment"
tokenized_datasets, tokenizer = load_dataset_and_preprocess(dataset_name=dataset_name)
fine_tuned_model = fine_tune_model(tokenized_datasets=tokenized_datasets, tokenizer=tokenizer)

# Sample headlines to classify
financial_news = [
    "Federal Reserve signals rate hikes to continue amid inflation concerns",
    "Tech stocks surge as earnings beat expectations",
    "Oil prices fall as demand weakens in Asia",
]

# Classify the headlines with the fine-tuned model
results = analyze_sentiment(model=fine_tuned_model, tokenizer=tokenizer, texts=financial_news)

# Show each headline next to its predicted label
for news, sentiment in zip(financial_news, results):
    print(f"News: {news}\nPredicted Sentiment: {sentiment}\n")

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked,