In [None]:
# === Data Loading and Preparation ===
!pip install praw
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW
import torch.nn as nn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
import praw
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import requests
from bs4 import BeautifulSoup


# Loading training datasets
# Each outlet ships its own CSV; only the two columns they share are stacked
# into the combined frame used by the rest of the notebook.
cnbc_df, guardian_df, reuters_df = (
    pd.read_csv(csv_name)
    for csv_name in ("cnbc_sentiment.csv", "guardian_sentiment.csv", "reuters_sentiment.csv")
)
shared_columns = ['Headlines', 'Sentiment']
df = pd.concat([source[shared_columns] for source in (cnbc_df, guardian_df, reuters_df)])
# Cast to int so the equality filters and the "label - 1" shift downstream operate on integers.
df['Sentiment'] = df['Sentiment'].astype(int)

# Splitting datasets (resampled variant)
# The hold-out split is taken BEFORE oversampling. Sampling with replacement
# first and splitting afterwards puts copies of the same headline on both
# sides of the split, which leaks training rows into validation and inflates
# every validation metric.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Sentiment']
)
val_texts = val_df['Headlines'].tolist()
val_labels = val_df['Sentiment'].tolist()

# Oversampling minority classes (training rows only)
# Classes 2, 3 and 4 are resampled with replacement up to the size of the
# largest training class; classes 1 and 5 are kept at their natural size.
# NOTE(review): class 5 is therefore still smaller than the others — confirm
# whether it should be upsampled as well.
minority_classes = [2, 3, 4]
max_count = train_df['Sentiment'].value_counts().max()
oversampled_dfs = [train_df[train_df['Sentiment'] == 1], train_df[train_df['Sentiment'] == 5]]
for sentiment in minority_classes:
    class_df = train_df[train_df['Sentiment'] == sentiment]
    oversampled_df = class_df.sample(max_count, replace=True, random_state=42)
    oversampled_dfs.append(oversampled_df)
# ignore_index gives the resampled frame a clean, unique index.
df_balanced = pd.concat(oversampled_dfs, ignore_index=True)
train_texts = df_balanced['Headlines'].tolist()
train_labels = df_balanced['Sentiment'].tolist()

# Split of the un-resampled data, used by the class-weighted models
train_texts_full, val_texts_full, train_labels_full, val_labels_full = train_test_split(
    df['Headlines'].tolist(),
    df['Sentiment'].tolist(),
    test_size=0.2,
    random_state=42
)

# Defining SentimentDataset (it's shared across models)
class SentimentDataset(Dataset):
    """Torch dataset serving headlines as transformer encodings or as index sequences.

    Args:
        texts: sequence of headline strings.
        labels: sequence of integer sentiment scores; each is shifted down by
            one so it can be used directly as a class index.
        tokenizer: optional callable tokenizer (Hugging Face style). When given
            it takes precedence over ``word_to_idx``.
        word_to_idx: optional token -> index mapping for the word-index path;
            tokens missing from the mapping become index 0.
        max_length: fixed length every sequence is padded/truncated to.

    Raises:
        ValueError: from ``__getitem__`` when neither ``tokenizer`` nor
            ``word_to_idx`` was supplied.
    """

    def __init__(self, texts, labels, tokenizer=None, word_to_idx=None, max_length=64):
        self.texts = texts
        self.labels = [label - 1 for label in labels]
        self.tokenizer = tokenizer
        self.word_to_idx = word_to_idx
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _pad_or_truncate(indices, max_length, pad_value=0):
        """Return ``indices`` cut or right-padded to exactly ``max_length`` items."""
        trimmed = list(indices[:max_length])
        return trimmed + [pad_value] * (max_length - len(trimmed))

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # "is not None" rather than truthiness, so an empty vocab dict still
        # selects the word-index path instead of raising.
        if self.tokenizer is not None:
            encoding = self.tokenizer(
                text,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            return {
                # return_tensors='pt' adds a leading batch axis of size 1; drop it.
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': torch.tensor(label, dtype=torch.long)
            }
        if self.word_to_idx is not None:
            tokens = word_tokenize(text.lower())
            # The previous one-liner extended a full-length sequence with itself
            # (yielding 2 * max_length items); every item must have the same
            # length for the default collate function to stack a batch.
            indices = self._pad_or_truncate(
                [self.word_to_idx.get(token, 0) for token in tokens],
                self.max_length,
            )
            return {
                'input_ids': torch.tensor(indices, dtype=torch.long),
                'labels': torch.tensor(label, dtype=torch.long)
            }
        raise ValueError("Must provide tokenizer or word_to_idx")

# Vocabulary for LSTM
# Ranks 1..10000 go to the most frequent lower-cased tokens; index 0 is
# reserved for '<unk>'. One vocabulary per training frame.
all_words = []
for headline in df_balanced['Headlines']:
    all_words.extend(word_tokenize(headline.lower()))
vocab = {}
for rank, (token, _count) in enumerate(Counter(all_words).most_common(10000), start=1):
    vocab[token] = rank
vocab['<unk>'] = 0

all_words_full = []
for headline in df['Headlines']:
    all_words_full.extend(word_tokenize(headline.lower()))
vocab_full = {}
for rank, (token, _count) in enumerate(Counter(all_words_full).most_common(10000), start=1):
    vocab_full[token] = rank
vocab_full['<unk>'] = 0

# Dataset info
for banner, frame in (
    ("=== Original Class Distribution ===", df),
    ("\n=== Balanced Class Distribution ===", df_balanced),
):
    print(banner)
    print(frame['Sentiment'].value_counts().sort_index())
print("\n=== Dataset Size ===")
print(f"Original rows: {df.shape[0]}, Balanced rows: {df_balanced.shape[0]}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


=== Original Class Distribution ===
Sentiment
1    29541
2     3107
3     4278
4     5093
5    11351
Name: count, dtype: int64

=== Balanced Class Distribution ===
Sentiment
1    29541
2    29541
3    29541
4    29541
5    11351
Name: count, dtype: int64

=== Dataset Size ===
Original rows: 53370, Balanced rows: 129515


In [None]:
# === DistilBERT Model and Combined Metrics ===

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Mixed precision (autocast + GradScaler) is only switched on for CUDA; on a
# CPU runtime both become explicit no-ops.
use_amp = device.type == "cuda"

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5).to(device)
model_weighted = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5).to(device)


def _train_distilbert(net, loader, optim, grad_scaler, loss_fn, run_name, checkpoint_path, epochs=5):
    """Fine-tune ``net`` on ``loader`` and save its state dict to ``checkpoint_path``.

    Prints the mean batch loss after every epoch, prefixed with ``run_name``.
    Gradients are unscaled before clipping so the max-norm of 1.0 applies to
    the true gradient magnitudes.
    """
    net.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in loader:
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = net(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
            grad_scaler.scale(loss).backward()
            grad_scaler.unscale_(optim)
            torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
            grad_scaler.step(optim)
            grad_scaler.update()
            total_loss += loss.item()
        avg_loss = total_loss / len(loader)
        print(f"{run_name} Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
    # The real-time inference cell loads this file; without it the fine-tuned
    # weights are lost as soon as the model variable is re-created.
    torch.save(net.state_dict(), checkpoint_path)


# Training setup ("Undersampled" run — note it trains on the resampled df_balanced rows)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)
class_counts = df_balanced['Sentiment'].value_counts().sort_index()
# Inverse-frequency class weights, ordered by class label.
class_weights = torch.tensor([1.0 / count for count in class_counts], dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)

# Training loop at 5 epochs
_train_distilbert(model, train_loader, optimizer, scaler, criterion,
                  "DistilBERT Undersampled", 'distilbert_under_model.pth')

# Training setup (Class-Weighted, trained on the un-resampled split)
optimizer_weighted = AdamW(model_weighted.parameters(), lr=1e-5, weight_decay=0.01)
scaler_weighted = torch.amp.GradScaler('cuda', enabled=use_amp)
class_counts_full = df['Sentiment'].value_counts().sort_index()
class_weights_full = torch.tensor([1.0 / count for count in class_counts_full], dtype=torch.float).to(device)
criterion_weighted = nn.CrossEntropyLoss(weight=class_weights_full)

train_dataset_full = SentimentDataset(train_texts_full, train_labels_full, tokenizer=tokenizer)
train_loader_full = DataLoader(train_dataset_full, batch_size=16, shuffle=True, num_workers=2)

# Training loop
_train_distilbert(model_weighted, train_loader_full, optimizer_weighted, scaler_weighted, criterion_weighted,
                  "DistilBERT Class-Weighted", 'distilbert_weighted_model.pth')

# Combined evaluation function
def evaluate_model(model, tokenizer, texts, labels, model_name):
    """Score a sequence-classification transformer and print combined metrics.

    Two views of the same predictions are reported:
      * regression — the probability-weighted expected score (class index + 1)
        compared with the true score via MSE / MAE / R²;
      * classification — the argmax class compared with the true class via
        accuracy and weighted precision / recall / F1.

    Args:
        model: model whose forward returns an object with ``.logits``.
        tokenizer: tokenizer handed to ``SentimentDataset``.
        texts: evaluation texts.
        labels: true scores; ``SentimentDataset`` shifts them down by one.
        model_name: label used in the printed report header.

    Returns:
        None. Results are printed.
    """
    eval_dataset = SentimentDataset(texts, labels, tokenizer=tokenizer)
    eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False, num_workers=2)
    use_amp = device.type == 'cuda'
    model.eval()
    all_preds_regression, all_preds_classification, true_classes = [], [], []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(input_ids, attention_mask=attention_mask)
            # Logits can be half precision under autocast; softmax and the
            # expectation are computed in float32 to avoid rounding the scores.
            probabilities = torch.softmax(outputs.logits.float(), dim=1).cpu().numpy()
            class_indices = np.arange(probabilities.shape[1])
            all_preds_regression.extend(probabilities @ class_indices + 1)
            all_preds_classification.extend(np.argmax(probabilities, axis=1))
            # Kept under its own name so the ``labels`` argument is not shadowed.
            true_classes.extend(int(cls) for cls in batch['labels'].numpy())
    all_labels = [cls + 1 for cls in true_classes]
    # Regression metrics
    mse = mean_squared_error(all_labels, all_preds_regression)
    mae = mean_absolute_error(all_labels, all_preds_regression)
    r2 = r2_score(all_labels, all_preds_regression)
    # Classification metrics (zero_division=0 keeps the same value sklearn
    # would report for a never-predicted class, without the warning)
    accuracy = accuracy_score(true_classes, all_preds_classification)
    precision = precision_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    recall = recall_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    f1 = f1_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    print(f"\n=== {model_name} Combined Metrics ===")
    print("Regression Metrics:")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    print("Classification Metrics:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print("Sample Predictions vs True Labels:")
    for i in range(min(5, len(all_preds_regression))):
        print(f"Pred: {all_preds_regression[i]:.2f}, True: {all_labels[i]:.2f}")

# Evaluate both models, each on the validation split that matches its training data
for net, eval_texts, eval_labels, run_name in (
    (model, val_texts, val_labels, "DistilBERT Undersampled"),
    (model_weighted, val_texts_full, val_labels_full, "DistilBERT Class-Weighted"),
):
    evaluate_model(net, tokenizer, eval_texts, eval_labels, run_name)

In [None]:
# === MiniLM Model and Combined Metrics ===

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Mixed precision is only switched on for CUDA; on CPU autocast and the
# gradient scaler are explicit no-ops.
use_amp = device.type == "cuda"

# Loading tokenizer and the model
tokenizer_mini = AutoTokenizer.from_pretrained('microsoft/MiniLM-L12-H384-uncased')
model_mini = AutoModelForSequenceClassification.from_pretrained('microsoft/MiniLM-L12-H384-uncased', num_labels=5).to(device)

# Training setup
optimizer_mini = AdamW(model_mini.parameters(), lr=2e-5, weight_decay=0.01)
scaler_mini = torch.amp.GradScaler('cuda', enabled=use_amp)
class_counts = df_balanced['Sentiment'].value_counts().sort_index()
# Inverse-frequency class weights, ordered by class label.
class_weights = torch.tensor([1.0 / count for count in class_counts], dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer=tokenizer_mini)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)

# Training loop fixed at 5 epochs
model_mini.train()
for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        optimizer_mini.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            outputs = model_mini(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
        scaler_mini.scale(loss).backward()
        # Unscale before clipping so max_norm applies to the true gradients.
        scaler_mini.unscale_(optimizer_mini)
        torch.nn.utils.clip_grad_norm_(model_mini.parameters(), max_norm=1.0)
        scaler_mini.step(optimizer_mini)
        scaler_mini.update()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"MiniLM Undersampled Epoch {epoch+1}/5, Loss: {avg_loss:.4f}")

# Persist the fine-tuned weights: the real-time inference cell loads this file
# and would otherwise fall back to an untrained classification head.
torch.save(model_mini.state_dict(), 'minilm_model.pth')

# Combined evaluation function
def evaluate_model(model, tokenizer, texts, labels, model_name):
    """Score a sequence-classification transformer and print combined metrics.

    The expected score (probability-weighted class index + 1) is compared with
    the true score through MSE / MAE / R²; the argmax class is compared with
    the true class through accuracy and weighted precision / recall / F1.

    Args:
        model: model whose forward returns an object with ``.logits``.
        tokenizer: tokenizer handed to ``SentimentDataset``.
        texts: evaluation texts.
        labels: true scores; ``SentimentDataset`` shifts them down by one.
        model_name: label used in the printed report header.

    Returns:
        None. Results are printed.
    """
    eval_dataset = SentimentDataset(texts, labels, tokenizer=tokenizer)
    eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False, num_workers=2)
    use_amp = device.type == 'cuda'
    model.eval()
    all_preds_regression, all_preds_classification, true_classes = [], [], []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(input_ids, attention_mask=attention_mask)
            # Softmax in float32: autocast can hand back half-precision logits.
            probabilities = torch.softmax(outputs.logits.float(), dim=1).cpu().numpy()
            class_indices = np.arange(probabilities.shape[1])
            all_preds_regression.extend(probabilities @ class_indices + 1)
            all_preds_classification.extend(np.argmax(probabilities, axis=1))
            # Separate name so the ``labels`` argument is not overwritten per batch.
            true_classes.extend(int(cls) for cls in batch['labels'].numpy())
    all_labels = [cls + 1 for cls in true_classes]

    # Regression metrics
    mse = mean_squared_error(all_labels, all_preds_regression)
    mae = mean_absolute_error(all_labels, all_preds_regression)
    r2 = r2_score(all_labels, all_preds_regression)

    # Classification metrics (zero_division=0: same value as the default, no warning)
    accuracy = accuracy_score(true_classes, all_preds_classification)
    precision = precision_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    recall = recall_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    f1 = f1_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    print(f"\n=== {model_name} Combined Metrics ===")
    print("Regression Metrics:")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    print("Classification Metrics:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print("Sample Predictions vs True Labels:")
    for i in range(min(5, len(all_preds_regression))):
        print(f"Pred: {all_preds_regression[i]:.2f}, True: {all_labels[i]:.2f}")

# Model evaluation
# Prints the regression and classification report for the fine-tuned MiniLM
# on the held-out validation texts/labels (val_texts / val_labels).
evaluate_model(model_mini, tokenizer_mini, val_texts, val_labels, "MiniLM Undersampled")

In [None]:
# === LSTM Model and Combined Metrics ===

# Downloading GloVe embeddings which give better word representations than random initialization
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d glove

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading GloVe embeddings from the downloaded file.
# The 100-dimensional vectors are used, kept in a plain dict (word -> float32
# vector) for constant-time lookup while the matrices below are filled.
glove_path = 'glove/glove.6B.100d.txt'
embedding_dim = 100
embeddings_index = {}
with open(glove_path, encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        # First field is the word, the remaining fields are its coefficients.
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')


def _glove_matrix(word_to_idx):
    """Build a (len(word_to_idx) + 1, embedding_dim) matrix of GloVe rows.

    Rows for words that have no GloVe vector stay all-zero.
    """
    matrix = np.zeros((len(word_to_idx) + 1, embedding_dim))
    for token, row in word_to_idx.items():
        vector = embeddings_index.get(token)
        if vector is not None:
            matrix[row] = vector
    return matrix


# These matrices map each vocabulary index to its GloVe vector
embedding_matrix = _glove_matrix(vocab)
embedding_matrix_full = _glove_matrix(vocab_full)

# Defining my LSTM model class
class LSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM classifier over pre-trained word embeddings.

    Args:
        embedding_dim: width of each embedding vector (LSTM input size).
        hidden_dim: hidden size per LSTM direction.
        output_dim: number of output logits.
        vocab_size: accepted for call-site symmetry; the embedding table size
            is taken from ``embedding_matrix`` itself.
        embedding_matrix: array-like of shape (rows, embedding_dim) used to
            initialise the embedding layer, which stays trainable.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, embedding_matrix):
        super().__init__()
        pretrained_weights = torch.tensor(embedding_matrix, dtype=torch.float)
        # freeze=False: the pre-trained vectors are fine-tuned with the model.
        self.embedding = nn.Embedding.from_pretrained(pretrained_weights, freeze=False)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text):
        """Map a (batch, seq) tensor of token indices to (batch, output_dim) logits."""
        token_vectors = self.dropout(self.embedding(text))
        _, (final_hidden, _) = self.lstm(token_vectors)
        # The last two entries are the final states of the two directions.
        sentence_vector = torch.cat([final_hidden[-2], final_hidden[-1]], dim=1)
        return self.fc(self.dropout(sentence_vector))

# Shared LSTM hyper-parameters
hidden_dim = 256
output_dim = 5
# Mixed precision is only switched on for CUDA; on CPU autocast and the
# gradient scaler are explicit no-ops.
use_amp = device.type == "cuda"


def _train_lstm(net, loader, optim, grad_scaler, loss_fn, run_name, checkpoint_path, epochs=5):
    """Train ``net`` on ``loader`` and save its state dict to ``checkpoint_path``.

    Prints the mean batch loss after every epoch, prefixed with ``run_name``.
    """
    net.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in loader:
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = net(input_ids)
                loss = loss_fn(outputs, labels)
            grad_scaler.scale(loss).backward()
            grad_scaler.step(optim)
            grad_scaler.update()
            total_loss += loss.item()
        print(f"{run_name} Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(loader):.4f}")
    # The real-time inference cell loads this file; without it the trained
    # weights are lost as soon as the model variable is re-created.
    torch.save(net.state_dict(), checkpoint_path)


# Setting up the "undersampled" LSTM model (trained on the resampled df_balanced rows)
model_lstm_under = LSTMClassifier(embedding_dim, hidden_dim, output_dim, len(vocab) + 1, embedding_matrix).to(device)
optimizer = torch.optim.Adam(model_lstm_under.parameters(), lr=0.001)
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)
class_counts = df_balanced['Sentiment'].value_counts().sort_index()
# Inverse-frequency class weights, ordered by class label.
class_weights = torch.tensor([1.0 / count for count in class_counts], dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Preparing training data for undersampled model
train_dataset = SentimentDataset(train_texts, train_labels, word_to_idx=vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)

# Training loop for undersampled LSTM with 5 epochs
_train_lstm(model_lstm_under, train_loader, optimizer, scaler, criterion,
            "Undersampling LSTM", 'lstm_under_model.pth')

# Setting up the class-weighted LSTM model
model_lstm_weighted = LSTMClassifier(embedding_dim, hidden_dim, output_dim, len(vocab_full) + 1, embedding_matrix_full).to(device)
optimizer_weighted = torch.optim.Adam(model_lstm_weighted.parameters(), lr=0.001)
scaler_weighted = torch.amp.GradScaler('cuda', enabled=use_amp)

# Class weights for the full dataset
class_counts_full = df['Sentiment'].value_counts().sort_index()
class_weights_full = torch.tensor([1.0 / count for count in class_counts_full], dtype=torch.float).to(device)
criterion_weighted = nn.CrossEntropyLoss(weight=class_weights_full)

# Preparing training data for class-weighted model
train_dataset_full = SentimentDataset(train_texts_full, train_labels_full, word_to_idx=vocab_full)
train_loader_full = DataLoader(train_dataset_full, batch_size=32, shuffle=True, num_workers=2)

# Training loop for class-weighted LSTM
_train_lstm(model_lstm_weighted, train_loader_full, optimizer_weighted, scaler_weighted, criterion_weighted,
            "Class-Weighted LSTM", 'lstm_weighted_model.pth')

# Function to evaluate LSTM models with combined metrics
def evaluate_lstm(model, texts, labels, word_to_idx, model_name):
    """Score an LSTM classifier and print combined regression/classification metrics.

    The expected score (probability-weighted class index + 1) is compared with
    the true score through MSE / MAE / R²; the argmax class is compared with
    the true class through accuracy and weighted precision / recall / F1.

    Args:
        model: module mapping a batch of token-index tensors to logits.
        texts: evaluation texts.
        labels: true scores; ``SentimentDataset`` shifts them down by one.
        word_to_idx: token -> index mapping handed to ``SentimentDataset``.
        model_name: label used in the printed report header.

    Returns:
        None. Results are printed.
    """
    eval_dataset = SentimentDataset(texts, labels, word_to_idx=word_to_idx)
    eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False, num_workers=2)
    use_amp = device.type == 'cuda'
    model.eval()
    all_preds_regression, all_preds_classification, true_classes = [], [], []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(input_ids)
            # Softmax in float32: autocast can hand back half-precision logits.
            probabilities = torch.softmax(outputs.float(), dim=1).cpu().numpy()
            class_indices = np.arange(probabilities.shape[1])
            all_preds_regression.extend(probabilities @ class_indices + 1)
            all_preds_classification.extend(np.argmax(probabilities, axis=1))
            # Separate name so the ``labels`` argument is not overwritten per batch.
            true_classes.extend(int(cls) for cls in batch['labels'].numpy())
    all_labels = [cls + 1 for cls in true_classes]

    # Regression metrics to see how close predictions are to true values
    mse = mean_squared_error(all_labels, all_preds_regression)
    mae = mean_absolute_error(all_labels, all_preds_regression)
    r2 = r2_score(all_labels, all_preds_regression)

    # Classification metrics for exact class matches
    # (zero_division=0: same value as the default, without the warning)
    accuracy = accuracy_score(true_classes, all_preds_classification)
    precision = precision_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    recall = recall_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    f1 = f1_score(true_classes, all_preds_classification, average='weighted', zero_division=0)
    print(f"\n=== {model_name} Combined Metrics ===")
    print("Regression Metrics:")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    print("Classification Metrics:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print("Sample Predictions vs True Labels:")
    for i in range(min(5, len(all_preds_regression))):
        print(f"Pred: {all_preds_regression[i]:.2f}, True: {all_labels[i]:.2f}")

# Evaluating both LSTM models, each with the vocabulary and split it was trained with
for net, eval_texts, eval_labels, eval_vocab, run_name in (
    (model_lstm_under, val_texts, val_labels, vocab, "LSTM Undersampled"),
    (model_lstm_weighted, val_texts_full, val_labels_full, vocab_full, "LSTM Class-Weighted"),
):
    evaluate_lstm(net, eval_texts, eval_labels, eval_vocab, run_name)

In [None]:
# === Real-Time Gold Sentiment and Trading Signals (DistilBERT, MiniLM, and LSTM) ===

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading tokenizers for my transformer models
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer_mini = AutoTokenizer.from_pretrained('microsoft/MiniLM-L12-H384-uncased')


def _restore_model(var_name, build_fn, checkpoint_path):
    """Return ``(model, source)`` for one classifier.

    Resolution order:
      1. ``checkpoint_path`` exists -> fresh model from ``build_fn`` with the
         saved state dict loaded (source ``"checkpoint"``).
      2. otherwise, if the notebook namespace already holds an ``nn.Module``
         under ``var_name``, that object is kept (source ``"memory"``). It is
         assumed to be the model trained earlier in this session.
      3. otherwise a fresh model from ``build_fn`` (source ``"untrained"``).

    Previously the in-memory model was overwritten unconditionally, so a
    missing checkpoint silently threw away the training done above.
    """
    try:
        state_dict = torch.load(checkpoint_path, map_location=device)
    except FileNotFoundError:
        in_memory = globals().get(var_name)
        if isinstance(in_memory, nn.Module):
            return in_memory.to(device), "memory"
        return build_fn(), "untrained"
    restored = build_fn()
    restored.load_state_dict(state_dict)
    return restored, "checkpoint"


def _build_distilbert():
    """Fresh DistilBERT classifier with five output labels, on ``device``."""
    return DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5).to(device)


def _build_minilm():
    """Fresh MiniLM classifier with five output labels, on ``device``."""
    return AutoModelForSequenceClassification.from_pretrained('microsoft/MiniLM-L12-H384-uncased', num_labels=5).to(device)


# Loading DistilBERT models
# - two versions were trained: "undersampled" and class-weighted
# - both emit five logits (num_labels=5)
model, _source_under = _restore_model('model', _build_distilbert, 'distilbert_under_model.pth')
model_weighted, _source_weighted = _restore_model('model_weighted', _build_distilbert, 'distilbert_weighted_model.pth')
_distilbert_sources = (_source_under, _source_weighted)
if "untrained" in _distilbert_sources:
    print("Warning: DistilBERT trained weights not found. Predictions will be random unless trained.")
elif "memory" in _distilbert_sources:
    print("DistilBERT checkpoint(s) not found on disk; using the models already trained in this session.")
else:
    print("DistilBERT models loaded with trained weights.")

# Loading MiniLM model and weights
model_mini, _source_mini = _restore_model('model_mini', _build_minilm, 'minilm_model.pth')
if _source_mini == "untrained":
    print("Warning: MiniLM trained weights not found. Predictions will be random unless trained.")
elif _source_mini == "memory":
    print("MiniLM checkpoint not found on disk; using the model already trained in this session.")
else:
    print("MiniLM model loaded with trained weights.")

# Setting up LSTM parameters
# NOTE(review): these presumably have to match the values used when the LSTM
# checkpoints were trained, otherwise load_state_dict shapes will not agree — confirm.
embedding_dim = 100
hidden_dim = 256
output_dim = 5

# Preparing embedding matrices for LSTM for undersampled and full vocabulary
# These are all-zero placeholders sized from the vocabularies; they replace any
# embedding_matrix / embedding_matrix_full already bound in the kernel.
try:
    embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
    embedding_matrix_full = np.zeros((len(vocab_full) + 1, embedding_dim))
except NameError:
    # Reached when vocab or vocab_full is not defined in this kernel. Only the
    # vocabularies are given fallbacks here; a matrix whose line raised (and
    # any after it) is left unassigned.
    print("Vocab not defined.")
    vocab = {}
    vocab_full = {}
# Reusing the LSTM class
class LSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM classifier (same architecture as used for training).

    Args:
        embedding_dim: width of each embedding vector (LSTM input size).
        hidden_dim: hidden size per LSTM direction.
        output_dim: number of output logits.
        vocab_size: accepted for call-site symmetry; the embedding table size
            is taken from ``embedding_matrix`` itself.
        embedding_matrix: array-like of shape (rows, embedding_dim) used to
            initialise the trainable embedding layer.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, embedding_matrix):
        super().__init__()
        initial_weights = torch.tensor(embedding_matrix, dtype=torch.float)
        self.embedding = nn.Embedding.from_pretrained(initial_weights, freeze=False)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward final states are concatenated before the head.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text):
        """Map a (batch, seq) tensor of token indices to (batch, output_dim) logits."""
        embedded_tokens = self.dropout(self.embedding(text))
        _, (last_hidden, _) = self.lstm(embedded_tokens)
        both_directions = torch.cat([last_hidden[-2], last_hidden[-1]], dim=1)
        return self.fc(self.dropout(both_directions))

# Initializing LSTM models and loading weights
# Checkpoints are read BEFORE any model variable is rebound. Previously both
# LSTMs were re-created first, so a missing checkpoint left freshly
# initialised models in place of the ones trained earlier in the session.
try:
    _lstm_in_memory = (globals().get('model_lstm_under'), globals().get('model_lstm_weighted'))
    try:
        _state_under = torch.load('lstm_under_model.pth', map_location=device)
        _state_weighted = torch.load('lstm_weighted_model.pth', map_location=device)
    except FileNotFoundError:
        if all(isinstance(candidate, nn.Module) for candidate in _lstm_in_memory):
            # Assumed to be the models trained earlier in this session.
            print("LSTM checkpoints not found on disk; using the models already trained in this session.")
        else:
            model_lstm_under = LSTMClassifier(embedding_dim, hidden_dim, output_dim, len(vocab) + 1, embedding_matrix).to(device)
            model_lstm_weighted = LSTMClassifier(embedding_dim, hidden_dim, output_dim, len(vocab_full) + 1, embedding_matrix_full).to(device)
            print("Warning: LSTM trained weights not found. Predictions will be random unless trained.")
    else:
        model_lstm_under = LSTMClassifier(embedding_dim, hidden_dim, output_dim, len(vocab) + 1, embedding_matrix).to(device)
        model_lstm_weighted = LSTMClassifier(embedding_dim, hidden_dim, output_dim, len(vocab_full) + 1, embedding_matrix_full).to(device)
        model_lstm_under.load_state_dict(_state_under)
        model_lstm_weighted.load_state_dict(_state_weighted)
        print("LSTM models loaded with trained weights.")
except NameError:
    # vocab / embedding matrices (or the class itself) are not defined in this kernel.
    print("LSTM models not initialized due to missing vocab or embeddings.")

# Function to fetch news from NewsData.io
def get_newsdata_articles(query):
    """Return up to five article titles from NewsData.io matching ``query``.

    The API key is read from the ``NEWSDATA_API_KEY`` environment variable so
    it does not have to live in the notebook.

    Args:
        query: free-text search string.

    Returns:
        List of non-empty title strings; an empty list on any request failure
        (the error is printed).
    """
    import os

    api_key = os.environ.get("NEWSDATA_API_KEY", "")
    try:
        # params= lets requests URL-encode the query; the timeout prevents the
        # cell from hanging forever on an unresponsive server.
        response = requests.get(
            "https://newsdata.io/api/1/news",
            params={"apikey": api_key, "q": query, "language": "en"},
            timeout=10,
        )
        response.raise_for_status()
        # 'results' may be present but null, hence "or []".
        articles = response.json().get('results') or []
        return [article['title'] for article in articles[:5] if article.get('title')]
    except requests.exceptions.RequestException as e:
        # Covers HTTP errors as well as connection failures and timeouts.
        print(f"NewsData.io Error: {e}. Check API key validity or quota.")
        return []

# Function to fetch news from MarketAux
def get_marketaux_articles(query):
    """Return up to five article titles from MarketAux for the symbol(s) in ``query``.

    The API token is read from the ``MARKETAUX_API_KEY`` environment variable
    so it does not have to live in the notebook.

    Args:
        query: value sent as the ``symbols`` request parameter.

    Returns:
        List of non-empty title strings; an empty list on any request failure
        (the error is printed).
    """
    import os

    api_key = os.environ.get("MARKETAUX_API_KEY", "")
    try:
        # params= URL-encodes symbols such as "GC=F"; the timeout prevents the
        # cell from hanging forever on an unresponsive server.
        response = requests.get(
            "https://api.marketaux.com/v1/news/all",
            params={"symbols": query, "api_token": api_key},
            timeout=10,
        )
        response.raise_for_status()
        # 'data' may be present but null, hence "or []".
        articles = response.json().get('data') or []
        return [article['title'] for article in articles[:5] if article.get('title')]
    except requests.exceptions.RequestException as e:
        # Covers HTTP errors as well as connection failures and timeouts.
        print(f"MarketAux Error: {e}. Check API key or quota.")
        return []

# Function to fetch Reddit posts
def get_reddit_posts(query):
    """Return the titles of up to five Reddit search hits for ``query`` across r/all.

    Credentials are read from the ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_SECRET``
    and ``REDDIT_USER_AGENT`` environment variables.

    Args:
        query: search string.

    Returns:
        List of post titles; an empty list on any failure (the error is printed).
    """
    import os

    try:
        # Client construction sits inside the try so that a configuration
        # error is reported like any other failure instead of aborting the cell.
        reddit = praw.Reddit(client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
                             client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
                             user_agent=os.environ.get("REDDIT_USER_AGENT", ""))
        subreddit = reddit.subreddit("all")
        posts = subreddit.search(query, limit=5)
        return [post.title for post in posts]
    except Exception as e:
        print(f"Reddit Error: {e}. Check credentials or connectivity.")
        return []

# Collecting latest gold-related texts from all sources (gold as a placeholder, works on commodities in general)
gold_texts = []
gold_texts.extend(get_newsdata_articles("gold commodity"))
gold_texts.extend(get_marketaux_articles("GC=F"))
gold_texts.extend(get_reddit_posts("gold commodity"))
# A feed entry can come back without a usable title; the tokenizers only accept strings.
gold_texts = [text for text in gold_texts if isinstance(text, str) and text.strip()]

# Sequence length the LSTM inputs are padded to during training
# (the SentimentDataset default max_length).
LSTM_TRAIN_MAX_LEN = 64


# Prediction function for transformer models
def predict_transformer(model, tokenizer, texts):
    """Return the expected sentiment score (probability-weighted class index + 1) per text."""
    model.eval()
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    with torch.no_grad():
        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            outputs = model(input_ids, attention_mask=attention_mask)
        # Softmax in float32: autocast can hand back half-precision logits.
        probabilities = torch.softmax(outputs.logits.float(), dim=1).cpu().numpy()
    return list(probabilities @ np.arange(probabilities.shape[1]) + 1)


# Prediction function for LSTM
def predict_lstm(model, texts, word_to_idx, max_len=128):
    """Return the expected sentiment score per text for an LSTM classifier.

    Texts are lower-cased and split with ``word_tokenize`` — the same
    preprocessing ``SentimentDataset`` applies for training — so punctuation
    does not turn known words into unknown tokens. Unknown tokens and padding
    both use index 0.
    """
    model.eval()
    padded = []
    for text in texts:
        token_ids = [word_to_idx.get(token, 0) for token in word_tokenize(text.lower())][:max_len]
        padded.append(token_ids + [0] * (max_len - len(token_ids)))
    input_ids = torch.tensor(padded, dtype=torch.long).to(device)
    with torch.no_grad():
        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            outputs = model(input_ids)
        probabilities = torch.softmax(outputs.float(), dim=1).cpu().numpy()
    return list(probabilities @ np.arange(probabilities.shape[1]) + 1)


# Signal mappings
def get_signal(pred):
    """Map a sentiment score to a signal: above 3.5 Buy, below 2.5 Sell, otherwise Hold."""
    if pred > 3.5:
        return "Buy"
    elif pred < 2.5:
        return "Sell"
    else:
        return "Hold"


if not gold_texts:
    print("No gold-related texts fetched. Verify API keys and network connection.")
else:
    print("\n=== Latest Gold Commodity Texts ===")
    for i, text in enumerate(gold_texts, 1):
        print(f"{i}. {text}")

    # Prediction and signal generation
    models = {
        "DistilBERT Undersampled": (model, tokenizer),
        "DistilBERT Class-Weighted": (model_weighted, tokenizer),
        "MiniLM Undersampled": (model_mini, tokenizer_mini),
        "LSTM Undersampled": (model_lstm_under, vocab),
        "LSTM Class-Weighted": (model_lstm_weighted, vocab_full)
    }

    print("\n=== Trading Signals for Gold Commodity ===")
    for model_name, (model_obj, tok_or_vocab) in models.items():
        if "LSTM" in model_name:
            # Pad to the training length rather than the function default.
            preds = predict_lstm(model_obj, gold_texts, tok_or_vocab, max_len=LSTM_TRAIN_MAX_LEN)
        else:
            preds = predict_transformer(model_obj, tok_or_vocab, gold_texts)
        signals = [get_signal(pred) for pred in preds]
        avg_pred = np.mean(preds)
        avg_signal = get_signal(avg_pred)
        print(f"\n{model_name}:")
        for i, (text, pred, signal) in enumerate(zip(gold_texts, preds, signals), 1):
            print(f"{i}. {text}")
            print(f"   Predicted Sentiment: {pred:.2f}, Signal: {signal}")
        print(f"Average Sentiment: {avg_pred:.2f}, Average Signal: {avg_signal}")

In [None]:
# === Backtesting and Returns Calculation ===

# Section banner so the backtesting output is easy to locate in the cell output.
print("\n=== Backtesting and Returns Calculation ===\n")

# Fetching current price and monthly change from Trading Economics
def get_price_data(ticker):
    """Scrape the current gold price and its monthly change from Trading Economics.

    Args:
        ticker: currently unused — the gold commodity page is always fetched.
            Kept so existing call sites continue to work.

    Returns:
        ``(current_price, monthly_change)``. The change is divided by 100 when
        the scraped cell contains a percent sign. ``(None, None)`` is returned
        on any failure, after printing the reason.
    """
    url = "https://tradingeconomics.com/commodity/gold"
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        # Without a timeout an unresponsive server would hang the cell indefinitely.
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Looking for the first table that mentions gold
        target_table = None
        for table in soup.find_all('table'):
            if 'Gold' in table.get_text():
                target_table = table
                break
        if not target_table:
            print("No table containing 'Gold' found.")
            return None, None

        # Column positions are read from the header row, then reused on the data row.
        header_row = target_table.find('tr')
        header_text = [th.get_text(strip=True) for th in header_row.find_all('th')]
        price_index = header_text.index('Price')
        month_index = header_text.index('Month')

        gold_row = None
        for row in target_table.find_all('tr'):
            cells = row.find_all('td')
            if cells and cells[0].get_text(strip=True) == 'Gold':
                gold_row = row
                break
        if not gold_row:
            print("Gold row not found.")
            return None, None

        tds = gold_row.find_all('td')
        price_text = tds[price_index].get_text(strip=True)
        monthly_change_text = tds[month_index].get_text(strip=True)
        try:
            current_price = float(price_text.replace(',', '').replace('$', ''))
            is_percentage = '%' in monthly_change_text
            # '−' is the Unicode minus sign, which float() does not accept.
            cleaned_change = monthly_change_text.replace(',', '').replace('+', '').replace('−', '-').replace('%', '')
            monthly_change = float(cleaned_change) / 100 if is_percentage else float(cleaned_change)
        except ValueError as e:
            print(f"Error converting to float: {e}")
            return None, None
        return current_price, monthly_change
    except Exception as e:
        # Network errors, a missing 'Price'/'Month' column, or a short row all end up here.
        print(f"Error fetching price data: {e}")
        return None, None

"""
    Getting user input for signals and investment
    - I’m asking for signals from 15 days ago to simulate past decisions
    - Using numbers to make input easier (1=Buy, 2=Sell, 3=Hold)
    """
current_price, change_15_days = get_price_data("GC=F")


def _ask_signal(prompt):
    """Prompt until the user enters 1, 2 or 3 and return it as an int."""
    while True:
        answer = input(prompt).strip()
        if answer in ("1", "2", "3"):
            return int(answer)
        print("Please enter 1 (Buy), 2 (Sell) or 3 (Hold).")


def _ask_investment(prompt):
    """Prompt until the user enters a positive amount and return it as a float."""
    while True:
        try:
            amount = float(input(prompt))
        except ValueError:
            amount = 0.0
        # A zero amount would make the percentage calculation divide by zero.
        if amount > 0:
            return amount
        print("Please enter a positive amount.")


if current_price is None or change_15_days is None:
    print("Unable to proceed with backtesting due to missing price data.")
else:
    print("Enter the trading signal from 15 days ago for each model (1 = Buy, 2 = Sell, 3 = Hold):")

    # Mapping numeric signals to trading actions
    signal_map = {1: "Go Long", 2: "Go Short", 3: "Hold"}
    signals = {}
    for backtest_name in (
        "DistilBERT Undersampled",
        "DistilBERT Class-Weighted",
        "LSTM Undersampled",
        "LSTM Class-Weighted",
        "MiniLM Undersampled",
        "MiniLM Class-Weighted",
    ):
        signals[backtest_name] = signal_map[_ask_signal(f"{backtest_name} Signal: ")]
    investment = _ask_investment("Enter amount invested (in USD): ")

    # Calculating returns using the monthly change as a proxy for 15-day change
    price_15_days_ago = current_price / (1 + change_15_days)
    print(f"\nPrice 15 Days Ago (Estimated): ${price_15_days_ago:,.2f}")
    print(f"Current Price: ${current_price:,.2f}")
    print(f"15-Day Change (Using Monthly Proxy): {change_15_days:.2%}")

    # The loop variable is deliberately NOT called `model`: that name is the
    # DistilBERT model in the notebook namespace and must not be overwritten.
    for model_name, signal in signals.items():
        print(f"\n{model_name} (Signal: {signal}):")
        if signal == "Go Long":
            returns = investment * (current_price / price_15_days_ago - 1)
            print(f"Returns: ${returns:,.2f} ({(returns/investment)*100:.2f}%)")
            print("Correct" if change_15_days > 0 else "Incorrect")
        elif signal == "Go Short":
            # A short gains what the price loses, relative to the entry price:
            # (entry - exit) / entry, i.e. the exact negative of the long return.
            returns = investment * (1 - current_price / price_15_days_ago)
            print(f"Returns: ${returns:,.2f} ({(returns/investment)*100:.2f}%)")
            print("Correct" if change_15_days < 0 else "Incorrect")
        else:
            returns = 0
            print(f"Returns: ${returns:,.2f} (0.00%)")
            # Hold counts as correct when the price moved by less than 1%.
            print("Correct" if abs(change_15_days) < 0.01 else "Incorrect")