In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# PATHS and HYPERPARAMETERS =====
TRAIN_FILE = r"E:\Research-8177\fiqa_train.csv"
TEST_FILE  =r"E:\Research-8177\fiqa_test.csv"

MODEL_NAME = "yiyanghkust/finbert-tone"
SAVE_DIR = "E:/Research-8177/finbert_fiqa_final"


# Hyperparameters for the fine-tuning process
EPOCHS = 3
LR = 2e-5
BATCH_SIZE = 16

# GAN hyperparameters
GAN_EPOCHS = 50
NOISE_DIM = 100
EMBED_DIM = 768
SYNTHETIC_SAMPLES = 2000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# DATASET CLASSES =====
class FiqaDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_len=128):
        self.df = pd.read_csv(file_path)
        self.texts = self.df["text"].astype(str).tolist()
        self.labels = self.df["label"].astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

class SyntheticDataset(Dataset):
    def __init__(self, synthetic_embeddings, num_labels=3):
        self.synthetic_embeddings = synthetic_embeddings
        self.synthetic_labels = torch.randint(0, num_labels, (len(synthetic_embeddings),), dtype=torch.long)

    def __len__(self):
        return len(self.synthetic_embeddings)

    def __getitem__(self, idx):
        return {
            "embeddings": self.synthetic_embeddings[idx],
            "labels": self.synthetic_labels[idx]
        }

# LOAD FINBERT & TOKENIZER =====
try:
    tokenizer = BertTokenizer.from_pretrained(SAVE_DIR)
    finbert = BertForSequenceClassification.from_pretrained(SAVE_DIR)
    print(f"Loaded FinBERT model and tokenizer from: {SAVE_DIR}")
except OSError:
    print(f"Model not found at {SAVE_DIR}. Loading base model from {MODEL_NAME}.")
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    config = BertConfig.from_pretrained(MODEL_NAME, num_labels=3)
    finbert = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

finbert.to(device)

#  GAN MODEL CLASSES =====
class Generator(nn.Module):
    def __init__(self, noise_dim=NOISE_DIM, embed_dim=EMBED_DIM):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(noise_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 1024),
            nn.LeakyReLU(0.2),
            nn.Linear(1024, embed_dim),
        )
    def forward(self, z):
        return self.model(z)

class Discriminator(nn.Module):
    def __init__(self, embed_dim=EMBED_DIM):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(embed_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )
    def forward(self, x):
        return self.model(x)

#GAN TRAINING FUNCTIONS =====
def get_embeddings(texts, model, tokenizer, batch_size=16):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128).to(device)
            outputs = model.bert(**enc)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.detach().cpu())
    return torch.cat(embeddings, dim=0)

def train_gan(real_embeddings, epochs=GAN_EPOCHS, batch_size=64, noise_dim=NOISE_DIM):
    generator = Generator(noise_dim=noise_dim).to(device)
    discriminator = Discriminator().to(device)
    criterion = nn.BCELoss()
    optim_G = optim.Adam(generator.parameters(), lr=2e-4)
    optim_D = optim.Adam(discriminator.parameters(), lr=2e-4)

    dataset = torch.utils.data.DataLoader(real_embeddings, batch_size=batch_size, shuffle=True)
    print("Starting GAN training...")
    for epoch in range(epochs):
        for real in dataset:
            real = real.to(device)
            current_batch_size = real.size(0)
            real_labels = torch.ones(current_batch_size, 1).to(device)
            fake_labels = torch.zeros(current_batch_size, 1).to(device)

            # Train Discriminator ---
            z = torch.randn(current_batch_size, noise_dim).to(device)
            fake = generator(z)
            d_real = discriminator(real)
            d_fake = discriminator(fake.detach())
            loss_D = criterion(d_real, real_labels) + criterion(d_fake, fake_labels)
            optim_D.zero_grad()
            loss_D.backward()
            optim_D.step()

            #Train Generator ---
            z = torch.randn(current_batch_size, noise_dim).to(device)
            fake = generator(z)
            d_fake = discriminator(fake)
            loss_G = criterion(d_fake, real_labels)
            optim_G.zero_grad()
            loss_G.backward()
            optim_G.step()
        
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Loss D: {loss_D.item():.4f} | Loss G: {loss_G.item():.4f}")
    
    print("GAN training complete.")
    return generator

def generate_synthetic_embeddings(generator, n_samples=SYNTHETIC_SAMPLES, noise_dim=NOISE_DIM):
    generator.eval()
    z = torch.randn(n_samples, noise_dim).to(device)
    with torch.no_grad():
        fake_embeddings = generator(z)
    return fake_embeddings.cpu()

def train_model(model, real_loader, synthetic_loader, epochs, lr):
    model.train()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    real_iter = iter(real_loader)
    synthetic_iter = iter(synthetic_loader)

    print("Starting fine-tuning...")
    for epoch in range(epochs):
        total_loss = 0
        num_batches = 0

        # Alternate between real and synthetic data batches
        while True:
            real_batch = next(real_iter, None)
            if real_batch is not None:
                # Train on real data
                inputs = {k: v.to(device) for k, v in real_batch.items()}
                optimizer.zero_grad()
                outputs = model(**inputs)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                num_batches += 1
            
            synthetic_batch = next(synthetic_iter, None)
            if synthetic_batch is not None:
                # Train on synthetic data
                embeddings = synthetic_batch["embeddings"].to(device)
                labels = synthetic_batch["labels"].to(device)
                
                optimizer.zero_grad()
                
                # Get the classifier head
                classifier = model.classifier
                # Forward pass through classifier
                logits = classifier(embeddings)
                
                loss = loss_fn(logits.view(-1, model.config.num_labels), labels.view(-1))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                num_batches += 1

            if real_batch is None and synthetic_batch is None:
                break
        
        # Reset iterators for next epoch
        real_iter = iter(real_loader)
        synthetic_iter = iter(synthetic_loader)

        avg_loss = total_loss / num_batches if num_batches > 0 else 0
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

#  MAIN PIPELINE EXECUTION =====
if __name__ == "__main__":
    df_train = pd.read_csv(TRAIN_FILE)
    fiqa_texts = df_train["text"].astype(str).tolist()
    fiqa_labels = df_train["label"].astype(int).tolist()

    # GAN DATA AUGMENTATION ---
    print("Step 1: Extracting embeddings from real data...")
    real_embeddings = get_embeddings(fiqa_texts, finbert, tokenizer)

    print("Step 2: Training GAN to generate synthetic embeddings...")
    generator = train_gan(real_embeddings, epochs=GAN_EPOCHS)
    
    print("Step 3: Generating synthetic embeddings...")
    synthetic_embeddings = generate_synthetic_embeddings(generator, n_samples=SYNTHETIC_SAMPLES)
    print(f"Generated {len(synthetic_embeddings)} synthetic embeddings.")

    #FINETUNING WITH AUGMENTED DATA ---
    print("Step 4: Preparing data for fine-tuning...")
    real_dataset = FiqaDataset(TRAIN_FILE, tokenizer)
    synthetic_dataset = SyntheticDataset(synthetic_embeddings)
    test_dataset = FiqaDataset(TEST_FILE, tokenizer)

    real_loader = DataLoader(real_dataset, batch_size=BATCH_SIZE, shuffle=True)
    synthetic_loader = DataLoader(synthetic_dataset, batch_size=BATCH_SIZE, shuffle=True)

    print(f"Total real training samples: {len(real_dataset)}")
    print(f"Total synthetic samples: {len(synthetic_dataset)}")

    print("Step 5: Fine-tuning FinBERT on the augmented data...")
    train_model(finbert, real_loader, synthetic_loader, EPOCHS, LR)

    #SAVE FINAL MODEL =====
    print("Step 6: Saving the fine-tuned model...")
    os.makedirs(SAVE_DIR, exist_ok=True)
    finbert.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
    print(f" Final model saved at {SAVE_DIR}")

    #EVALUATION =====
    print("\n--- Starting Evaluation on Test Set ---")
    finbert.eval()
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = finbert(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted', zero_division=0)
    acc = accuracy_score(all_labels, all_preds)

    print(f"\nEvaluation Results on Test Set:")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 Score: {f1:.4f}")

    print("\nTraining and evaluation complete.")

Using device: cuda
Loaded FinBERT model and tokenizer from: E:/Research-8177/finbert_fiqa_final
Step 1: Extracting embeddings from real data...
Step 2: Training GAN to generate synthetic embeddings...
Starting GAN training...
Epoch 10/50 | Loss D: 0.1414 | Loss G: 5.5034
Epoch 20/50 | Loss D: 0.0055 | Loss G: 4.3198
Epoch 30/50 | Loss D: 0.9582 | Loss G: 6.3785
Epoch 40/50 | Loss D: 0.0817 | Loss G: 3.7977
Epoch 50/50 | Loss D: 0.9463 | Loss G: 3.6760
GAN training complete.
Step 3: Generating synthetic embeddings...
Generated 2000 synthetic embeddings.
Step 4: Preparing data for fine-tuning...
Total real training samples: 4673
Total synthetic samples: 2000
Step 5: Fine-tuning FinBERT on the augmented data...
Starting fine-tuning...
Epoch 1/3, Average Loss: 0.8504
Epoch 2/3, Average Loss: 0.5605
Epoch 3/3, Average Loss: 0.4872
Step 6: Saving the fine-tuned model...
 Final model saved at E:/Research-8177/finbert_fiqa_final

--- Starting Evaluation on Test Set ---

Evaluation Results on T