In [None]:
import os
import time

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import wandb
from sklearn.metrics import accuracy_score, f1_score, precision_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
def encoded_data(tokenizer, data, max_length=128):
    """Tokenize the ``content`` column of ``data`` into fixed-length tensors.

    Args:
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts a list of strings plus the keyword arguments used below.
        data: Table-like object whose ``data['content']`` exposes ``tolist()``
            (a pandas DataFrame in this notebook).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Whatever the tokenizer returns for the batch; with a Hugging Face
        tokenizer this is an encoding holding ``input_ids`` and
        ``attention_mask`` as PyTorch tensors.
    """
    texts = data['content'].tolist()

    # Calling the tokenizer directly is the supported replacement for the
    # deprecated ``batch_encode_plus`` and takes the same keyword arguments.
    return tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

def create_dataset(encoded_data, data, device):
    """Bundle token ids, attention masks and labels into one TensorDataset.

    Args:
        encoded_data: Mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        data: Table-like object whose ``data['labels']`` exposes ``tolist()``.
        device: Device every tensor is moved to before the dataset is built.

    Returns:
        A ``TensorDataset`` yielding ``(input_ids, attention_mask, labels)``,
        with the labels converted to float.
    """
    columns = [
        encoded_data['input_ids'],
        encoded_data['attention_mask'],
        torch.tensor(data['labels'].tolist()).float(),
    ]
    on_device = [column.to(device) for column in columns]
    return TensorDataset(*on_device)

def _evaluate(model, val_loader, device):
    """Run one full pass over ``val_loader`` and return validation metrics.

    Returns:
        ``(avg_loss, accuracy, micro_f1)`` where ``avg_loss`` is the mean of the
        per-batch losses and the two scores are computed once over every
        prediction in the split.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss, num_batches = 0.0, 0
    label_chunks, pred_chunks = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, total=len(val_loader), desc='training_eval', position=0):
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs[0].item()
            num_batches += 1

            # One independent sigmoid per label, thresholded at 0.5.
            # NOTE(review): this presumes the label columns are binary (0/1) - confirm.
            preds = (torch.sigmoid(outputs[1]) > 0.5).float()
            label_chunks.append(labels.cpu().numpy())
            pred_chunks.append(preds.cpu().numpy())

    if num_batches == 0:
        raise ValueError("val_loader produced no batches; cannot compute validation metrics")

    # Score the whole split at once: summing per-batch scores and dividing by
    # the number of samples (as was done before) deflates both metrics.
    all_labels = np.concatenate(label_chunks)
    all_preds = np.concatenate(pred_chunks)
    accuracy = accuracy_score(all_labels.flatten(), all_preds.flatten())
    micro_f1 = f1_score(all_labels, all_preds, average='micro')

    return total_loss / num_batches, accuracy, micro_f1


def trainer(train_loss, val_losses, val_accuracies, num_epochs, train_loader, val_dataset, val_loader, model, optimizer, scheduler, device, log_every=250, eval_every=2000):
    """Fine-tune ``model`` and periodically evaluate it on the validation split.

    Args:
        train_loss: List extended in place with the training loss of the step
            at which each evaluation ran.
        val_losses: List extended in place with the average validation loss.
        val_accuracies: List extended in place with the validation accuracy.
        num_epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_dataset: Unused; kept so existing calls keep working.
        val_loader: Iterable of validation batches in the same layout.
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output holds the loss at index 0 and the logits at index 1.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device each batch is moved to.
        log_every: Log the training loss to wandb every this many steps
            (must be positive).
        eval_every: Run validation every this many steps (must be positive).
            Step 0 of every epoch matches, so validation also runs at the
            start of each epoch.

    Returns:
        None. Results are reported through the three lists, wandb and stdout.

    Raises:
        ValueError: if ``val_loader`` yields no batches when validation runs.
    """
    # Training Electra model
    for epoch in range(num_epochs):
        for i, batch in tqdm(enumerate(train_loader), total=len(train_loader), desc=f"epoch {epoch}", position=0):
            # Validation switches the model to eval mode, so re-enable training
            # behaviour (dropout etc.) on every step.
            model.train()
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            scheduler.step()

            if i % log_every == 0:
                wandb.log({"loss": loss.item()})

            if i % eval_every == 0:
                # Evaluate the model
                avg_val_loss, avg_val_acc, avg_val_f1m = _evaluate(model, val_loader, device)

                wandb.log({"avg_val_loss": avg_val_loss, "avg_val_acc": avg_val_acc, "avg_val_f1m": avg_val_f1m})

                # Record the *training* loss of this step (previously the loss
                # of the last validation batch was stored here by mistake).
                train_loss.append(loss.item())
                val_losses.append(avg_val_loss)
                val_accuracies.append(avg_val_acc)

                print("\n--------------------------------------------")
                print('Epoch {:} / {:}'.format(epoch + 1, num_epochs))
                print("Training loss: ", loss.item())
                print("Validation loss: ", avg_val_loss)
                print("Validation accuracy: ", avg_val_acc)
                print("Validation f1m: ", avg_val_f1m)

def testing(model, test_loader, test_data, device):
    start = time.time()
    model.eval()
    acc = 0
    test_loss = 0
    test_steps = 0

    with torch.no_grad(): 
        for batch in test_loader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            logits = outputs[1]
            test_loss += loss.item()
            # acc += (logits.argmax(1) == labels).sum().item()
            acc += accuracy_score(labels.cpu().detach().numpy(), logits.cpu().detach().numpy())
            test_steps += 1

        accuracy = acc / len(test_data)

    print("Test loss", test_loss / test_steps)
    print("Test accuracy: {:.2f}%".format(accuracy*100))
    print("Time",time.time()-start)

# Load ELECTRA model and tokenizer


In [None]:
# Tokenizer for the ELECTRA base discriminator checkpoint
tokenizer = AutoTokenizer.from_pretrained('google/electra-base-discriminator')

# Classifier built on the same checkpoint: three outputs, multi-label objective
model = AutoModelForSequenceClassification.from_pretrained(
    "google/electra-base-discriminator",
    num_labels=3,
    problem_type="multi_label_classification",
)

model.to(device)

# Load dataset (pre-split train / validation / test CSV files)

In [None]:
# Read the already-split train, validation and test sets (pipe-delimited CSV files)
train_data, val_data, test_data = (
    pd.read_csv(csv_path, delimiter='|')
    for csv_path in ("./data/train.csv", "./data/val.csv", "./data/test.csv")
)

In [None]:
# Tokenize the text of each split with the shared tokenizer
train_encoded_data, val_encoded_data, test_encoded_data = (
    encoded_data(tokenizer, split_frame)
    for split_frame in (train_data, val_data, test_data)
)

In [None]:
# Pack the three target columns into one list-valued 'labels' column per split
columns = ["avg_white_pop_pct","avg_median_hh_inc","avg_non_college_pct"]
for split_frame in (train_data, val_data, test_data):
    split_frame['labels'] = split_frame[columns].values.tolist()

In [None]:
# Pair each split's encodings with its labels and place the tensors on `device`
train_dataset, val_dataset, test_dataset = (
    create_dataset(encodings, frame, device)
    for encodings, frame in (
        (train_encoded_data, train_data),
        (val_encoded_data, val_data),
        (test_encoded_data, test_data),
    )
)

In [None]:
# Only the training batches are shuffled; held-out splits keep their file order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader, test_loader = (
    DataLoader(held_out, shuffle=False, batch_size=32)
    for held_out in (val_dataset, test_dataset)
)

# Fine-tuning

In [None]:
# AdamW plus a linear learning-rate decay (no warm-up) spanning every training step
num_epochs = 5
total_steps = num_epochs * len(train_loader)
optimizer = optim.AdamW(params=model.parameters(), lr=6.68561343998775e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Start the Weights & Biases run that the training loop logs to.
wandb.init(
    project="news-nlp",

    config={
        # Take the value from the optimizer cell so the logged config cannot
        # drift from the number of epochs actually trained.
        "epochs": num_epochs,
        "model": 'electra',
    }
)
wandb_flag = True

In [None]:
# Training Electra model
# The three histories are filled in place by trainer() each time it evaluates.
train_loss, val_losses, val_accuracies = [], [], []

start = time.time()
trainer(
    train_loss, val_losses, val_accuracies,
    num_epochs, train_loader, val_dataset, val_loader,
    model, optimizer, scheduler, device,
)
print("Time",time.time()-start)

In [None]:
# Finish the Weights & Biases run started earlier in the notebook.
wandb.finish()
# Reset the flag that was set to True when the run was started;
# nothing in the visible cells reads it.
wandb_flag = False

In [None]:
# save the model
# Create the target directory first so a missing folder cannot make the save
# fail after the whole training run has finished.
os.makedirs('electra_models', exist_ok=True)
torch.save(model.state_dict(), 'electra_models/transformerELECTRA-1.pt')

#### Test model on test set

In [None]:
#Reload model
# The architecture must match the one that produced the checkpoint (three
# labels, multi-label objective); a two-label head has a differently shaped
# classifier layer, so load_state_dict() would reject the saved weights.
model = AutoModelForSequenceClassification.from_pretrained(
    "google/electra-base-discriminator",
    problem_type="multi_label_classification",
    num_labels=3)
model.load_state_dict(torch.load('electra_models/transformerELECTRA-1.pt', map_location=device))

model.to(device)

In [None]:
# Print test loss, test accuracy and elapsed time for the reloaded model.
testing(model, test_loader, test_data, device)