In [5]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Clear CUDA cache
torch.cuda.empty_cache()

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")

GPU is available. Using GPU.


In [6]:
# Load data
data = pd.read_csv('./data/orientation/orientation-gb-train.tsv', delimiter='\t')

# Check for and handle NaN values
data = data.dropna(subset=['text', 'label'])

# Ensure labels are integers
data['label'] = data['label'].astype(int)

train_texts = data['text'].tolist()
train_labels = data['label'].tolist()

# Split the data into training and test sets (e.g., 80-20 split)
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

# Load BERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertModel.from_pretrained('distilbert-base-cased').to(device)



In [15]:
# Function to get BERT embeddings in batches
def bert_vectorize(texts, tokenizer, model, max_length=256, batch_size=8):
    vectors = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        batch_vectors = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        vectors.append(batch_vectors)
    return np.concatenate(vectors, axis=0)

# Adjust sequence length if necessary
max_length = 256

# Generate BERT embeddings for training and test data
train_vectors = bert_vectorize(train_texts, tokenizer, model, max_length=max_length)
test_vectors = bert_vectorize(test_texts, tokenizer, model, max_length=max_length)

In [16]:
# Convert to PyTorch tensors
train_vectors = torch.tensor(train_vectors, dtype=torch.float32).to(device)
train_labels = torch.tensor(train_labels, dtype=torch.long).to(device)
test_vectors = torch.tensor(test_vectors, dtype=torch.float32).to(device)
test_labels = torch.tensor(test_labels, dtype=torch.long).to(device)

# Create TensorDatasets
train_dataset = TensorDataset(train_vectors, train_labels)
test_dataset = TensorDataset(test_vectors, test_labels)

# Create DataLoaders
batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  train_labels = torch.tensor(train_labels, dtype=torch.long).to(device)
  test_labels = torch.tensor(test_labels, dtype=torch.long).to(device)


In [9]:
# Define a simple neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [10]:
# Training and evaluation function
def train_and_evaluate(train_loader, test_loader, input_size, hidden_size, num_classes, num_epochs=5):
    model = SimpleNN(input_size, hidden_size, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_losses = []
    test_f1_scores = []

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for vectors, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            vectors, labels = vectors.to(device), labels.to(device)
            outputs = model(vectors)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * vectors.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_loss)

        # Evaluation
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for vectors, labels in test_loader:
                vectors, labels = vectors.to(device), labels.to(device)
                outputs = model(vectors)
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        epoch_f1 = f1_score(all_labels, all_preds, average='weighted')
        test_f1_scores.append(epoch_f1)

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, F1 Score: {epoch_f1:.4f}')

    return train_losses, test_f1_scores

In [17]:
# Train and evaluate model
input_size = train_vectors.shape[1]
hidden_size = 50
num_classes = len(set(train_labels.cpu().numpy()))

train_losses, test_f1_scores = train_and_evaluate(train_loader, test_loader, input_size, hidden_size, num_classes)

Epoch 1/5: 100%|██████████| 4848/4848 [00:06<00:00, 727.17it/s]


Epoch [1/5], Loss: 0.6002, F1 Score: 0.6650


Epoch 2/5: 100%|██████████| 4848/4848 [00:06<00:00, 693.13it/s]


Epoch [2/5], Loss: 0.5736, F1 Score: 0.6658


Epoch 3/5: 100%|██████████| 4848/4848 [00:06<00:00, 726.40it/s]


Epoch [3/5], Loss: 0.5638, F1 Score: 0.7073


Epoch 4/5: 100%|██████████| 4848/4848 [00:06<00:00, 750.34it/s]


Epoch [4/5], Loss: 0.5549, F1 Score: 0.7018


Epoch 5/5: 100%|██████████| 4848/4848 [00:06<00:00, 732.92it/s]


Epoch [5/5], Loss: 0.5472, F1 Score: 0.7083
