In [2]:
import os
import csv
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import copy
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import numpy as np
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("albert-xxlarge-v2")
model = AutoModelForSequenceClassification.from_pretrained("albert-xxlarge-v2", num_labels=2)
model = torch.nn.DataParallel(model, output_device=6)
model.to(device)


for name, param in model.named_parameters():
     if 'albert.embeddings' in name or 'embedding' in name:
        param.requires_grad = False

for name, param in model.named_parameters():
     print(name, param.requires_grad)

# Load the data
train = pd.read_csv('pnli_train.csv', encoding='utf-8', header=None)
dev = pd.read_csv('pnli_dev.csv', encoding='utf-8', header=None)

train = np.array(train)
dev = np.array(dev)
precondition_train, statement_train, label_train = train[:, 0], train[:, 1], train[:, 2]
precondition_dev, statement_dev, label_dev = dev[:,0], dev[:,1], dev[:,2]

# Combine precondition and statement for training and dev sets
X_train = []
for pre, sta in zip(precondition_train, statement_train):
    X_train.append(f"{pre} {sta}")

X_train = [str(X) for X in X_train]
y_train = [int(y) for y in label_train]


X_dev = []
for pred, state in zip(precondition_dev, statement_dev):
    X_dev.append(f"{pred} {state}")

X_dev = [str (X) for X in X_dev]
y_dev = [int(y) for y in label_dev]



# Tokenize input sentences
X_train_encoded = tokenizer(X_train, padding=True, truncation=True, return_tensors="pt", max_length=128)
X_dev_encoded = tokenizer(X_dev, padding=True, truncation=True, return_tensors="pt", max_length=128)


# Create DataLoader for train and dev sets
train_dataset = TensorDataset(X_train_encoded["input_ids"], X_train_encoded["attention_mask"], torch.tensor(y_train))
dev_dataset = TensorDataset(X_dev_encoded["input_ids"], X_dev_encoded["attention_mask"], torch.tensor(y_dev))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=8, drop_last=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, num_workers=8, shuffle=True, drop_last=True)

# Fine-tune the model
optimizer = torch.optim.RAdam(model.parameters(), lr=2e-5, weight_decay=0.01)

# Set up warm-up steps
total_steps = len(train_loader) * 5  # 10 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=total_steps // 10, num_training_steps=total_steps)

base_dev_loss = 1000
for epoch in range(10):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss = loss.mean()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate

    # Evaluate on dev set
    model.eval()
    dev_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dev_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            dev_loss += outputs.loss.mean().item()
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            correct += (predicted_labels == labels).sum().item()
            total += len(labels)

    accuracy = correct / total
    print(f"Epoch {epoch+1}: Dev Loss = {dev_loss/len(dev_loader):.4f}, Accuracy = {accuracy:.4f}")

    if base_dev_loss > dev_loss:
        print("Replace best model")
        base_dev_loss=dev_loss
        best_model = copy.deepcopy(model)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xxlarge-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


module.albert.embeddings.word_embeddings.weight False
module.albert.embeddings.position_embeddings.weight False
module.albert.embeddings.token_type_embeddings.weight False
module.albert.embeddings.LayerNorm.weight False
module.albert.embeddings.LayerNorm.bias False
module.albert.encoder.embedding_hidden_mapping_in.weight False
module.albert.encoder.embedding_hidden_mapping_in.bias False
module.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight True
module.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias True
module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight True
module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias True
module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight True
module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias True
module.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.weig

In [5]:
print('Loading best model')
model = best_model

model.eval()
dev_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for batch in dev_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        dev_loss += outputs.loss.item()
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        correct += (predicted_labels == labels).sum().item()
        total += len(labels)

accuracy = correct / total
print(f"Final: Dev Loss = {dev_loss/len(dev_loader):.4f}, Accuracy = {accuracy:.4f}")


# Make predictions on the unlabeled test set
test = pd.read_csv('pnli_test_unlabeled.csv', encoding='utf-8', header=None)
test = np.array(test)
X_test = []
precondition_test, statement_test = test[:,0], test[:,1]
for pred, state in zip(precondition_test, statement_test):
    X_test.append(f"{pred} {state}")

X_test = [str (X) for X in X_test]
X_test_encoded = tokenizer(X_test, padding=True, truncation=True, return_tensors="pt", max_length=128)
test_dataset = TensorDataset(X_test_encoded["input_ids"], X_test_encoded["attention_mask"])
test_loader = DataLoader(test_dataset, batch_size=50, num_workers=8)

# Predict on test set
results = []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        results.extend(predicted_labels.cpu().numpy())

results = [int(x) for x in results]
with open('upload_predictions_albert.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')

Loading best model
Final: Dev Loss = 0.2423, Accuracy = 0.9183
