In [5]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.28.0
    Uninstalling transformers-4.28.0:
      Successfully uninstalled transformers-4.28.0
Successfully installed transformers-4.28.1


In [41]:
import pandas as pd
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
import numpy as np
from torch.utils.data.dataloader import default_collate




In [19]:
# Load the data
data = pd.read_csv('/Users/toshiniagrawal/Desktop/data/preprocessed_dataset/trainn.csv')


In [20]:
# Preprocess the data
labels = np.array(data['labels'].tolist())
sentences = np.array(data['text'].tolist())

In [30]:
def preprocess_function(examples):
    # create a list of dictionaries
    processed_inputs = []
    for example in examples:
        processed_inputs.append({
            "text": example.text_a,
            "label": example.label
        })
    return processed_inputs


In [21]:
# Split the dataset into training and validation sets
train_size = int(0.8 * len(sentences))
train_sentences, val_sentences = sentences[:train_size], sentences[train_size:]
train_labels, val_labels = labels[:train_size], labels[train_size:]

In [22]:
def encode_dataset(tokenizer, dataset, limit=None):
    encoded_dataset = []
    for i, example in enumerate(dataset):
        if limit is not None and i >= limit:
            break
        encoded_example = tokenizer(
            example["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        encoded_example["labels"] = example["labels"]
        encoded_dataset.append(encoded_example)
    return encoded_dataset


In [23]:
# Tokenize the data
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
train_encodings = tokenizer(train_sentences.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_sentences.tolist(), truncation=True, padding=True)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [24]:
# Convert the data to PyTorch tensors
train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']),
                                               torch.tensor(train_encodings['attention_mask']),
                                               torch.tensor(train_labels))
val_dataset = torch.utils.data.TensorDataset(torch.tensor(val_encodings['input_ids']),
                                             torch.tensor(val_encodings['attention_mask']),
                                             torch.tensor(val_labels))



In [25]:
# Define the model
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [26]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

In [36]:
def my_data_collator(features):
    batch = {}
    for feature in features:
        for key, value in feature.items():
            if key not in batch:
                batch[key] = []
            batch[key].append(value)
    batch = {key: torch.stack(value, dim=0) for key, value in batch.items()}
    return batch


In [37]:
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=my_data_collator,
    #compute_metrics=lambda pred: {'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1)), 
                                  #'f1': f1_score(pred.label_ids, pred.predictions.argmax(-1))}
)

In [38]:
def get_data_loader(tokenizer, data_dir, batch_size=32, max_seq_length=128):
    dataset = load_dataset("csv", data_files={"trainn": os.path.join(data_dir, "trainn.csv")})["train"]

    def preprocess_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_seq_length)

    dataset = dataset.map(preprocess_function, batched=True)

    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=lambda x: x)

    return data_loader


In [40]:
# Train the model
trainer.train()

AttributeError: 'tuple' object has no attribute 'items'

In [31]:
# Evaluate the model on the validation set
predictions = trainer.predict(val_dataset)
print('Accuracy:', accuracy_score(val_labels, predictions.predictions.argmax(-1)))
print('F1 score:', f1_score(val_labels, predictions.predictions.argmax(-1)))


TypeError: vars() argument must have __dict__ attribute