In [1]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from sklearn.metrics import accuracy_score


# Load the dataset
df_train = pd.read_csv("train.csv")
df_eval = df_train.iloc[0:200,:]

# Load the pre-trained BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
    logging_steps=100,
    load_best_model_at_end=False, 
    metric_for_best_model='accuracy',
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# Define the training and evaluation datasets
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, prompt=None):
        self.df = df
        self.tokenizer = tokenizer
        self.prompt = prompt

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row['text']
        if self.prompt is not None:
            prompt = self.prompt.format(row['keyword']) 
            text = f"{prompt} {text}"
        inputs = self.tokenizer.encode_plus(text, max_length=128, padding='max_length', truncation=True, return_tensors='pt')
        labels = torch.tensor(row['target'])
        return {'input_ids': inputs['input_ids'][0], 'attention_mask': inputs['attention_mask'][0], 'labels': labels}

    def __len__(self):
        return len(self.df)



train_dataset = MyDataset(df_train, tokenizer,prompt="What is the {} mentioned in the tweet?")
eval_dataset = MyDataset(df_eval, tokenizer,prompt="What is the {} mentioned in the tweet?")

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
  
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()



2023-04-24 01:10:01.248618: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if yo

Step,Training Loss,Validation Loss
200,0.4949,0.246502
400,0.4075,0.208443


{'eval_loss': 0.20967704057693481,
 'eval_runtime': 3.8333,
 'eval_samples_per_second': 52.175,
 'eval_steps_per_second': 1.826,
 'epoch': 1.0}