# BERT

## Plain BERT (`bert-base-uncased`) with its stock pre-trained tokenizer (no custom-trained tokenizer), fine-tuned for classification, mostly default hyperparameters

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.utils.class_weight import compute_class_weight


In [None]:
# Read the training and validation splits from CSV files in the working directory
train_df, valid_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'valid.csv'))

# Wrap each DataFrame in a Hugging Face Dataset so it can be mapped/tokenized later
train_dataset, valid_dataset = map(Dataset.from_pandas, (train_df, valid_df))

In [None]:

# Load the tokenizer published with the 'bert-base-uncased' checkpoint
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Encode the ``'text'`` field of ``examples`` with the module-level ``tokenizer``.

    The tokenizer is called with ``truncation=True`` and
    ``padding='max_length'``; its output is returned unchanged so it can be
    merged into the dataset by ``Dataset.map``.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Tokenize both splits in batches, keeping the train/valid order
tokenized_train_dataset, tokenized_valid_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset)
)

In [None]:
# Expose only the columns used for training, formatted as PyTorch tensors
torch_columns = ('input_ids', 'attention_mask', 'label')
for tokenized_split in (tokenized_train_dataset, tokenized_valid_dataset):
    # A fresh list per call so the two datasets never share one list object
    tokenized_split.set_format('torch', columns=list(torch_columns))

In [None]:
from transformers import Trainer, TrainingArguments

# Size the classification head from the number of distinct values in the training labels
distinct_labels = train_df['label'].unique()
num_labels = len(distinct_labels)

# Load the 'bert-base-uncased' weights into a sequence-classification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

# Training arguments
# NOTE: `load_best_model_at_end=True` requires the save strategy to match the
# evaluation strategy; `save_strategy` defaults to 'steps', so it is set to
# 'epoch' explicitly here. Without it TrainingArguments raises a ValueError.
# NOTE(review): recent Transformers releases deprecate `evaluation_strategy`
# in favour of `eval_strategy` — rename if the installed version requires it.
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    evaluation_strategy='epoch',     # evaluate once per epoch
    save_strategy='epoch',           # checkpoint once per epoch (must match evaluation)
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,     # reload the best checkpoint after training
)




In [None]:
# Compute metrics
import numpy as np

# `load_metric` was deprecated and later removed from `datasets`; import it
# defensively so this cell still runs on installations that no longer ship it.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None

# Accuracy metric object when `load_metric` is available, otherwise None
# (compute_metrics then falls back to a NumPy computation).
metric = load_metric('accuracy') if load_metric is not None else None


def compute_metrics(p):
    """Return the accuracy of the predictions in ``p`` as ``{'accuracy': value}``.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (reference labels).

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    # Predicted class = index of the highest score in each row
    preds = np.argmax(p.predictions, axis=1)
    if metric is not None:
        return metric.compute(predictions=preds, references=p.label_ids)
    # Fallback: fraction of predictions equal to the reference labels
    return {'accuracy': float(np.mean(preds == np.asarray(p.label_ids)))}

# Bundle the model, training configuration, both tokenized splits and the
# metric callback, then hand them to the Trainer in one call
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': tokenized_train_dataset,
    'eval_dataset': tokenized_valid_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Fine-tune the model using the Trainer configured above
trainer.train()

# Run a final evaluation pass (no dataset is passed here, so the Trainer uses
# the eval_dataset it was constructed with) and print the resulting metrics
eval_result = trainer.evaluate()
print(eval_result)