In [None]:
#%pip install scikit-learn transformers==4.28.0 datasets
#%pip install --upgrade accelerate

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from datasets import load_dataset

In [None]:
# Fetch the IMDb movie-review corpus through Hugging Face `datasets`.
# No split is requested here; later cells index the result by split name ('train').
dataset = load_dataset(path='imdb')

In [None]:
# Carve a 20% hold-out set from the IMDb 'train' split.
# The fixed random_state makes the partition repeatable across runs.
imdb_train = dataset['train']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    imdb_train['text'],
    imdb_train['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Instantiate the tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode both partitions with identical settings (train first, then test):
# over-long reviews are truncated and each call's sequences are padded to a common length.
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, test_texts)
)

In [None]:
# Wrap the label sequence of each partition in a torch tensor (same names are rebound)
train_labels, test_labels = map(torch.tensor, (train_labels, test_labels))

In [None]:
# Build a BERT sequence classifier from the 'bert-base-uncased' checkpoint.
# num_labels=2 sizes the classification head for the two sentiment classes.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [None]:
# Set up the training parameters
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated (it emits a
# FutureWarning and is slated for removal). eps and weight_decay are passed explicitly
# to keep the defaults of the deprecated class (1e-6 and 0.0), since torch's defaults
# differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Prepare datasets
# Convert the tokenizer output of each partition into tensors.
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])
test_input_ids = torch.tensor(test_encodings['input_ids'])
test_attention_mask = torch.tensor(test_encodings['attention_mask'])

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
# The hold-out dataset must wrap the *test* tensors. It was previously built from the
# training tensors, so evaluation ran on training data and produced a prediction count
# that did not match `test_labels`.
test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_mask, test_labels)

In [None]:
# Training batches are reshuffled on every pass. Evaluation batches keep dataset order,
# so predictions collected from `test_loader` stay positionally aligned with the labels.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Training setup: run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Put the model in training mode before the optimisation loop
model.train()

In [None]:
for epoch in range(1):  # adjust the number of epochs as needed
    for batch in train_loader:
        # A batch is an (input_ids, attention_mask, labels) triple; move all three
        # onto the same device as the model.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step before the new backward pass
        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss alongside its other outputs
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

In [None]:
# Evaluation setup: start with an empty prediction list, put the model in evaluation
# mode, and rebuild the loader without shuffling so batches follow dataset order.
predictions = []
model.eval()
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

In [None]:
# Run inference over the evaluation loader and measure accuracy.
# `predictions` is re-initialised here so that re-running this cell does not append a
# second copy of the predictions to the list.
predictions = []
correct = 0
with torch.no_grad():  # gradients are not needed for inference
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        preds = torch.argmax(logits, dim=1).cpu()
        predictions.extend(preds.numpy())
        # Score against the labels carried in the same batch, so predictions and
        # references are aligned by construction (no dependence on a separate label
        # tensor having the same length and order as the loader).
        correct += (preds == labels.cpu()).sum().item()

# Calculate accuracy (guarding against an empty loader)
accuracy = correct / len(predictions) if predictions else float('nan')
print('Accuracy:', accuracy)

In [None]:
# Section marker (Korean): "How to use Hugging Face TrainingArguments (much easier)".
# The string is a bare expression, so the notebook echoes it as this cell's output.
"Huggingface TrainingArgument 사용법 (훨씬 쉬움)"

In [None]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

# Rebind `tokenizer` and `model` using the Auto* factory classes; both are loaded from
# the same checkpoint, and the classifier is again configured for two labels.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Hyper-parameters and bookkeeping options handed to the Trainer
training_args = TrainingArguments(
    # --- where artefacts go ---
    output_dir='./results',            # checkpoints / trained model
    logging_dir='./logs',              # training logs
    # --- schedule ---
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # --- batch sizes (per device) ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # --- reporting ---
    logging_steps=500,                 # log the training loss every 500 steps
    evaluation_strategy='epoch',       # evaluate after each epoch (the Trainer needs an eval dataset for this)
)

In [None]:
# Define a function to tokenize and preprocess the text
def preprocess_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``'text'`` key. This function is passed to
            ``Dataset.map(..., batched=True)`` below, so the value is presumably
            a list of review strings.

    Returns:
        Whatever ``tokenizer`` returns when called with ``padding='max_length'``
        and ``truncation=True``.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

In [None]:
# Load and preprocess the dataset
train_dataset = load_dataset('imdb', split='train')
train_dataset = train_dataset.map(preprocess_function, batched=True)

# Evaluation data: the IMDb 'test' split, tokenized the same way as the training data.
# The Trainer needs this because `evaluation_strategy='epoch'` and `trainer.evaluate()`
# both raise a ValueError when no evaluation dataset is available.
eval_dataset = load_dataset('imdb', split='test')
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Instantiate the Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Start the training
# Runs the fine-tuning configured by `training_args`. The call is the last expression
# in the cell, so its return value is displayed as the cell output.
trainer.train()

In [None]:
# Evaluate the trained model
# Runs evaluation on the Trainer's configured evaluation dataset and keeps the returned
# metrics in `eval_results` for inspection.
eval_results = trainer.evaluate()

# Show the collected metrics
print(eval_results)