In [None]:
import os
# Pin the notebook to GPU index 5 unless the caller (shell, job scheduler, ...)
# has already chosen a device. setdefault keeps an existing value instead of
# silently overriding it. This runs in the first cell, before torch is imported.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '5')

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score


# Experiment presets, one per language. Each preset bundles the tab-separated
# train/test rating files, the pretrained checkpoint to fine-tune and the
# directory that receives the results, so the four settings cannot get out of
# sync when switching languages.
EXPERIMENTS = {
    "en": {
        "train_path": './data/ratings_en_train.txt',
        "test_path": './data/ratings_en_test.txt',
        "model": "openai-community/gpt2",
        "output_dir": "./results/imdb",
    },
    "ko": {
        "train_path": './data/ratings_ko_train.txt',
        "test_path": './data/ratings_ko_test.txt',
        "model": "skt/kogpt2-base-v2",
        "output_dir": "./results/nsmc",
    },
}

# Set to "ko" to run the Korean experiment instead.
LANGUAGE = "en"
_experiment = EXPERIMENTS[LANGUAGE]

# Later cells read the 'document' and 'label' columns of both frames.
train = pd.read_csv(_experiment["train_path"], delimiter='\t')
test = pd.read_csv(_experiment["test_path"], delimiter='\t')

# Checkpoint name of the pretrained model to fine-tune.
model = _experiment["model"]

# Destination for checkpoints and the final saved model.
output_dir = _experiment["output_dir"]

<h3>Setting Hyperparameters</h3>

In [None]:
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over the last axis) and ``label_ids``
            (the reference labels).

    Returns:
        dict: A single entry, ``"eval_accuracy"``, holding the accuracy score.
    """
    predicted_classes = p.predictions.argmax(-1)
    return {"eval_accuracy": accuracy_score(p.label_ids, predicted_classes)}

# Preprocess the dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of raw texts (entries are passed through
            ``str()`` before tokenization).
        labels: Indexable collection of integer class labels, aligned with
            ``texts``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            and ``"attention_mask"`` tensors.
        max_length: Length every sample is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        The dict has the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"`` (a ``torch.long`` scalar).
        """
        # str() guards against non-string entries in the text collection.
        raw_text, target = str(self.texts[idx]), self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # The tokenizer output presumably carries a leading batch axis of
        # size 1; squeeze(0) drops it so each sample is a flat sequence.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample


# `model` holds the checkpoint name when this cell first runs, but the cell
# rebinds it to the loaded network further down. Recover the checkpoint name
# in either case so the cell can be re-run without restarting the kernel.
model_name = model if isinstance(model, str) else model.config.name_or_path

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token; the datasets below
# pad every sample to max_length, which requires a pad token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Create the dataset objects
max_length = 256  # Adjust according to your model's maximum input length
train_dataset = TextDataset(train['document'].tolist(), train['label'].tolist(), tokenizer, max_length=max_length)
# The "validation" set is built from the test frame.
val_dataset = TextDataset(test['document'].tolist(), test['label'].tolist(), tokenizer, max_length=max_length)

# Initialize the model (binary classification head)
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Mirror the tokenizer's padding choice in the model config.
model.config.pad_token_id = model.config.eos_token_id

# Keep the embedding matrix in step with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Define the training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=16,   # 8 * 16 = 128 samples per optimizer step, per device
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): newer transformers releases reportedly rename this
    # argument to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    save_total_limit=2,
    eval_steps=100,
    # Key produced by compute_metrics.
    metric_for_best_model="eval_accuracy"
)

In [None]:
# Wire the model, its training configuration, both datasets, the tokenizer
# and the accuracy metric into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

<h3>Training</h3>

In [None]:
# Fine-tune the model on train_dataset using the settings in training_args.
trainer.train()

In [None]:
# Evaluate the model on the Trainer's eval_dataset (val_dataset, which is
# built from the test frame).
results = trainer.evaluate()

# 'eval_accuracy' is the key returned by compute_metrics; 'eval_loss' is
# presumably added by the Trainer itself.
print("Validation Loss:", results['eval_loss'])
print("Validation Accuracy:", results['eval_accuracy'])

In [None]:
# Persist the fine-tuned model. No path is given, so presumably it is written
# to training_args.output_dir - the inference pipeline later loads from output_dir.
trainer.save_model()

<h3>Evaluation</h3>

In [None]:
from transformers import pipeline

# Build a text-classification pipeline, loading both the model and the
# tokenizer from output_dir.
classifi = pipeline('text-classification',model=output_dir, tokenizer=output_dir)
def classify_review(input):
    """Classify each review and print its predicted sentiment and confidence.

    Args:
        input: A single review string or an iterable of review strings.
            (The parameter name shadows the built-in ``input`` inside this
            function; it is kept so existing callers are unaffected.)

    Returns:
        None. One line is printed per review, in Korean, of the form
        "<review>: <score> % 의 확률로 <SENTIMENT>입니다."
        (roughly: "it is <SENTIMENT> with a probability of <score> %").
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(input, str):
        input = [input]

    # Loop-invariant mapping from the pipeline's label names to readable
    # names, so it is built once instead of once per review.
    id2label = {'LABEL_0': "NEGATIVE", 'LABEL_1': "POSITIVE"}

    for line in input:
        # The pipeline returns a list; its first item carries 'label' and 'score'.
        prediction = classifi(line)[0]
        print(line+":",round(prediction['score']*100,2),"% 의 확률로 "+id2label[prediction['label']]+"입니다.")


In [None]:
# Sample reviews passed to classify_review; the commented-out list is the
# Korean counterpart for the Korean model.
# NOTE(review): the name `input` shadows Python's built-in input() for the
# rest of the session - consider renaming it together with the call that uses it.
input = ["This movie is so boring","Very good movie", "I fall asleep during the movie", "The best movie of my life"]
#input = ["이 영화 너무 재미있어요!","너무 실망스러운 영화", "보다가 졸려서 잠들었어요", "시간 가는 줄 모르고 본 영화"]

In [None]:
# Print the predicted sentiment and confidence for each sample review.
classify_review(input)