In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Read the review CSV as a single "train" split, then discard rows whose
# "review" field is missing so later text processing never sees None.
raw_reviews = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
)
dataset = raw_reviews.filter(lambda row: row["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [7]:
# Hold out 20% of the rows as a test split. The seed is fixed so that the
# train/test membership (and every metric computed from it) is identical on
# each Restart & Run All instead of changing with every execution.
# NOTE: this rebinds `dataset` from the un-split dataset to the split result,
# so re-run the loading cell before re-running this one.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1553
    })
})

In [8]:
import torch
# Load the tokenizer from the local "./rbt3" directory (presumably a
# pre-downloaded checkpoint -- confirm the directory exists before running).
# process_function below reads this module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("./rbt3")

def process_function(exmaples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        exmaples: A batch mapping that provides a "review" entry (the text
            passed to the tokenizer) and a "label" entry.

    Returns:
        The tokenizer's output for the reviews, truncated and padded to a
        fixed length of 128, with the batch's "label" values stored under
        the "labels" key.
    """
    encoded = tokenizer(
        exmaples["review"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    # Copy "label" to "labels" on the tokenized batch (presumably the key the
    # downstream model/Trainer expects -- confirm against the training cell).
    encoded["labels"] = exmaples["label"]
    return encoded

# Tokenize both splits in batches and drop the original columns so that only
# the fields produced by process_function remain.
source_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    process_function,
    batched=True,
    remove_columns=source_columns,
)
tokenized_dataset

Map: 100%|██████████| 6212/6212 [00:00<00:00, 8931.04 examples/s]
Map: 100%|██████████| 1553/1553 [00:00<00:00, 8440.65 examples/s] 


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1553
    })
})