In [None]:
!pip install transformers
!pip install datasets

In [2]:
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

from datasets import load_dataset

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_MASKED_LM_MAPPING,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process

In [3]:
tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-FinBert-SC")
model = AutoModelForSequenceClassification.from_pretrained("snunlp/KR-FinBert-SC")

In [4]:
INPUT_FILE = '/content/drive/MyDrive/Project3/2.Preprocess/out/0.7data_for_transformer.csv' #change

datasets = load_dataset('csv', data_files=INPUT_FILE)



  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# split data - train, validation
datasets["validation"] = load_dataset(
                'csv', 
                data_files=INPUT_FILE,
                split='train[:30%]',
            )
datasets["train"] = load_dataset(
                'csv', 
                data_files=INPUT_FILE,
                split="train[30%:]",
            )



In [6]:
column_names = datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

In [7]:
max_seq_length = min(512, tokenizer.model_max_length)

In [8]:
## When using line_by_line, we just tokenize each nonempty line.

padding = "max_length" # "max_length" if data_args.pad_to_max_length else False

def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
            return tokenizer(
                examples["text"],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            #num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],   # ?
            #load_from_cache_file=not data_args.overwrite_cache,
        )



In [9]:
# Tokenizing
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

In [10]:
from torch.utils.data import DataLoader

dataloader_train = DataLoader(train_dataset, batch_size=4)
dataloader_train

<torch.utils.data.dataloader.DataLoader at 0x7f91e1f033a0>

In [11]:
# train_features, train_labels = next(iter(dataloader_train))
# print(f"Feature batch shape: {train_features.size()}")
# print(f"Labels batch shape: {train_labels.size()}")

In [12]:
# Data collator
# This one will take care of randomly masking the tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [13]:
OUTPUT_PATH = '/content/drive/MyDrive/Project3/3.Classification' # change

training_args = TrainingArguments(
    output_dir=OUTPUT_PATH+'/out',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=0,               # number of warmup steps for learning rate scheduler
    weight_decay=0.0,               # strength of weight decay
    logging_dir='/logs',            # directory for storing logs
    logging_steps=500,               # How often to print logs
    do_train=True,                   # Perform training
    do_eval=True,                    # Perform evaluation
    evaluation_strategy="epoch",     # evalute after eachh epoch
    gradient_accumulation_steps=1,  # total number of steps before back propagation
    #fp16=True,                       # Use mixed precision
    #fp16_opt_level="01",             # mixed precision mode
    run_name="First trial",       # experiment name
    seed=42                           # Seed for experiment reproducibility 3x3
)



In [14]:
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    #data_collator=data_collator,
)

In [15]:
## train
last_checkpoint = get_last_checkpoint('/content/drive/MyDrive/Project3/3.Classification/out/')
train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
trainer.save_model()  # Saves the tokenizer too for easy upload
metrics = train_result.metrics

# max_train_samples = (
#     data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset
#     )
#     metrics["train_samples"] = min(max_train_samples, len(train_dataset))

#     trainer.log_metrics("train", metrics)
#     trainer.save_metrics("train", metrics)
#     trainer.save_state()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 589
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 222
  Number of trainable parameters = 101403651
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
## eval
metrics = trainer.evaluate()

max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
perplexity = math.exp(metrics["eval_loss"])
metrics["perplexity"] = perplexity

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)