In [None]:
import pickle as pickle
import os
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig
from my_load_data import *
import numpy as np
import random

def compute_metrics(pred):
    """Compute evaluation accuracy for a prediction object.

    Reads ``pred.predictions`` and ``pred.label_ids``, takes the argmax of the
    predictions along the last axis as the predicted class, and scores the
    result against the labels with sklearn's ``accuracy_score``.

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG and
    PyTorch (``torch.manual_seed`` plus both CUDA seeding calls), and sets the
    cuDNN flags ``deterministic=True`` / ``benchmark=False``.
    """
    random.seed(seed)
    np.random.seed(seed)

    # manual_seed_all is the call that matters when several GPUs are in use.
    for apply_seed in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Training cell: fine-tune MODEL_NAME as a 42-label sequence classifier with the
# Hugging Face Trainer, evaluating on a held-out dev split during training.
# Seed all RNGs first so that model init and data handling below are repeatable.
seed_everything(40)

# load model and tokenizer
MODEL_NAME = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register four marker strings as additional special tokens; the embedding matrix
# is resized to len(tokenizer) further down to match.
tokenizer.add_special_tokens({"additional_special_tokens":["[ENT1]", "[END1]", "[ENT2]", "[END2]"]})

# load dataset
# load_data / tokenized_dataset / RE_Dataset come from `my_load_data` (not visible here).
# NOTE(review): the second positional argument (0.1) is presumably the dev-split
# ratio and `entity=False` presumably disables entity-marker insertion — confirm
# against my_load_data.
train_dataset, dev_dataset = load_data("/opt/ml/input/data/train/train.tsv", 0.1, entity=False)
# train_dataset, dev_dataset = load_data("/opt/ml/input/data/train/train.tsv", 0)
#dev_dataset = load_data("./dataset/train/dev.tsv")

# Pull the label column out as plain arrays for the dataset wrappers below.
train_label = train_dataset['label'].values
dev_label = dev_dataset['label'].values

# tokenizing dataset
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

# make dataset for pytorch.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)
RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# setting model hyperparameter
auto_config = AutoConfig.from_pretrained(MODEL_NAME)
# auto_config.attention_probs_dropout_prob = 0.5
auto_config.num_labels = 42
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=auto_config) 
# model.parameters
# Resize the embedding matrix to the tokenizer length after the special tokens were added.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Many more options exist beyond the ones used here.
# See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for the full list.
# NOTE(review): label_smoothing_factor=0.5 is an unusually large value — confirm it is intended.
training_args = TrainingArguments(
output_dir='./results/xlm',          # output directory
save_total_limit=5,              # number of total save model.
save_steps=500,                 # model saving step.
num_train_epochs=15,              # total number of training epochs
learning_rate=1e-5,               # learning_rate
per_device_train_batch_size=32,  # batch size per device during training
per_device_eval_batch_size=32,   # batch size for evaluation
gradient_accumulation_steps=1,
label_smoothing_factor=0.5,
warmup_steps=500,                # number of warmup steps for learning rate scheduler
weight_decay=0.01,               # strength of weight decay
logging_dir='./logs/xlm',            # directory for storing logs
logging_steps=100,              # log saving step.
evaluation_strategy='steps', # evaluation strategy to adopt during training
                            # `no`: No evaluation during training.
                            # `steps`: Evaluate every `eval_steps`.
                            # `epoch`: Evaluate every end of epoch.
eval_steps = 100,            # evaluation step.
)

trainer = Trainer(
model=model,                         # the instantiated 🤗 Transformers model to be trained
args=training_args,                  # training arguments, defined above
train_dataset=RE_train_dataset,         # training dataset
eval_dataset=RE_dev_dataset,             # evaluation dataset
compute_metrics=compute_metrics         # define metrics function
)

# train model
trainer.train()

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
100,3.5436,3.28378,0.492222,5.2199,172.417
200,3.2567,3.25087,0.492222,5.2178,172.485
300,3.2656,3.22055,0.492222,5.2118,172.684
400,3.2148,3.072523,0.567778,5.2181,172.478
500,3.0096,2.943788,0.658889,5.206,172.876
600,2.894,2.864519,0.721111,5.2079,172.814
700,2.8482,2.873505,0.695556,5.2034,172.964
800,2.7949,2.820904,0.771111,5.2154,172.565
900,2.7783,2.811993,0.741111,5.2396,171.769
1000,2.7664,2.798092,0.754444,5.202,173.01


  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in 

In [7]:
# Set the attention dropout probability on the config object to 0.5.
# NOTE(review): `model` was already instantiated from `auto_config` in the training
# cell; confirm whether changing the config afterwards has any effect on that model.
auto_config.attention_probs_dropout_prob = 0.5

0.5

In [13]:
# Launch training with the `trainer` object built in an earlier cell.
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
500,2.446,2.376332,0.524444,3.5626,505.248
1000,1.8661,1.98933,0.680556,3.5344,509.286
1500,1.642,1.946461,0.710556,3.7204,483.818
2000,1.5051,2.000974,0.731667,3.6645,491.201
2500,1.4321,2.130034,0.692778,3.5551,506.315
3000,1.3806,2.070653,0.734444,3.5737,503.684
3500,1.365,2.124775,0.710556,3.6172,497.622
4000,1.3205,2.159071,0.716111,3.5698,504.227
4500,1.297,2.206514,0.712222,3.5866,501.871
5000,1.2688,2.162366,0.721111,3.5861,501.942


  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
  item = {key: torch.tensor(val[idx]) for key, val in 

TrainOutput(global_step=6750, training_loss=1.5396863923249422, metrics={'train_runtime': 1330.3261, 'train_samples_per_second': 5.074, 'total_flos': 7318578456774000.0, 'epoch': 15.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 56160256, 'train_mem_gpu_alloc_delta': 1368433664, 'train_mem_cpu_peaked_delta': 214994944, 'train_mem_gpu_peaked_delta': 1323940352})

In [4]:
import pickle as pickle
import os
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig
from my_load_data import *
import numpy as np
import random

def compute_metrics(pred):
    """Compute evaluation accuracy for a prediction object.

    Reads ``pred.predictions`` and ``pred.label_ids``, takes the argmax of the
    predictions along the last axis as the predicted class, and scores the
    result against the labels with sklearn's ``accuracy_score``.

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG and
    PyTorch (``torch.manual_seed`` plus both CUDA seeding calls), and sets the
    cuDNN flags ``deterministic=True`` / ``benchmark=False``.
    """
    random.seed(seed)
    np.random.seed(seed)

    # manual_seed_all is the call that matters when several GPUs are in use.
    for apply_seed in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Training cell: fine-tune MODEL_NAME as a 42-label sequence classifier with the
# Hugging Face Trainer. No dev split is built and no evaluation is configured here.
# Seed all RNGs first so that model init and data handling below are repeatable.
seed_everything(40)

# load model and tokenizer
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register four marker strings as additional special tokens; the embedding matrix
# is resized to len(tokenizer) further down to match.
tokenizer.add_special_tokens({"additional_special_tokens":["[ENT1]", "[END1]", "[ENT2]", "[END2]"]})

# load dataset
# load_data / tokenized_dataset / RE_Dataset come from `my_load_data` (not visible here).
# NOTE(review): a single value is bound here, whereas the commented-out call below
# unpacks a (train, dev) pair — confirm what load_data returns when the second
# argument is 0.
# train_dataset, dev_dataset = load_data("/opt/ml/input/data/train/train.tsv")
train_dataset = load_data("/opt/ml/input/data/train/train_c.tsv", 0)
#dev_dataset = load_data("./dataset/train/dev.tsv")

# Pull the label column out as a plain array for the dataset wrapper below.
train_label = train_dataset['label'].values
# dev_label = dev_dataset['label'].values

# tokenizing dataset
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
# tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

# make dataset for pytorch.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)
# RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# setting model hyperparameter
auto_config = AutoConfig.from_pretrained(MODEL_NAME)
auto_config.num_labels = 42
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=auto_config) 
# model.parameters
# Resize the embedding matrix to the tokenizer length after the special tokens were added.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Many more options exist beyond the ones used here.
# See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for the full list.
training_args = TrainingArguments(
output_dir='./results/ko-v3',          # output directory
save_total_limit=10,              # number of total save model.
save_steps=500,                 # model saving step.
num_train_epochs=10,              # total number of training epochs
learning_rate=5e-6,               # learning_rate
per_device_train_batch_size=32,  # batch size per device during training
per_device_eval_batch_size=32,   # batch size for evaluation
gradient_accumulation_steps=1,
label_smoothing_factor=0.1,
warmup_steps=500,                # number of warmup steps for learning rate scheduler
weight_decay=0.01,               # strength of weight decay
logging_dir='./logs/ko-v3',            # directory for storing logs
logging_steps=100,              # log saving step.
# evaluation_strategy='steps', # evaluation strategy to adopt during training
                            # `no`: No evaluation during training.
                            # `steps`: Evaluate every `eval_steps`.
                            # `epoch`: Evaluate every end of epoch.
# eval_steps = 500,            # evaluation step.
)

# compute_metrics is still passed, but no eval_dataset is supplied in this run.
trainer = Trainer(
model=model,                         # the instantiated 🤗 Transformers model to be trained
args=training_args,                  # training arguments, defined above
train_dataset=RE_train_dataset,         # training dataset
# eval_dataset=RE_dev_dataset,             # evaluation dataset
compute_metrics=compute_metrics         # define metrics function
)

# train model
trainer.train()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


KeyboardInterrupt: 

In [4]:
# Display the tokenizer's repr to inspect its settings and registered special tokens.
tokenizer

PreTrainedTokenizerFast(name_or_path='monologg/koelectra-base-v3-discriminator', vocab_size=35000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[ENT1]', '[/ENT1]', '[ENT2]', '[\\ENT2]']})

In [11]:
# Display the bound `forward` method of the embeddings module; its repr also
# prints the embeddings module it is bound to.
model.electra.embeddings.forward

<bound method ElectraEmbeddings.forward of ElectraEmbeddings(
  (word_embeddings): Embedding(35004, 768)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)>

In [16]:
# Turn off gradient tracking for every parameter under `model.electra`.
for weight in model.electra.parameters():
    weight.requires_grad_(False)

In [10]:
# Peek at the encoded training example at index 10.
RE_train_dataset[10]

{'input_ids': tensor([    2,  9355,     3,  7001,     3,  6380,   146,  9853,  4556,  4073,
          4034, 10726,  4007,  8648,  4034,  6364,    16,  6226,  4007,  6695,
          4139,  6364,    16,  6315,  4444,  4147,  4292,  6260, 21609,  4007,
          6483,  6318,  4292,  6311,  6364,    16,  6299, 35000,  9355,    63,
            41, 28744,  4057,    65,  3232,  8867,  7796,   147,  2670,   146,
          6594,  4006,  4112, 35002,  7001,    63,    41, 28744,  4097,    65,
            16,  7744,  6283,  4047,  6261,  6284,  4219,  4195,  3771,  4820,
          7001,    16,  3342,  6318, 10728,  3201,  4279,  4034,  7001, 24387,
           147,  2075,  6374,  4398,  4176,    18,     3,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 

In [2]:
# Show the raw `sentence` field of row 10 in train_dataset.
train_dataset.iloc[10].sentence

'심상정 정의당 대표는 21일 "국민들께선 이번 총선에서 지지부진한 정치권의 이합집산과 꼼수정치를 심판하고 진정한 정치혁명의 길을 열어 달라"고 말했다.'

In [8]:
# Decode the input ids of training examples 0..8999 back into text.
a = [
    tokenizer.decode(RE_train_dataset[row]['input_ids'])
    for row in range(9000)
]

In [11]:
# Scan the decoded strings for the longest one: M ends up as its character
# count and idx as the string itself (the first one wins on ties).
M = 0
for i in a:
    if len(i) <= M:
        continue
    M, idx = len(i), i
M

555

In [17]:
# Tokenize the raw sentence of row 10 directly and show its input ids as a PyTorch tensor.
tokenizer(train_dataset.iloc[10].sentence, return_tensors='pt')['input_ids']

tensor([[    2, 21150, 12541,  6270,  4034,  6591,  4366,     6,  6318,  4006,
          5253,  4207,  6294,  7562,  4073,  4129, 20677,  4283,  6315,  4046,
          4234,  3240,  4280,  4557,  4122,  4047, 16351,  4142,  4332,  4110,
          9064,  4279,  4219,  7499,  4283,  6315, 20351,  4234,  2139,  4292,
          3142,  4025,  7193,     6,  2075,  2633,  4398,  4176,    18,     3]])

In [30]:
# Check the tokenizer length (the value the training cell passes to resize_token_embeddings).
len(tokenizer)

35000

In [32]:
# List the tokenizer's special-token mapping.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [31]:
# model