In [None]:
import os

# CUDA-related environment variables; they are set in the first cell, before
# torch is imported further down.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid and
# presumably slows training - confirm it is still wanted.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [None]:
%cd drive/MyDrive/data

/content/drive/MyDrive/data


In [None]:
!pip install datasets transformers huggingface_hub -q

[K     |████████████████████████████████| 451 kB 33.4 MB/s 
[K     |████████████████████████████████| 5.5 MB 86.4 MB/s 
[K     |████████████████████████████████| 182 kB 77.1 MB/s 
[K     |████████████████████████████████| 115 kB 86.2 MB/s 
[K     |████████████████████████████████| 212 kB 90.4 MB/s 
[K     |████████████████████████████████| 127 kB 85.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 70.3 MB/s 
[?25h

In [None]:
import os
import tqdm
import pandas as pd
import torch
import datasets

from glob import glob
from tqdm import tqdm

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [None]:
# Checkpoint to fine-tune.
model_name = 'tae898/emoberta-large'

# Training hyperparameters used by the dataloaders, optimizer and training loop.
batch_size, learning_rate, num_epochs = 64, 5e-5, 4

In [None]:
import pandas as pd

# Read the labelled training data and the unlabelled test data from the
# current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
from sklearn.model_selection import train_test_split

# Integer class id for every emotion name in the 'Target' column; the list
# position is the id (neutral=0 ... fear=6).
_emotion_to_id = {
    emotion: class_id
    for class_id, emotion in enumerate(
        ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']
    )
}
train['labels'] = train['Target'].map(_emotion_to_id)

# Hold out 20% of the labelled rows for validation (shuffled, fixed seed).
t_data, v_data = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
from datasets import Dataset

# Convert each pandas frame into a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (t_data, v_data, test)
)

# Keep only the columns the model needs, renaming 'Utterance' to 'text'.
# The test split has no labels.
_split_columns = {
    "train": {'text': train_dataset['Utterance'], 'labels': train_dataset['labels']},
    "valid": {'text': val_dataset['Utterance'], 'labels': val_dataset['labels']},
    "test": {'text': test_dataset['Utterance']},
}
raw_dataset = datasets.DatasetDict(
    {split: Dataset.from_dict(columns) for split, columns in _split_columns.items()}
)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 7991
    })
    valid: Dataset({
        features: ['text', 'labels'],
        num_rows: 1998
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2610
    })
})

In [None]:
# Tokenizer belonging to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize the "text" field of a batch of examples.

    Sequences longer than 128 tokens are truncated. No padding is applied
    here on purpose: `data_collator` (created below) pads every mini-batch to
    the length of its own longest sequence, so padding inside `map` would only
    add tokens that the collator cannot remove again.
    """
    return tokenizer(example["text"], truncation=True, max_length=128)

# Tokenize all splits in batches; padding is deferred to the collator.
tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading:   0%|          | 0.00/408 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
# Display the tokenized splits to check which columns `map` added.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 7991
    })
    valid: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1998
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 2610
    })
})

In [None]:
# Drop the raw "text" column so only model inputs (and labels) remain.
# NOTE(review): re-running this cell presumably fails because "text" has
# already been removed from `tokenized_datasets` - confirm.
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
# Switch the output format to "torch" for the dataloaders built afterwards.
tokenized_datasets.set_format("torch")
# Show the remaining column names of the training split.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'attention_mask']

In [None]:
from torch.utils.data import DataLoader

# Only the training split is shuffled. Validation and test are iterated in
# dataset order so evaluation is deterministic and test predictions line up
# with the rows of the test file.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
valid_dataloader = DataLoader(tokenized_datasets["valid"], shuffle=False, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["test"], shuffle=False, batch_size=batch_size, collate_fn=data_collator)

In [None]:
# Pull one batch from the training loader and show the shape of each tensor
# as a sanity check of the collator.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([64]),
 'input_ids': torch.Size([64, 62]),
 'attention_mask': torch.Size([64, 62])}

In [None]:
# Load the checkpoint with a 7-way classification head (one output per emotion).
# NOTE(review): the label ids assigned in the mapping cell presumably need to
# match the checkpoint's own id2label order - confirm against its config.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7)

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
from transformers import get_scheduler

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are passed explicitly so the hyperparameters stay the same as
# the transformers implementation's defaults (1e-6 and 0.0).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# One scheduler step is taken per optimizer step, so the schedule spans every
# batch of every epoch. Linear decay, no warmup.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

500




In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch contains 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update weights, advance the LR schedule, then clear gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Save model and tokenizer after every epoch under a per-epoch directory.
    checkpoint_dir = f"./result/{model_name}/{epoch}"
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
!pip install torchmetrics -q

[?25l[K     |▋                               | 10 kB 33.9 MB/s eta 0:00:01[K     |█▎                              | 20 kB 31.6 MB/s eta 0:00:01[K     |█▉                              | 30 kB 39.8 MB/s eta 0:00:01[K     |██▌                             | 40 kB 24.0 MB/s eta 0:00:01[K     |███                             | 51 kB 25.8 MB/s eta 0:00:01[K     |███▊                            | 61 kB 29.6 MB/s eta 0:00:01[K     |████▎                           | 71 kB 27.8 MB/s eta 0:00:01[K     |█████                           | 81 kB 30.0 MB/s eta 0:00:01[K     |█████▋                          | 92 kB 30.9 MB/s eta 0:00:01[K     |██████▏                         | 102 kB 32.5 MB/s eta 0:00:01[K     |██████▉                         | 112 kB 32.5 MB/s eta 0:00:01[K     |███████▍                        | 122 kB 32.5 MB/s eta 0:00:01[K     |████████                        | 133 kB 32.5 MB/s eta 0:00:01[K     |████████▋                       | 143 kB 32.5 MB/s eta 0:

In [None]:
from torchmetrics import Accuracy

# Multiclass accuracy over the 7 emotion classes. Current torchmetrics
# releases require the `task` argument; the install cell does not pin a version.
accuracy = Accuracy(task="multiclass", num_classes=7)

prediction_list_valid = []
target_list_valid = []

model.eval()
for batch in valid_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit; move to CPU for scoring.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).cpu()
    targets = batch['labels'].cpu()

    # Iterating a 1-D tensor yields 0-d tensors, so both lists hold scalars.
    prediction_list_valid.extend(predictions)
    target_list_valid.extend(targets)
    #print(accuracy(predictions, targets)) # accuracy of each individual batch

# Score the whole validation split at once and display the result; the metric
# object was previously created but never applied.
valid_accuracy = accuracy(torch.stack(prediction_list_valid), torch.stack(target_list_valid))
valid_accuracy

In [None]:
# Predict a class id for every test example, in dataloader order.
prediction_list = []
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Highest-scoring class per example, collected as plain Python ints.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        prediction_list += predictions.cpu().tolist()

In [None]:
# Peek at the first five predicted class ids.
prediction_list[:5]

[2, 0, 0, 0, 1]

In [None]:
# Load the submission template and overwrite its 'Target' column with the
# predicted class ids (row order follows the test dataloader).
sample = pd.read_csv("sample_submission.csv").assign(Target=prediction_list)
sample

Unnamed: 0,ID,Target
0,TEST_0000,2
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,1
...,...,...
2605,TEST_2605,0
2606,TEST_2606,3
2607,TEST_2607,0
2608,TEST_2608,2


In [None]:
# Translate predicted class ids back into emotion names (the inverse of the
# mapping applied to the training labels).
_id_to_emotion = {
    0: 'neutral',
    1: 'joy',
    2: 'surprise',
    3: 'anger',
    4: 'sadness',
    5: 'disgust',
    6: 'fear',
}

# Map from `prediction_list` rather than from the column itself, so the cell
# can be re-run: remapping an already-translated column would produce NaN.
sample['Target'] = pd.Series(prediction_list, index=sample.index).map(_id_to_emotion)

In [None]:
# Count how many test rows were assigned to each emotion.
sample['Target'].value_counts()

neutral     1364
joy          422
anger        307
surprise     287
sadness      143
disgust       48
fear          39
Name: Target, dtype: int64

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv("emoberta_submit_7.csv", index = False)