In [None]:
!pip install transformers -q

[K     |████████████████████████████████| 5.5 MB 4.9 MB/s 
[K     |████████████████████████████████| 7.6 MB 55.9 MB/s 
[K     |████████████████████████████████| 182 kB 85.4 MB/s 
[?25h

In [None]:
!pip install datasets -q

[K     |████████████████████████████████| 451 kB 5.0 MB/s 
[K     |████████████████████████████████| 115 kB 79.4 MB/s 
[K     |████████████████████████████████| 212 kB 56.5 MB/s 
[K     |████████████████████████████████| 127 kB 89.4 MB/s 
[?25h

In [None]:
# Mount Google Drive so the data files under it are reachable from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/data

/content/drive/MyDrive/data


# 데이터 로드

In [None]:
import pandas as pd

# Read the training split from the current working directory and preview its first rows.
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise


### 화자 발화 병합

In [None]:
# Prefix every utterance with its speaker ("<speaker> : <utterance>") so the
# model sees who is talking; the result overwrites the `Utterance` column.
speak = train['Speaker'].tolist()
text = train['Utterance'].tolist()

conv_text = [who + " : " + said for who, said in zip(speak, text)]

train['Utterance'] = conv_text

### 분할

In [None]:
# Encode the emotion names in `Target` as integer class ids (list position == id)
# and store them in a new `labels` column.
emotion_names = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']
train['labels'] = train['Target'].map(
    {emotion: class_id for class_id, emotion in enumerate(emotion_names)}
)

In [None]:
# Drop the identifier/metadata columns and the string target now that `labels` holds the class ids.
# Note: this rebinds `train`, so the cell is not meant to be run twice on the same frame.
train = train.drop(columns = ['ID', 'Speaker', 'Dialogue_ID','Target'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; shuffled with a fixed seed so the split is repeatable.
t_data, v_data = train_test_split(
    train,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

### pandas DataFrame → DatasetDict

In [None]:
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

# Convert each pandas split to a Dataset first ...
train_dataset = Dataset.from_pandas(t_data)
val_dataset = Dataset.from_pandas(v_data)

# ... then rebuild each split from only the two columns the model needs and
# bundle the splits under the keys 'train' and 'val'.
kept_columns = ('Utterance', 'labels')
raw_data = DatasetDict({
    split_name: Dataset.from_dict({column: split[column] for column in kept_columns})
    for split_name, split in (('train', train_dataset), ('val', val_dataset))
})

In [None]:
# Display the split names, feature names and row counts.
raw_data

DatasetDict({
    train: Dataset({
        features: ['Utterance', 'labels'],
        num_rows: 7991
    })
    val: Dataset({
        features: ['Utterance', 'labels'],
        num_rows: 1998
    })
})

### 모델 로드

In [None]:
from transformers import AutoTokenizer

In [None]:
model_name = "tae898/emoberta-large"

# NOTE(review): `do_lower_case` is kept exactly as in the original call; whether the
# tokenizer class loaded for this checkpoint honours it has not been verified — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case = True)

def tokenize_function(examples):
    """Tokenize the ``Utterance`` field of a batch of examples.

    Sequences are truncated and padded to 128 tokens, the same settings as the
    later definition of this function in the notebook, so the result does not
    depend on which of the two definitions was executed last.
    """
    return tokenizer(examples["Utterance"], padding="max_length", truncation=True, max_length=128)

Downloading:   0%|          | 0.00/408 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [None]:
import os
import torch
import numpy as np
import random

def seed_everything(seed:int = 1004):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and configures cuDNN for repeatable runs.

    Args:
        seed: Value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    # Also cover every visible GPU, not just the current one (ignored when CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic setting above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False  # type: ignore

# Fix all random seeds before any model or data-loader work happens.
seed_everything(42)

# Use the first GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [None]:
def tokenize_function(example):
    """Tokenize ``example['Utterance']``, truncating and padding each sequence to 128 tokens."""
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(example['Utterance'], **encode_options)

In [None]:
# Tokenize every split; batched=True hands `tokenize_function` a batch of rows per call.
preprocessed_data = raw_data.map(tokenize_function, batched = True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# The raw text is no longer needed once it has been tokenized.
preprocessed_data = preprocessed_data.remove_columns(["Utterance"])

In [None]:
# Return torch tensors when indexing the datasets, then list the remaining columns as a sanity check.
preprocessed_data.set_format("torch")
preprocessed_data["train"].column_names

['labels', 'input_ids', 'attention_mask']

### 데이터 로더 구축

In [None]:
from transformers import DataCollatorWithPadding

# Collator that batches tokenized examples, padding them with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader, RandomSampler

# Training batches of 32, drawn in random order without replacement; the last
# (possibly smaller) batch is kept.
train_split = preprocessed_data["train"]
train_dataloader = DataLoader(
    train_split,
    sampler=RandomSampler(train_split, replacement=False),
    batch_size=32,
    drop_last=False,
    collate_fn=data_collator,
)

In [None]:
from torch.utils.data import SequentialSampler

# Validation batches of 32, visited in dataset order; the last (possibly
# smaller) batch is kept.
val_split = preprocessed_data["val"]
val_dataloader = DataLoader(
    val_split,
    sampler=SequentialSampler(val_split),
    batch_size=32,
    drop_last=False,
    collate_fn=data_collator,
)

### 모델 학습 (fine-tuning)

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Start from the checkpoint's own configuration and raise both dropout
# probabilities to 0.2 before the weights are loaded.
pretrained_model_config = AutoConfig.from_pretrained(model_name)
for dropout_field in ("hidden_dropout_prob", "attention_probs_dropout_prob"):
    setattr(pretrained_model_config, dropout_field, 0.2)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, config=pretrained_model_config
)

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation,
# so take the class from torch instead (the name `AdamW` stays available).
from torch.optim import AdamW

# weight_decay/eps are spelled out to mirror what the deprecated class is
# believed to default to — TODO confirm against the installed transformers version.
# NOTE(review): this optimizer is not handed to the `Trainer` constructed later in
# the notebook, so training there runs with the `learning_rate` from
# `TrainingArguments` rather than the 2e-5 set here.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0, eps=1e-6)



In [None]:
from transformers import TrainingArguments, Trainer

# Seven epochs at lr 3e-5; evaluation and checkpointing share a 500-step cadence,
# at most five checkpoints are kept, and the best one is reloaded when training ends.
training_args = TrainingArguments(
    output_dir='./result',
    do_train=True,
    do_eval=True,
    learning_rate=3e-5,
    num_train_epochs=7,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=500,
    save_total_limit=5,
    load_best_model_at_end=True,
)

In [None]:
from datasets import load_metric

# Load the accuracy metric object.
# NOTE(review): `load_metric` is reported as deprecated in later `datasets` releases —
# confirm the installed version still provides it.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores with classes along axis 1 (or a tuple whose first
            item is that array); ``labels`` holds the reference class ids.

    Returns:
        ``{"accuracy": <float>}`` — the fraction of rows whose highest-scoring
        class equals the reference label.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits; the scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=1)
    # Accuracy is a plain mean of matches, so it is computed locally instead of
    # going through an externally loaded metric object.
    return {"accuracy": float(np.mean(predicted_ids == np.asarray(labels)))}

from transformers import DataCollatorWithPadding

# Collator that pads each batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Everything the Trainer needs, gathered in one place: model, hyper-parameters,
# the tokenized train/validation splits, batching, and the accuracy callback.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=preprocessed_data["train"],
    eval_dataset=preprocessed_data["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

In [None]:
# Fine-tune the model; evaluation and checkpointing happen every 500 steps as configured in `training_args`.
trainer.train()

***** Running training *****
  Num examples = 7991
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1750
  Number of trainable parameters = 355366919
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,1.0541,0.983952,0.66967
1000,0.8594,1.034333,0.684685
1500,0.6293,1.111802,0.675676


***** Running Evaluation *****
  Num examples = 1998
  Batch size = 4
Saving model checkpoint to ./result/checkpoint-500
Configuration saved in ./result/checkpoint-500/config.json
Model weights saved in ./result/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./result/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./result/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1998
  Batch size = 4
Saving model checkpoint to ./result/checkpoint-1000
Configuration saved in ./result/checkpoint-1000/config.json
Model weights saved in ./result/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./result/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./result/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1998
  Batch size = 4
Saving model checkpoint to ./result/checkpoint-1500
Configuration saved in ./result/checkpoint-1500/config.json
Model weights s

TrainOutput(global_step=1750, training_loss=0.7979229954310826, metrics={'train_runtime': 1051.734, 'train_samples_per_second': 53.186, 'train_steps_per_second': 1.664, 'total_flos': 1.3032581335651584e+16, 'train_loss': 0.7979229954310826, 'epoch': 7.0})

### test

In [None]:
# Use the first GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Reload the tokenizer of the base checkpoint.
Tokenizer_NAME = "tae898/emoberta-large"
tokenizer = AutoTokenizer.from_pretrained(Tokenizer_NAME)

# Fine-tuned weights saved at step 1000 of the training run above.
MODEL_NAME = './result/checkpoint-1000'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Size the embedding matrix from len(tokenizer): `tokenizer.vocab_size` covers
# only the base vocabulary and would cut off any tokens added on top of it.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

print(tokenizer)

In [None]:
# Read the test split from the current working directory and display it.
test = pd.read_csv("test.csv")
test

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,Why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"Oh. It's so Monica can follow. Of this way, if...",Rachell,0
2,TEST_0002,You know what?,Rachell,0
3,TEST_0003,"Come on, Lydia, you can do it.",Joeyy,1
4,TEST_0004,To push!,Joeyy,1
...,...,...,...,...
2605,TEST_2605,"Yeah, I mean, go Ross, no one will even notice...",Rachell,279
2606,TEST_2606,They don't listen to me?,Rossi,279
2607,TEST_2607,"Of course, they listen to you! Everyone listen...",Rachell,279
2608,TEST_2608,"Monica, do you really think I should try this ...",Rossi,279


In [None]:
# Build the test inputs in the same "<speaker> : <utterance>" form that the
# training utterances were rewritten to, so the model is scored on the input
# format it was fine-tuned on.
# NOTE(review): the speaker names in the test file look spelled differently from the
# training ones (e.g. "Rachell", "Joeyy") — confirm that prefixing them is desired.
test_inputs = (test['Speaker'] + " : " + test['Utterance']).tolist()

# Only the text column is needed downstream. `test` itself is left untouched so
# this cell can be re-run without failing on already-dropped columns.
test_dataset = Dataset.from_dict({'Utterance': test_inputs})

test_data = DatasetDict({'test': test_dataset})

In [None]:
# Tokenize the test split with the same function used for training, and return torch tensors on access.
test_preprocessed_data = test_data.map(tokenize_function, batched = True)
test_preprocessed_data.set_format("torch")

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
# Remove the raw text column, then list what is left as a sanity check.
test_preprocessed_data = test_preprocessed_data.remove_columns(["Utterance"])
test_preprocessed_data["test"].column_names

['input_ids', 'attention_mask']

In [None]:
# `trainer` keeps a reference to the model object it was constructed with, so
# rebinding the global name `model` to a reloaded checkpoint does not change what
# `trainer.predict` runs. Wrap the currently bound `model` in its own Trainer so
# the predictions really come from that checkpoint.
test_trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
outputs = test_trainer.predict(test_preprocessed_data["test"])

***** Running Prediction *****
  Num examples = 2610
  Batch size = 4


In [None]:
# Inspect the raw prediction output (per-class scores plus run metrics).
outputs

PredictionOutput(predictions=array([[ 2.3586338 , -1.0591763 ,  2.3376906 , ..., -1.0344751 ,
        -1.9529845 , -1.8629124 ],
       [ 4.482628  , -0.09352561, -1.5792487 , ..., -0.7881235 ,
        -1.5916578 , -0.79508394],
       [ 2.765784  , -0.72937596, -2.3851838 , ...,  0.21447274,
         0.37984425, -0.98323023],
       ...,
       [ 2.1571233 ,  1.9439018 , -0.762742  , ..., -0.6793149 ,
        -2.2379944 , -1.1548946 ],
       [ 0.78102773, -0.5840291 ,  1.3907613 , ..., -0.83942884,
        -1.596452  ,  0.32665807],
       [ 3.5313938 ,  2.4741142 , -2.090935  , ..., -0.8596171 ,
        -0.85437953, -1.687212  ]], dtype=float32), label_ids=None, metrics={'test_runtime': 19.946, 'test_samples_per_second': 130.853, 'test_steps_per_second': 32.738})

In [None]:
# Pick, for every test row, the class index with the highest score.
y_pred = np.argmax(outputs.predictions, axis=1)

In [None]:
# Load the submission template and store the predicted class ids row by row
# (assumes the template rows are in the same order as test.csv — TODO confirm).
sample = pd.read_csv("sample_submission.csv")
sample['Target'] = y_pred

In [None]:
# Translate class ids back to emotion names. The names are derived from `y_pred`
# rather than from the current contents of `sample['Target']`, so re-running the
# cell gives the same result instead of mapping the already-decoded strings to NaN.
id2label = {0:'neutral',
            1:'joy',
            2:'surprise',
            3:'anger',
            4:'sadness',
            5:'disgust',
            6:'fear'}
sample['Target'] = [id2label[int(class_id)] for class_id in y_pred]
sample.head()

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,joy


In [None]:
# Check how the predictions are distributed over the emotion classes.
sample['Target'].value_counts()

neutral     1407
joy          473
anger        316
sadness      153
surprise     148
fear          87
disgust       26
Name: Target, dtype: int64

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv("emoberta_submit_3.csv",index = False)