In [1]:
!pip install transformers -q

[K     |████████████████████████████████| 5.5 MB 29.0 MB/s 
[K     |████████████████████████████████| 7.6 MB 82.1 MB/s 
[K     |████████████████████████████████| 182 kB 87.4 MB/s 
[?25h

In [2]:
!pip install datasets -q

[K     |████████████████████████████████| 451 kB 26.1 MB/s 
[K     |████████████████████████████████| 115 kB 85.0 MB/s 
[K     |████████████████████████████████| 212 kB 88.4 MB/s 
[K     |████████████████████████████████| 127 kB 12.8 MB/s 
[?25h

In [None]:
!pip install nltk -q

In [None]:
!pip install gensim -q

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd drive/MyDrive/data

/content/drive/MyDrive/data


# 데이터 로드

In [7]:
import pandas as pd

train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise


### 전처리

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import corpora

In [None]:
input_data = train['Utterance']
input_data

0       also I was the point person on my company’s tr...
1                        You must’ve had your hands full.
2                                 That I did. That I did.
3           So let’s talk a little bit about your duties.
4                                  My duties?  All right.
                              ...                        
9984                                           You or me?
9985    I got it. Uh, Joey, women don't have Adam's ap...
9986                 You guys are messing with me, right?
9987                                                Yeah.
9988    That was a good one. For a second there, I was...
Name: Utterance, Length: 9989, dtype: object

In [None]:
doc_set = []

for doc in input_data:
  if type(doc) != float :
    doc_set.append(doc.replace("_"," "))

doc_set[:5]

['also I was the point person on my company’s transition from the KL-5 to GR-6 system.',
 'You must’ve had your hands full.',
 'That I did. That I did.',
 'So let’s talk a little bit about your duties.',
 'My duties?  All right.']

In [None]:
import nltk
nltk.download('stopwords')

stopWords = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stemmer = PorterStemmer()

In [None]:
import nltk
nltk.download('punkt')

words = []

for doc in doc_set:
  tokenizedWords = word_tokenize(doc.lower())
  stoppedWords = [v for v in tokenizedWords if v not in stopWords]
  stemmedWords = [stemmer.stem(v) for v in stoppedWords]
  words.append(stemmedWords)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
words[:5]

[['also',
  'point',
  'person',
  'compani',
  '’',
  'transit',
  'kl-5',
  'gr-6',
  'system',
  '.'],
 ['must', '’', 'hand', 'full', '.'],
 ['.', '.'],
 ['let', '’', 'talk', 'littl', 'bit', 'duti', '.'],
 ['duti', '?', 'right', '.']]

In [None]:
dictionary = corpora.Dictionary(words)
corpus = [dictionary.doc2bow(word) for word in words]
corpus[:5]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1)],
 [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(0, 2)],
 [(0, 1), (9, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(0, 1), (14, 1), (18, 1), (19, 1)]]

### 분할

In [8]:
train['labels'] = train['Target'].map({'neutral':0,
                                 'joy':1,
                                 'surprise':2,
                                 'anger':3,
                                 'sadness':4,
                                 'disgust':5,
                                 'fear':6})

In [9]:
train = train.drop(columns = ['ID', 'Speaker', 'Dialogue_ID','Target'])

In [10]:
from sklearn.model_selection import train_test_split

t_data, v_data = train_test_split(train, test_size = 0.2, random_state = 42,
                                  shuffle = True)

### pandas to dictdataset

In [21]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(t_data)
val_dataset = Dataset.from_pandas(v_data)

from datasets.dataset_dict import DatasetDict

raw_data = {
     'train':Dataset.from_dict({'Utterance':train_dataset['Utterance'], 'labels' : train_dataset['labels']}),
     'val':Dataset.from_dict({'Utterance':val_dataset['Utterance'], 'labels' : val_dataset['labels']})
     }

raw_data = DatasetDict(raw_data)

In [22]:
raw_data

DatasetDict({
    train: Dataset({
        features: ['Utterance', 'labels'],
        num_rows: 7991
    })
    val: Dataset({
        features: ['Utterance', 'labels'],
        num_rows: 1998
    })
})

### 모델 로드

In [14]:
from transformers import AutoTokenizer

In [15]:
model_name = "tae898/emoberta-large"

tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case = True)
 
def tokenize_function(examples):
    return tokenizer(examples["Utterance"], padding="max_length", truncation=True)

Downloading:   0%|          | 0.00/408 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [16]:
import os
import torch
import numpy as np
import random

def seed_everything(seed:int = 1004):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [17]:
def tokenize_function(example):
	return tokenizer(example['Utterance'],  
                    padding='max_length', 
                    truncation=True, 
                    max_length=128)

In [23]:
preprocessed_data = raw_data.map(tokenize_function, batched = True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [24]:
preprocessed_data = preprocessed_data.remove_columns(["Utterance"])

In [25]:
preprocessed_data.set_format("torch")
preprocessed_data["train"].column_names

['labels', 'input_ids', 'attention_mask']

### 데이터 로더 구축

In [27]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [36]:
from torch.utils.data import DataLoader, RandomSampler

train_dataloader = DataLoader(
    preprocessed_data["train"],
    batch_size=32,
    sampler = RandomSampler(preprocessed_data["train"], replacement = False),
    collate_fn=data_collator,
    drop_last = False
)

In [41]:
from torch.utils.data import SequentialSampler

val_dataloader = DataLoader(
    preprocessed_data["val"],
    batch_size=32,
    sampler = SequentialSampler(preprocessed_data["val"]),
    collate_fn=data_collator,
    drop_last = False
)

### ㄱㄱ

In [32]:
from transformers import AutoConfig, AutoModelForSequenceClassification

pretrained_model_config = AutoConfig.from_pretrained(
    model_name
)

pretrained_model_config.hidden_dropout_prob = 0.2
pretrained_model_config.attention_probs_dropout_prob = 0.2

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config = pretrained_model_config,
)

In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)



In [34]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./result',
    num_train_epochs=7,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    save_total_limit=5,
    do_train=True,
    do_eval=True,
    save_steps=500,
    evaluation_strategy='steps',
    eval_steps = 500,
    load_best_model_at_end = True,
    learning_rate = 3e-5
)

In [45]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_data["train"],
    eval_dataset=preprocessed_data["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator = data_collator
)

In [46]:
trainer.train()

***** Running training *****
  Num examples = 7991
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1750
  Number of trainable parameters = 355366919
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,1.0491,0.939581,0.688188
1000,0.7996,0.988836,0.696697
1500,0.5926,1.047169,0.693694


***** Running Evaluation *****
  Num examples = 1998
  Batch size = 4
Saving model checkpoint to ./result/checkpoint-500
Configuration saved in ./result/checkpoint-500/config.json
Model weights saved in ./result/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./result/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./result/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [result/checkpoint-750] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1998
  Batch size = 4
Saving model checkpoint to ./result/checkpoint-1000
Configuration saved in ./result/checkpoint-1000/config.json
Model weights saved in ./result/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./result/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./result/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1998
  Batch size = 4
Saving model checkpoint to ./result/checkpoint-1

TrainOutput(global_step=1750, training_loss=0.7675420052664621, metrics={'train_runtime': 1224.1777, 'train_samples_per_second': 45.694, 'train_steps_per_second': 1.43, 'total_flos': 1.3032581335651584e+16, 'train_loss': 0.7675420052664621, 'epoch': 7.0})

### test

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Tokenizer_NAME = "tae898/emoberta-large"
tokenizer = AutoTokenizer.from_pretrained(Tokenizer_NAME)

MODEL_NAME = './result/checkpoint-4000'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(tokenizer.vocab_size)
model.to(device)

print(tokenizer)

In [47]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,Why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"Oh. It's so Monica can follow. Of this way, if...",Rachell,0
2,TEST_0002,You know what?,Rachell,0
3,TEST_0003,"Come on, Lydia, you can do it.",Joeyy,1
4,TEST_0004,To push!,Joeyy,1
...,...,...,...,...
2605,TEST_2605,"Yeah, I mean, go Ross, no one will even notice...",Rachell,279
2606,TEST_2606,They don't listen to me?,Rossi,279
2607,TEST_2607,"Of course, they listen to you! Everyone listen...",Rachell,279
2608,TEST_2608,"Monica, do you really think I should try this ...",Rossi,279


In [48]:
test= test.drop(columns = ['ID', 'Speaker', 'Dialogue_ID'])

test_dataset = Dataset.from_pandas(test)

test_data = {
     'test':Dataset.from_dict({'Utterance':test_dataset['Utterance']})
     }

test_data = DatasetDict(test_data)

In [49]:
test_preprocessed_data = test_data.map(tokenize_function, batched = True)
test_preprocessed_data.set_format("torch")

  0%|          | 0/3 [00:00<?, ?ba/s]

In [50]:
test_preprocessed_data = test_preprocessed_data.remove_columns(["Utterance"])
test_preprocessed_data["test"].column_names

['input_ids', 'attention_mask']

In [51]:
outputs = trainer.predict(test_preprocessed_data["test"])

***** Running Prediction *****
  Num examples = 2610
  Batch size = 4


In [52]:
outputs

PredictionOutput(predictions=array([[ 2.2689662 , -1.0385181 ,  1.6683528 , ..., -0.41413724,
        -1.6678011 , -2.0935316 ],
       [ 4.1162686 , -0.4175847 , -1.3725915 , ..., -0.38035923,
        -1.5095967 , -0.93313617],
       [ 3.2989528 , -0.7929618 , -1.4336239 , ...,  0.02662226,
        -0.7911409 , -1.4068867 ],
       ...,
       [ 2.7592726 ,  2.7362325 , -1.5906775 , ..., -0.79058605,
        -2.1038551 , -1.667241  ],
       [-0.29382962, -1.0274105 , -0.08833681, ...,  1.1835624 ,
        -1.0023769 ,  0.44821757],
       [ 3.1735482 ,  3.395937  , -1.2191862 , ..., -0.96608955,
        -1.3108206 , -2.0659006 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 19.3736, 'test_samples_per_second': 134.719, 'test_steps_per_second': 33.706})

In [53]:
y_pred = outputs.predictions.argmax(1)

In [54]:
sample = pd.read_csv("sample_submission.csv")
sample['Target'] = y_pred

In [55]:
sample['Target'] = sample['Target'].map({0:'neutral',
                                 1:'joy',
                                 2:'surprise',
                                 3:'anger',
                                 4:'sadness',
                                 5:'disgust',
                                 6:'fear'})
sample.head()

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,joy


In [56]:
sample.to_csv("emoberta_submit_3.csv",index = False)