In [1]:
!pip install transformers -q

In [1]:
!pip install datasets -q

In [None]:
!pip install nltk -q

In [None]:
!pip install gensim -q

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
%cd drive/MyDrive/data

/content/drive/MyDrive/data


# 데이터 로드

In [6]:
import pandas as pd

train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise


### 전처리

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import corpora

In [None]:
input_data = train['Utterance']
input_data

0       also I was the point person on my company’s tr...
1                        You must’ve had your hands full.
2                                 That I did. That I did.
3           So let’s talk a little bit about your duties.
4                                  My duties?  All right.
                              ...                        
9984                                           You or me?
9985    I got it. Uh, Joey, women don't have Adam's ap...
9986                 You guys are messing with me, right?
9987                                                Yeah.
9988    That was a good one. For a second there, I was...
Name: Utterance, Length: 9989, dtype: object

In [None]:
# Keep only real string utterances (missing values are skipped) and turn
# underscores into spaces before tokenisation.
# NOTE: skipped rows mean doc_set is no longer index-aligned with `train`.
doc_set = [doc.replace("_", " ") for doc in input_data if isinstance(doc, str)]

doc_set[:5]

['also I was the point person on my company’s transition from the KL-5 to GR-6 system.',
 'You must’ve had your hands full.',
 'That I did. That I did.',
 'So let’s talk a little bit about your duties.',
 'My duties?  All right.']

In [None]:
import nltk
nltk.download('stopwords')

stopWords = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stemmer = PorterStemmer()

In [None]:
import nltk
nltk.download('punkt')

# Membership tests against a list scan every stop word for every token;
# a set gives the same answers with constant-time lookups.
stop_word_set = set(stopWords)

# One list of stemmed, stop-word-free, lower-cased tokens per utterance.
words = []
for doc in doc_set:
    tokens = word_tokenize(doc.lower())
    words.append([stemmer.stem(tok) for tok in tokens if tok not in stop_word_set])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
words[:5]

[['also',
  'point',
  'person',
  'compani',
  '’',
  'transit',
  'kl-5',
  'gr-6',
  'system',
  '.'],
 ['must', '’', 'hand', 'full', '.'],
 ['.', '.'],
 ['let', '’', 'talk', 'littl', 'bit', 'duti', '.'],
 ['duti', '?', 'right', '.']]

In [None]:
# Build the token -> id vocabulary, then encode every tokenised utterance
# as a bag of (token_id, count) pairs.
dictionary = corpora.Dictionary(words)
corpus = [dictionary.doc2bow(token_list) for token_list in words]
corpus[:5]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1)],
 [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(0, 2)],
 [(0, 1), (9, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(0, 1), (14, 1), (18, 1), (19, 1)]]

### 분할

In [7]:
# Integer class id for each emotion name found in the `Target` column.
emotion_to_id = {
    'neutral': 0,
    'joy': 1,
    'surprise': 2,
    'anger': 3,
    'sadness': 4,
    'disgust': 5,
    'fear': 6,
}
train['Labels'] = train['Target'].map(emotion_to_id)

In [8]:
# Keep only the model input and its label. Selecting the wanted columns
# (instead of dropping the others) yields the same frame on the first run and
# does not raise KeyError when the cell is executed again.
train = train[['Utterance', 'Labels']]

In [9]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 hold-out split with a fixed seed so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
t_data, v_data = train_test_split(train, **split_options)

### pandas DataFrame → DatasetDict 변환

In [10]:
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

train_dataset = Dataset.from_pandas(t_data)
val_dataset = Dataset.from_pandas(v_data)


def _text_and_label_only(split):
    """Rebuild a split that carries just the `Utterance` and `Labels` columns."""
    return Dataset.from_dict({'Utterance': split['Utterance'],
                              'Labels': split['Labels']})


# Named splits consumed by the tokenisation and training cells below.
raw_data = DatasetDict({
    'train': _text_and_label_only(train_dataset),
    'val': _text_and_label_only(val_dataset),
})

In [11]:
raw_data

DatasetDict({
    train: Dataset({
        features: ['Utterance', 'Labels'],
        num_rows: 7991
    })
    val: Dataset({
        features: ['Utterance', 'Labels'],
        num_rows: 1998
    })
})

### 모델 로드

In [12]:
from transformers import AutoTokenizer

In [13]:
# Hugging Face Hub id of the checkpoint used for both tokenizer and model.
model_name = "tae898/emoberta-large"

# NOTE(review): `do_lower_case` is presumably ignored by this checkpoint's
# byte-level BPE tokenizer — confirm. The test-time cell further down reloads
# the tokenizer without this flag.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case = True)
 
def tokenize_function(examples):
    """Tokenise a batch of utterances to fixed-length model inputs.

    Args:
        examples: mapping with an ``"Utterance"`` entry holding the text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.

    A later cell defines ``tokenize_function`` again with ``max_length=128``;
    the same cap is applied here so both definitions agree instead of this one
    padding to the tokenizer's own maximum length.
    """
    return tokenizer(
        examples["Utterance"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

In [14]:
import os
import torch
import numpy as np
import random

def seed_everything(seed:int = 1004):
    """Seed every random number generator used in this notebook.

    Args:
        seed: value applied to Python's ``random``, NumPy, ``PYTHONHASHSEED``
            and PyTorch (CPU and all CUDA devices).

    Also switches cuDNN to deterministic mode. ``benchmark`` is turned OFF:
    with it enabled cuDNN may pick different kernels between runs, which
    undoes the reproducibility this function exists to provide.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = False  # type: ignore

seed_everything(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [15]:
def tokenize_function(example):
    """Encode the ``'Utterance'`` text(s) as fixed-length model inputs.

    Sequences are padded to, and truncated at, 128 tokens. Relies on the
    module-level ``tokenizer``.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(example['Utterance'], **encoding_options)

In [16]:
preprocessed_data = raw_data.map(tokenize_function, batched = True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [17]:
preprocessed_data = preprocessed_data.remove_columns(["Utterance"])
preprocessed_data = preprocessed_data.rename_column("Labels", "labels")

In [18]:
preprocessed_data.set_format("torch")
preprocessed_data["train"].column_names

['labels', 'input_ids', 'attention_mask']

### 데이터 로더 구축

In [23]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader, RandomSampler

# `tokenized_train` is never defined in this notebook; the tokenised training
# split lives in `preprocessed_data["train"]`.
train_dataloader = DataLoader(
    preprocessed_data["train"],
    batch_size=32,
    # Visit every example once per epoch, in random order.
    sampler=RandomSampler(preprocessed_data["train"], replacement=False),
    collate_fn=data_collator,
    drop_last=False,
)

In [None]:
from torch.utils.data import SequentialSampler

# Validation loader. It gets its own name so it no longer overwrites
# `train_dataloader`, and it reads the tokenised validation split from
# `preprocessed_data["val"]` (`tokenized_eval` is never defined in this notebook).
eval_dataloader = DataLoader(
    preprocessed_data["val"],
    batch_size=32,
    # Fixed order keeps predictions aligned with the dataset rows.
    sampler=SequentialSampler(preprocessed_data["val"]),
    collate_fn=data_collator,
    drop_last=False,
)

### 모델 학습 (fine-tuning)

In [19]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Fetch the checkpoint's own configuration first, then build the
# sequence-classification model from that same checkpoint and configuration.
pretrained_model_config = AutoConfig.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, config=pretrained_model_config
)

In [20]:
# `transformers.AdamW` is deprecated (and missing from recent releases); the
# PyTorch implementation is the supported replacement.
from torch.optim import AdamW

# eps / weight_decay are pinned to the values the transformers optimizer used
# by default so the update rule stays the same.
# NOTE(review): this optimizer is not passed to the Trainer built below, which
# therefore creates its own — confirm whether that is intended.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [21]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir='./result',
    save_total_limit=5,
    # Optimisation schedule.
    learning_rate=2e-5,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint on the same 500-step cadence, then restore the
    # best checkpoint once training finishes.
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=500,
    load_best_model_at_end=True,
)

In [26]:
import numpy as np


def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy, so the cell no longer depends on
    ``datasets.load_metric`` (deprecated and absent from recent ``datasets``
    releases) or on downloading a metric script.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores, or a tuple whose first element does.

    Returns:
        ``{"accuracy": <float>}`` — the fraction of rows whose highest-scoring
        class equals the label.
    """
    predictions, labels = eval_pred
    # Some models emit extra outputs next to the logits; the scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=-1)
    return {"accuracy": float(np.mean(predicted_ids == np.asarray(labels)))}

# Wire the model, tokenised splits, collator and metric into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=preprocessed_data["train"],
    eval_dataset=preprocessed_data["val"],
    compute_metrics=compute_metrics,
)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
trainer.train()

### test

In [None]:
# Reload a tokenizer and a saved checkpoint for inference.
# NOTE(review): the prediction cell below calls `trainer.predict`, which uses
# the model object the Trainer was built with; rebinding `model` here does not
# change that — confirm this cell is still needed.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Rebinds the module-level `tokenizer` that `tokenize_function` reads; unlike
# the training-time tokenizer it is loaded without `do_lower_case`.
Tokenizer_NAME = "tae898/emoberta-large"
tokenizer = AutoTokenizer.from_pretrained(Tokenizer_NAME)

# NOTE(review): with 7991 training rows, batch size 16 and 7 epochs, a
# single-device run presumably stops at step 3500, so `checkpoint-4000` may
# not exist — verify the path against ./result.
MODEL_NAME = './result/checkpoint-4000'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# NOTE(review): `vocab_size` presumably excludes added tokens; confirm the
# resize is needed at all for an unchanged tokenizer.
model.resize_token_embeddings(tokenizer.vocab_size)
model.to(device)

print(tokenizer)

In [28]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,Why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"Oh. It's so Monica can follow. Of this way, if...",Rachell,0
2,TEST_0002,You know what?,Rachell,0
3,TEST_0003,"Come on, Lydia, you can do it.",Joeyy,1
4,TEST_0004,To push!,Joeyy,1
...,...,...,...,...
2605,TEST_2605,"Yeah, I mean, go Ross, no one will even notice...",Rachell,279
2606,TEST_2606,They don't listen to me?,Rossi,279
2607,TEST_2607,"Of course, they listen to you! Everyone listen...",Rachell,279
2608,TEST_2608,"Monica, do you really think I should try this ...",Rossi,279


In [29]:
# Keep only the text column. Selecting it explicitly (rather than dropping the
# others) gives the same frame on the first run and does not raise KeyError
# when the cell is executed again.
test = test[['Utterance']]

test_dataset = Dataset.from_pandas(test)

# Single-split DatasetDict holding just the utterances to classify.
test_data = DatasetDict({
    'test': Dataset.from_dict({'Utterance': test_dataset['Utterance']}),
})

In [30]:
test_preprocessed_data = test_data.map(tokenize_function, batched = True)
test_preprocessed_data.set_format("torch")

  0%|          | 0/3 [00:00<?, ?ba/s]

['Utterance', 'input_ids', 'attention_mask']

In [32]:
test_preprocessed_data = test_preprocessed_data.remove_columns(["Utterance"])
test_preprocessed_data["test"].column_names

['input_ids', 'attention_mask']

In [34]:
outputs = trainer.predict(test_preprocessed_data["test"])

***** Running Prediction *****
  Num examples = 2610
  Batch size = 4


In [35]:
outputs

PredictionOutput(predictions=array([[ 2.556825  , -1.5981467 ,  2.5536907 , ..., -0.13046078,
        -2.1479995 , -1.4384737 ],
       [ 5.902675  , -0.31581146, -1.2520833 , ..., -0.5592227 ,
        -2.508309  , -1.5447471 ],
       [ 2.4086266 , -0.75795346, -1.2532747 , ..., -1.3469623 ,
        -0.16228461, -0.93050176],
       ...,
       [ 4.593633  ,  1.6788275 , -0.48462075, ..., -1.2034016 ,
        -3.6891553 , -0.886418  ],
       [-0.1958743 , -0.5388358 ,  2.0178022 , ...,  0.03130305,
        -2.414132  ,  2.0354948 ],
       [ 4.3759227 ,  3.0463383 , -1.087309  , ..., -0.8029928 ,
        -1.6816545 , -2.0003784 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 20.3364, 'test_samples_per_second': 128.342, 'test_steps_per_second': 32.11})

In [36]:
y_pred = outputs.predictions.argmax(1)

array([0, 0, 0, ..., 0, 6, 0])

In [39]:
sample = pd.read_csv("sample_submission.csv")
sample['Target'] = y_pred

Unnamed: 0,ID,Target
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,1
...,...,...
2605,TEST_2605,0
2606,TEST_2606,3
2607,TEST_2607,0
2608,TEST_2608,6


In [40]:
# Inverse of the label encoding applied to the training targets.
id_to_emotion = {
    0: 'neutral',
    1: 'joy',
    2: 'surprise',
    3: 'anger',
    4: 'sadness',
    5: 'disgust',
    6: 'fear',
}

# Map from the integer predictions rather than from the column itself, so a
# second run of this cell does not turn the already-mapped names into NaN.
sample['Target'] = pd.Series(y_pred, index=sample.index).map(id_to_emotion)
sample.head()

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,joy


In [41]:
sample.to_csv("emoberta_submit_2.csv",index = False)