In [None]:
!pip install transformers -q

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd drive/MyDrive/data

Mounted at /content/drive
/content/drive/MyDrive/data


In [7]:
import pandas as pd
train = pd.read_csv("train.csv")

In [3]:
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise


### 불용어 정의

In [57]:
!pip install nltk -q

In [60]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [62]:
stop_words = set(stopwords.words("english"))

# 대화별 문장

In [None]:
t = []
i = 0

for i in range(len(train)):
  t.append([train.loc[train['Dialogue_ID'] == i]['Utterance']])

print(t)

In [19]:
t[0]

[0     also I was the point person on my company’s tr...
 1                      You must’ve had your hands full.
 2                               That I did. That I did.
 3         So let’s talk a little bit about your duties.
 4                                My duties?  All right.
 5     Now you’ll be heading a whole division, so you...
 6                                                I see.
 7     But there’ll be perhaps 30 people under you so...
 8                                         Good to know.
 9                                 We can go into detail
 10                               No don’t I beg of you!
 11    All right then, we’ll have a definite answer f...
 12                                             Really?!
 13                           Absolutely.  You can relax
 Name: Utterance, dtype: object]

# 화자

In [43]:
speaker = train['Speaker'].unique().tolist()

In [None]:
s = []
i = 0

for i in range(len(speaker)):
  s.append([train.loc[train['Speaker'] == speaker[i]]['Utterance']])

print(s)

In [56]:
s[0]

[0       also I was the point person on my company’s tr...
 2                                 That I did. That I did.
 4                                  My duties?  All right.
 6                                                  I see.
 8                                           Good to know.
                               ...                        
 9971                                                 Hey.
 9974                                 I broke up with her.
 9976    They were huge. When she sneezed, bats flew ou...
 9978    I'm tellin' you, she leaned back; I could see ...
 9984                                           You or me?
 Name: Utterance, Length: 1283, dtype: object]

# fff

In [64]:
!pip install datasets -q

[K     |████████████████████████████████| 451 kB 5.2 MB/s 
[K     |████████████████████████████████| 115 kB 51.8 MB/s 
[K     |████████████████████████████████| 182 kB 56.6 MB/s 
[K     |████████████████████████████████| 212 kB 46.0 MB/s 
[K     |████████████████████████████████| 127 kB 53.9 MB/s 
[?25h

In [63]:
train['label'] = train['Target'].map({'neutral':0,
                                 'joy':1,
                                 'surprise':2,
                                 'anger':3,
                                 'sadness':4,
                                 'disgust':5,
                                 'fear':6})

In [80]:
num_labels = len(train['label'].unique())

In [65]:
from sklearn.model_selection import train_test_split

t_data, v_data = train_test_split(train, test_size = 0.2, random_state = 42,
                                  shuffle = True)

In [66]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(t_data)
val_dataset = Dataset.from_pandas(v_data)

from datasets.dataset_dict import DatasetDict

train_data = {'train':Dataset.from_dict({'label':t_data['label'], 'text' : t_data['Utterance']})}
val_data = {'val':Dataset.from_dict({'label':v_data['label'], 'text' : v_data['Utterance']})}

train_data['train'][100]

{'label': 1,
 'text': "See? Ben doesn't think you're a loser, he thinks you're a cowboy! Now that's something."}

In [75]:
from transformers import AutoTokenizer

model_name = "tae898/emoberta-large"
tokenizer = AutoTokenizer.from_pretrained(
    model_name, padding = 128, truncation = True)

train_word_tokens = tokenizer(train_data['train']['text'])

train_result = []
for word in train_word_tokens: 
    if word not in stop_words: 
        train_result.append(word)

In [77]:
val_result = []

val_word_tokens = tokenizer(val_data['val']['text'])

for word in val_word_tokens: 
    if word not in stop_words: 
       val_result.append(word)

In [81]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [89]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps = 500,
    load_best_model_at_end = True,
    learning_rate = 2e-5
)

In [85]:
!pip install evaluate -q

[?25l[K     |████▌                           | 10 kB 10.9 MB/s eta 0:00:01[K     |█████████                       | 20 kB 8.5 MB/s eta 0:00:01[K     |█████████████▌                  | 30 kB 11.4 MB/s eta 0:00:01[K     |██████████████████              | 40 kB 4.8 MB/s eta 0:00:01[K     |██████████████████████▌         | 51 kB 4.0 MB/s eta 0:00:01[K     |███████████████████████████     | 61 kB 4.7 MB/s eta 0:00:01[K     |███████████████████████████████▌| 71 kB 5.3 MB/s eta 0:00:01[K     |████████████████████████████████| 72 kB 427 kB/s 
[?25h

In [87]:
import numpy as np
import evaluate

metric = evaluate.load("f1", average = "macro")

In [88]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [90]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_result,
    eval_dataset=val_result,
    compute_metrics=compute_metrics,
)

In [91]:
trainer.train()

***** Running training *****
  Num examples = 2
  Num Epochs = 7
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7
  Number of trainable parameters = 355366919


TypeError: ignored

### tf

In [92]:
train_data

{'train': Dataset({
     features: ['label', 'text'],
     num_rows: 7991
 })}

In [93]:
val_data

{'val': Dataset({
     features: ['label', 'text'],
     num_rows: 1998
 })}

In [96]:
train_tokenized = tokenizer(train_data["train"]["text"], return_tensors = "np", padding = True)
train_labels = np.array(train_data["train"]["label"])

In [98]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Load and compile our model
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(3e-5))

model.fit(train_tokenized, train_labels)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

loading weights file tf_model.h5 from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tf_model.h5
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


ValueError: ignored