In [1]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **훈련**

훈련 파일 friends_train, frineds_test, friends_dev 파일을 불러옴

In [4]:
import json

#txt2 = pd.read_csv('friends_train.json',sep='\t')
with open('/content/drive/MyDrive/dataset/friends_train.json') as f:
    txt2 = json.load(f)
with open('/content/drive/MyDrive/dataset/friends_test.json') as f:
    txt3 = json.load(f)
with open('/content/drive/MyDrive/dataset/friends_dev.json') as f:
    txt4 = json.load(f)

In [5]:
data = {
    'text' :[],
    'label': [],
}
#neutral joy sadness fear anger suprise disgust non
for n in txt2:
    for di in n:
        data['text'].append(di['utterance'])
        emotion = di['emotion']
        if emotion == 'neutral': emotion = 0
        elif emotion == 'joy': emotion = 1
        elif emotion == 'sadness': emotion = 2
        elif emotion == 'fear': emotion = 3
        elif emotion == 'anger': emotion = 4
        elif emotion == 'suprise': emotion = 5
        elif emotion == 'disgust': emotion = 6
        else: emotion = 7
        data['label'].append(emotion)

In [6]:
ratings_train = pd.DataFrame(data)

In [7]:
test_data = {
    'text' :[],
    'label': [],
}
#neutral joy sadness fear anger suprise disgust non
for n in txt3:
    for di in n:
        test_data['text'].append(di['utterance'])
        emotion = di['emotion']
        if emotion == 'neutral': emotion = 0
        elif emotion == 'joy': emotion = 1
        elif emotion == 'sadness': emotion = 2
        elif emotion == 'fear': emotion = 3
        elif emotion == 'anger': emotion = 4
        elif emotion == 'suprise': emotion = 5
        elif emotion == 'disgust': emotion = 6
        else: emotion = 7
        test_data['label'].append(emotion)

In [8]:
ratings_test = pd.DataFrame(test_data)

In [9]:
dev_data = {
    'text' :[],
    'label': [],
}
#neutral joy sadness fear anger suprise disgust non
for n in txt4:
    for di in n:
        dev_data['text'].append(di['utterance'])
        emotion = di['emotion']
        if emotion == 'neutral': emotion = 0
        elif emotion == 'joy': emotion = 1
        elif emotion == 'sadness': emotion = 2
        elif emotion == 'fear': emotion = 3
        elif emotion == 'anger': emotion = 4
        elif emotion == 'suprise': emotion = 5
        elif emotion == 'disgust': emotion = 6
        else: emotion = 7
        dev_data['label'].append(emotion)

In [10]:
ratings_dev = pd.DataFrame(dev_data)

In [11]:
ratings_train = pd.concat([ratings_train, ratings_dev])

In [12]:
ratings_train['text'].nunique(), ratings_train['label'].nunique()

(10294, 7)

In [13]:
ratings_train.drop_duplicates(subset=['text'], inplace=True) # document 열에서 중복인 내용이 있다면 중복 제거

In [14]:
ratings_test.drop_duplicates(subset=['text'], inplace=True)

In [15]:
print('중복제거 후 학습데이터 : '+str(len(ratings_train)))
print('중복제거 후 테스트데이터: '+str(len(ratings_test)))

중복제거 후 학습데이터 : 10294
중복제거 후 테스트데이터: 2505


In [16]:
sentences = ratings_train.text.values
labels = ratings_train.label.values

In [17]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 16.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 43.9MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 54.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=2f514c9e3be4a3f1c3

In [18]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [19]:
max_len = 0

for sent in sentences:
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  95


In [20]:
import torch

input_ids = []
attention_masks = []


for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = 64,        
                        pad_to_max_length = True,
                        return_attention_mask = True,  
                        return_tensors = 'pt', 
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print(sentences[0])
print(input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


also I was the point person on my companys transition from the KL-5 to GR-6 system.
tensor([  101,  2036,  1045,  2001,  1996,  2391,  2711,  2006,  2026,  2194,
         2015,  6653,  2013,  1996,  1047,  2140,  1011,  1019,  2000, 24665,
         1011,  1020,  2291,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [21]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

9,264 training samples
1,030 validation samples


In [22]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 256

train_dataloader = DataLoader(
            train_dataset, 
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size 
        )

In [23]:
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [24]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 8,
    output_attentions = False, 
    output_hidden_states = False
)
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [25]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8 
                )

In [26]:
from transformers import get_linear_schedule_with_warmup

epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [27]:
import numpy as np

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [28]:
import time
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [29]:
import random
import numpy as np

seed_val = 21

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []


total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()        
        #batch = tuple(b.to(device) for b in batch)
        #b_input_ids, b_input_mask, b_labels = batch

        outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        loss=outputs[0]

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    training_time = format_time(time.time() - t0)

    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    t0 = time.time()

    model.eval()


    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
            
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

  Average training loss: 1.60
  Training epcoh took: 0:01:25
  Accuracy: 0.47
  Average training loss: 1.18
  Training epcoh took: 0:01:31
  Accuracy: 0.50
  Average training loss: 1.05
  Training epcoh took: 0:01:35
  Accuracy: 0.59
  Average training loss: 0.98
  Training epcoh took: 0:01:35
  Accuracy: 0.56
  Average training loss: 0.95
  Training epcoh took: 0:01:35
  Accuracy: 0.60
Total training took 0:08:01 (h:mm:ss)


### **결과**

eng 변수에 en_data 파일 위치를 집어넣고 코드를 수행시켜 결과 출력

In [30]:
import pandas as pd

eng = pd.read_csv('/content/drive/MyDrive/dataset/en_data.csv')

In [31]:
eng.head()

Unnamed: 0,id,i_dialog,i_utterance,speaker,utterance
0,0,0,0,Phoebe,"Alright, whadyou do with him?"
1,1,0,1,Monica,Oh! You're awake!
2,2,0,2,Joey,Then you gotta come clean with Ma! This is not...
3,3,0,3,Mr. Tribbiani,"Yeah, but this is"
4,4,0,4,Joey,I don't wanna hear it! Now go to my room!


In [32]:
sentences = eng.utterance.values

In [33]:
def test_sentences(sentences):
    model.eval()
    inputs, masks = convert_input_data(sentences)

    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    with torch.no_grad():     
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    return logits

def convert_input_data(sentences):
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

def inttolabel(idx):
    return {0:'neutral',
             1:'joy', 
             2:'sadness',
             3:'fear',
             4:'anger',
             5:'surprise',
             6:'disgust',
             7:'non-neutral'}[idx]

In [34]:
en_data = [['Id', 'Predicted']]

for idx in range(len(eng['utterance'])):
  sen = eng['utterance'][idx]
  logit = test_sentences([sen])

  en_data.append([idx, inttolabel(np.argmax(logit))])

In [35]:
dataframe = pd.DataFrame(en_data)
dataframe.to_csv("/content/drive/MyDrive/dataset/en_result.csv", header=False, index=False)