In [1]:
!pip install transformers -q

[K     |████████████████████████████████| 5.5 MB 30.1 MB/s 
[K     |████████████████████████████████| 163 kB 66.4 MB/s 
[K     |████████████████████████████████| 7.6 MB 59.9 MB/s 
[?25h

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/Bert

Mounted at /content/drive
/content/drive/MyDrive/Bert


In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizerFast
from transformers import TFBertModel

import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
train = pd.read_csv('data/train.csv')

train['Labels'] = train['Target'].map({'neutral':0,
                                 'joy':1,
                                 'surprise':2,
                                 'anger':3,
                                 'sadness':4,
                                 'disgust':5,
                                 'fear':6})

train = train.drop(['Target'], axis = 1)
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Labels
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,0
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,0
2,TRAIN_0002,That I did. That I did.,Chandler,0,0
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,0
4,TRAIN_0004,My duties? All right.,Chandler,0,2


In [17]:
train['Labels'].unique()

array([0, 2, 6, 4, 1, 5, 3])

In [22]:
x = train['Utterance'].values
y = train['Labels'].values

In [23]:
x_train, x_valid, y_train, y_valid = train_test_split(
    x,y, test_size = 0.2, stratify = y, random_state = 42
)

In [76]:
test = pd.read_csv('data/test.csv')
test.head()
z = test['Utterance'].values

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [44]:
max_len = 128

def tokenize(data,max_len=max_len) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [78]:
train_input_ids, train_attention_masks = tokenize(x_train, max_len)
val_input_ids, val_attention_masks = tokenize(x_valid, max_len)
test_input_ids, test_attention_masks = tokenize(z, max_len)

In [None]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [69]:
def create_model(model, max_len=max_len):
    
    ##params###
    opt = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-7)
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    accuracy = tf.keras.metrics.SparseCategoricalAccuracy()


    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    
    embeddings = bert_model([input_ids,attention_masks])[1]
    
    output = tf.keras.layers.Dense(3, activation="softmax")(embeddings)
    
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks], outputs = output)
    
    model.compile(opt, loss=loss, metrics=accuracy)
    
    
    return model

In [55]:
model = create_model(bert_model, max_len)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_9[0][0]',                
                                thPoolingAndCrossAt               'input_10[0][0]']               
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

In [None]:
history_bert = model.fit([train_input_ids,train_attention_masks], y_train, validation_data=([val_input_ids, val_attention_masks], y_valid), epochs=4, batch_size=32)

In [None]:
result_bert = model.predict([test_input_ids,test_attention_masks])

### 나중에 확인

In [None]:
tokenizer_emoberta = AutoTokenizer.from_pretrained("tae898/emoberta-large")

In [60]:
max_len = 128

def tokenize_emoberta(data,max_len=max_len) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer_emoberta.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [61]:
train_input_ids, train_attention_masks = tokenize_emoberta(x_train, max_len)
val_input_ids, val_attention_masks = tokenize_emoberta(x_valid, max_len)

In [62]:
emoberta_model = AutoModelForSequenceClassification.from_pretrained("tae898/emoberta-large")

All model checkpoint weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the model checkpoint at tae898/emoberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


In [72]:
def create_model(bert_model, max_len=max_len):
    
    opt = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-7)
    loss = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()

    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    output = bert_model([input_ids,attention_masks])
    output = output[1]
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(output)
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    model.compile(opt, loss=loss, metrics=accuracy)
    return model

In [73]:
model2 = create_model(emoberta_model, max_len)
model2.summary()

AttributeError: ignored

In [None]:
history_2 = model2.fit([train_input_ids,train_attention_masks], y_train, 
                      validation_data=([val_input_ids,val_attention_masks], y_valid), epochs=4, batch_size=30)