- python 3.9
- pip upgrade // python -m pip install --upgrade pip
- transformers
- sentencepiece
- tensorflow_addons

In [15]:
# ! python -m pip install --upgrade pip
# ! pip install transformers
# ! pip install sentencepiece
# ! pip install tensorflow_addons

In [16]:
from source.const import RANDOM_STATE,SEQUENCE_LEN

import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm

from transformers import *
import tensorflow_addons as tfa # for optimizer

In [17]:
# 개발 중에는 일부 데이터만으로..
train = pd.read_csv("./data/ratings_train.txt",sep='\t').sample(frac=0.1,random_state=RANDOM_STATE)
test = pd.read_csv("./data/ratings_test.txt",sep='\t').sample(frac=0.1,random_state=RANDOM_STATE)

print('shape of train : ',train.shape)
print('shape of test : ',test.shape)

shape of train :  (15000, 3)
shape of test :  (5000, 3)


In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') # get pre-trained model

loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at C:\Users\kdh/.cache\huggingface\transformers\eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at C:\Users\kdh/.cache\huggingface\transformers\f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at C:\Users\kdh/.cache\huggingface\transformers\6c4a5d81a58c9791cdf76a09bce1b5abfb9

In [19]:
# drop na
train.dropna(axis=0,how='any',inplace=True)
test.dropna(axis=0,how='any',inplace=True)

In [20]:
# huggingface

# convert train data into bert input
def get_bert_input(index,df): # need label
    tokenized = tokenizer.encode(df['document'].iloc[index], max_length=SEQUENCE_LEN, truncation=True, padding='max_length')
    
    num_zeros =  tokenized.count(0)
    
    segment = [0]*SEQUENCE_LEN
    mask = [1]*(SEQUENCE_LEN-num_zeros) + [0]*num_zeros # Non-zero with zero mask
    label = df['label'].iloc[index]
    
    return [tokenized,segment,mask,label]

BERT_input = pd.DataFrame(data = [get_bert_input(i,train) for i in tqdm(range(train.shape[0]))],columns=['token','segment','mask','label'])

100%|██████████| 14999/14999 [00:03<00:00, 4676.24it/s]


In [21]:
# convert test data into bert input
BERT_test_input = pd.DataFrame(data = [get_bert_input(i,test) for i in tqdm(range(test.shape[0]))],columns=['token','segment','mask','label'])

100%|██████████| 4999/4999 [00:01<00:00, 4790.84it/s]


<center><b>Data preparation</b></center>

In [30]:
X_train = BERT_input.loc[:,['token','segment','mask']]
X_train = [X_train['token'].to_numpy(),X_train['segment'].to_numpy(),X_train['mask'].to_numpy()]
y_train = [BERT_input.loc[:,['label']].to_numpy()]

X_test = BERT_test_input.loc[:,['token','segment','mask']]
X_test = [X_test['token'].to_numpy(),X_test['segment'].to_numpy(),X_test['mask'].to_numpy()]
y_test = [BERT_test_input.loc[:,['label']].to_numpy()]

<center><b>BERT Modeling</b></center>

In [23]:
# optimizer
opt = tfa.optimizers.RectifiedAdam(learning_rate=1.0e-5, weight_decay=0.0025, warmup_proportion=0.05)

In [24]:
# modeling
model = TFBertModel.from_pretrained('bert-base-multilingual-cased') # we gonna use pre-trained model

# indices must be integer
token_inputs = tf.keras.layers.Input((SEQUENCE_LEN,),dtype=tf.dtypes.int32, name='word')
mask_inputs = tf.keras.layers.Input((SEQUENCE_LEN,),dtype=tf.dtypes.int32, name='mask')
segment_inputs = tf.keras.layers.Input((SEQUENCE_LEN,),dtype=tf.dtypes.int32, name='segment')

bert_outputs = model([token_inputs, mask_inputs, segment_inputs])

bert_outputs = bert_outputs[1]
sentiment_first = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(bert_outputs)
sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)

sentiment_model.compile(optimizer=opt, loss='binary_crossentropy', metrics = ['accuracy'])
sentiment_model.summary()

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at C:\Users\kdh/.cache\huggingface\transformers\6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 word (InputLayer)              [(None, 128)]        0           []                               
                                                                                                  
 mask (InputLayer)              [(None, 128)]        0           []                               
                                                                                                  
 segment (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  177853440   ['word[0][0]',                   
                                thPoolingAndCrossAt               'mask[0][0]',               

In [29]:
# # training
# sentiment_model.fit(X_train, y_train, epochs=8, shuffle=True, batch_size=128, validation_data=(X_test, y_test))

# # save weight
# sentiment_model.save_weights('./model/huggingface_bert.h5')

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).