In [2]:
import numpy as np
import pandas as pd

import re
import os
import string
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from transformers import *
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

tf.random.set_seed(1234)
np.random.seed(1234)

tokenizer = BertTokenizer.from_pretrained('../../Tensorflow_NLP/chapter_7/bert-base-multilingual-cased')

def bert_tokenizer(sent, MAX_LEN):
    encoded_dict = tokenizer.encode_plus(
        text = sent,
        add_special_tokens = True,
        max_length = MAX_LEN,
        pad_to_max_length = True,
        return_attention_mask = True,
        truncation = True)
    
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']
    
    return input_id, attention_mask, token_type_id

train_data = pd.read_csv('train.csv', encoding='latin-1')
test_data = pd.read_csv('test.csv', encoding='latin-1')
train_data = train_data.dropna(how='any', axis=1)

train_data['text_len'] = train_data['text'].apply(lambda x:len(x.split(' ')))

# 전처리

# 1) url 제거 (https:///... www. ...)
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

# 2) 이모지, 이모티콘 제거
def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# 3) html 태그 제거 (<a>, <br> ...)
def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)

# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower() # 4) 소문자화, 문자열화
    text = re.sub('\[.*?\]', '', text) # 대괄호 있으면 아예 다 제거 
    
    # remove_url()
    text = re.sub(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 
        '', 
        text
    )
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    
    # 5) 구두점 제거 
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    
    # 6) 숫자, 숫자포함단어는 모두 제거
    text = re.sub('\w*\d\w*', '', text)
    
    text = remove_url(text)
    text = remove_emoji(text)
    text = remove_html(text)
    
    text = re.sub('\s+', ' ', text)
    text = text.strip()
    
    return text

# 7) 불용어 제거
stop_words = stopwords.words('english') # 179가지 nltk 영어 불용어
more_stopwords = ['u', 'im', 'c'] # 179 + 3(축약,변환형)
stop_words = stop_words + more_stopwords # 182개 불용어

# # nltk의 SnowballStemmer('english') --> stem()
# # nltk.PorterStemmer, nltk.LancasterStemmer 존재 
# # Stemmer: 접사 삭제, 어간부 추출 / Lemmatizer: 어간 표제형 복원 (시간 오래걸림)
# # nltk.WordNetLemmatizer() --> lemma()
# stemmer = nltk.SnowballStemmer("english")
# # 8) 어간 Stemming

def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords and Stemm all the words in the sentence
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)

    return text

test_data['text_clean'] = test_data['text'].apply(preprocess_data)
train_data['text_clean'] = train_data['text'].apply(preprocess_data)

print(train_data.shape)

(7613, 5)


In [6]:
from tqdm import tqdm

input_ids = []
attention_masks = []
token_type_ids = []
train_data_labels = []

for train_sent, train_label in \
tqdm(zip(train_data['text_clean'], train_data['target']),
    total = len(train_data)):
    try:
        input_id, attention_mask, token_type_id = \
        bert_tokenizer(train_sent, 23)
        
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        train_data_labels.append(train_label)
        
    except Exception as e:
        print(e)
        print(train_sent)
        pass
    
train_tweet_input_ids = np.array(input_ids, dtype=int)
train_tweet_attention_masks = np.array(attention_masks, dtype=int)
train_tweet_type_ids = np.array(token_type_ids, dtype=int)
train_tweet_inputs = (train_tweet_input_ids, train_tweet_attention_masks, train_tweet_type_ids)

train_data_labels = np.array(train_data_labels, dtype=np.int32)

input_id, attention_mask, token_type_id = [i[0] for i in train_tweet_inputs]
print(input_id, attention_mask, token_type_id, tokenizer.decode(input_id), sep='\n')

100%|████████████████████████████████████████████████████████████████████████████| 7613/7613 [00:01<00:00, 4727.00it/s]

[   101  37246  10107  27949  63406  11387  10512  10237  10142 106088
  19626    102      0      0      0      0      0      0      0      0
      0      0      0]
[1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[CLS] deeds reason earthquake may allah forgive us [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]





In [13]:
arr = np.array([len(tokenizer.encode(j)) for j in train_data['text_clean']])

arr.mean()

16.78654932352555

In [15]:
np.mean(arr)

16.78654932352555

In [16]:
np.median(arr)

17.0

In [17]:
np.percentile(arr, 75)

21.0

In [19]:
from tensorflow.keras.layers import Dense, Dropout

class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()
        
        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = Dense(num_class, 
                               kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                               name = 'classifier')
        
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        # Dropout(training=False) 파라미터의 의미?
        logits = self.classifier(pooled_output)
        return logits
    
cls_model = TFBertClassifier(model_name = "../../Tensorflow_NLP/chapter_7/bert-base-multilingual-cased",
                            dir_path = 'bert_ckpt',
                            # 이 경로에 해당 모델 cache 저장됨
                            num_class = 2)


Some weights of the model checkpoint at ../../Tensorflow_NLP/chapter_7/bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at ../../Tensorflow_NLP/chapter_7/bert-base-multilingual-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


In [25]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

optimizer = Adam(3e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
# 그냥 'binary_crossentropy' 와의 차이점? 
metric = SparseCategoricalAccuracy('accuracy')
# 그냥 "accuracy" 와의 차이점? 
cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [26]:
model_name = "tf2_bert_disaster_tweets_v4"

earlystop_callback = EarlyStopping(monitor='val_accuracy',
                                  min_delta=0.0001, patience=3)

checkpoint_path = os.path.join(model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))

cp_callback = ModelCheckpoint(checkpoint_path, monitor='val_accuracy',
                             verbose=1, save_best_only=True, save_weights_only=True)

history = cls_model.fit(train_tweet_inputs, train_data_labels,
                       epochs=15, batch_size=32,
                       validation_split = 0.2,
                       callbacks = [earlystop_callback, cp_callback])

print(history.history)

tf2_bert_disaster_tweets_v4 -- Folder create complete 

Epoch 1/15
Epoch 1: val_accuracy improved from -inf to 0.81550, saving model to tf2_bert_disaster_tweets_v4\weights.h5
Epoch 2/15
Epoch 2: val_accuracy did not improve from 0.81550
Epoch 3/15
Epoch 3: val_accuracy did not improve from 0.81550
Epoch 4/15
Epoch 4: val_accuracy did not improve from 0.81550
{'loss': [0.5062862038612366, 0.4031009078025818, 0.31013819575309753, 0.25328850746154785], 'accuracy': [0.7663382887840271, 0.8287356495857239, 0.882101833820343, 0.9093596339225769], 'val_loss': [0.4178755283355713, 0.44452497363090515, 0.51612389087677, 0.5483962297439575], 'val_accuracy': [0.8154957294464111, 0.8030203580856323, 0.7951411604881287, 0.794484555721283]}


In [29]:
input_ids = []
attention_masks = []
token_type_ids = []
# train_data_labels = []

for test_sent in tqdm(test_data['text_clean'], total = len(test_data)):
    try:
        input_id, attention_mask, token_type_id =\
        bert_tokenizer(test_sent, 23)
        
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
#         train_data_labels.append(train_label)
        
    except Exception as e:
        print(e)
        print(train_sent)
        pass
    
test_tweet_input_ids = np.array(input_ids, dtype=int)
test_tweet_attention_masks = np.array(attention_masks, dtype=int)
test_tweet_type_ids = np.array(token_type_ids, dtype=int)
test_tweet_inputs = (test_tweet_input_ids, test_tweet_attention_masks, test_tweet_type_ids)

# test_data_labels = np.array(train_data_labels, dtype=np.int32)

print("# sents: {}".format(len(train_tweet_input_ids)))

y_pre = cls_model.predict(test_tweet_inputs)
sub = pd.DataFrame({'id':test_data['id'].values.tolist(), 
                    'target': [i.argmax() for i in y_pre]})
sub.to_csv("submission_5_bert_2.csv", index=False)

100%|████████████████████████████████████████████████████████████████████████████| 3263/3263 [00:00<00:00, 3832.65it/s]


# sents: 7613


  0.79272 --> SnowballStemmer 스테밍 한 버트 버전보다 좀더 높음 