In [1]:
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import transformers
transformers.__version__
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import TextClassificationPipeline

from sklearn.utils import class_weight
from transformers import TextClassificationPipeline
from sklearn.metrics import accuracy_score


# 데이터 로드 

In [4]:
# Both parquet files are narrowed to the same five columns right after reading.
_KEEP_COLUMNS = ['link', 'contents_clean', 'contents_nouns', 'y', 'y2']

dfp = pd.read_parquet('./df_minor_fill.pq')[_KEEP_COLUMNS]
dfqe = pd.read_parquet('./df_35_noun.pq')[_KEEP_COLUMNS]

In [8]:
# Model inputs: the cleaned article text, one string per row.
X = np.array(dfp['contents_clean'].tolist())

# Integer class ids for the 'y2' labels.
# The 'y2num' column is not part of the columns loaded from parquet; it is only
# added by the label-encoding cell. Fall back to deriving the ids here (sorted
# unique label name -> 0..n-1, the same rule that cell uses) so this cell also
# works on a fresh kernel executed top to bottom.
if 'y2num' in dfp.columns:
    y = np.array(dfp['y2num'].tolist())
else:
    _y2_to_id = {name: idx for idx, name in enumerate(sorted(dfp['y2'].unique().tolist()))}
    y = np.array(dfp['y2'].map(_y2_to_id).tolist())

# BERT fine-tuning

In [5]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    """Encode texts into fixed-length BERT inputs.

    Each example is encoded with ``tokenizer.encode`` (truncated to
    ``max_seq_len``) and then right-padded with ``tokenizer.pad_token_id``.

    Args:
        examples: sized iterable of input texts.
        labels: iterable of integer labels, paired with ``examples`` by position.
        max_seq_len: length of every produced sequence.
        tokenizer: object exposing ``encode(text, max_length=..., truncation=...)``
            and a ``pad_token_id`` attribute.

    Returns:
        ``((input_ids, attention_masks, token_type_ids), data_labels)`` where the
        first three are integer arrays of shape ``(len(examples), max_seq_len)``
        and ``data_labels`` is an ``int32`` array.

    Raises:
        AssertionError: if the tokenizer returns more than ``max_seq_len`` ids
            despite truncation being requested.
    """
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
    pad_id = tokenizer.pad_token_id

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        # Truncate instead of failing on long texts; padding is done by hand below
        # so the deprecated `pad_to_max_length` argument is no longer needed.
        input_id = list(tokenizer.encode(example, max_length=max_seq_len, truncation=True))
        real_len = len(input_id)
        assert real_len <= max_seq_len, "Error with input length {} vs {}".format(real_len, max_seq_len)

        pad_len = max_seq_len - real_len
        input_id = input_id + [pad_id] * pad_len
        # The mask is derived from the real token count, not from counting pad ids,
        # so a pad id occurring inside the text cannot shorten the attended span.
        attention_mask = [1] * real_len + [0] * pad_len
        # Single-segment input: every position belongs to segment 0.
        token_type_id = [0] * max_seq_len

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    # reshape keeps the (n, max_seq_len) layout even when there are no examples.
    input_ids = np.array(input_ids, dtype=int).reshape(-1, max_seq_len)
    attention_masks = np.array(attention_masks, dtype=int).reshape(-1, max_seq_len)
    token_type_ids = np.array(token_type_ids, dtype=int).reshape(-1, max_seq_len)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [6]:
# Load the tokenizer.
# `model_max_length` is the tokenizer option that bounds `truncation=True` at call
# time; the previous `max_seq_len=` / `truncation=` keyword arguments are not
# tokenizer constructor options and had no effect on encoding.
# 512 is the position-embedding limit of BERT-base checkpoints -- NOTE(review):
# confirm against the klue/bert-base config (`max_position_embeddings`).
max_seq_len = 512
tokenizer = BertTokenizer.from_pretrained('klue/bert-base', model_max_length=max_seq_len)

In [7]:
## Encode the 'y2' label names as integer ids
# Sorted unique names define the id order: position in the sorted list == id.
lbl_name = dfp['y2'].unique().tolist()
lbl_name.sort()
lbl_num = [*range(len(lbl_name))]

# Lookup tables in both directions.
lbl_name2num = {name: num for num, name in enumerate(lbl_name)}
lbl_num2name = dict(enumerate(lbl_name))

# Store the numeric id next to the original label on the frame.
dfp['y2num'] = dfp['y2'].apply(lbl_name2num.__getitem__)

In [9]:
# Train in one go on the full dataset (no validation split).

# 'balanced' weights so that infrequent classes are not drowned out by frequent ones.
classes = np.unique(y)
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y)
# Key every weight by its actual class id (compute_class_weight returns the
# weights in the order of `classes`) instead of by position, so a class id that
# is missing from `y` cannot silently shift weights onto the wrong labels.
class_weights_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

# Tokenize every document; padding=True pads to the longest sequence in the batch of texts.
X_train = tokenizer(X.tolist(), truncation=True, padding=True)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train),
    y
))

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_pt=True: build the TF model from the checkpoint's PyTorch weights.
model = TFBertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=dfp['y2num'].nunique(), from_pt=True)
# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# The dataset is already batched (16 per step), so `batch_size` must not be passed
# to fit(); the former `batch_size=32` contradicted the real batch size.
history = model.fit(
    train_dataset.shuffle(1000).batch(16),
    epochs=7,
    class_weight=class_weights_dict,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [10]:
# Save the trained model's weights in two formats: a TensorFlow checkpoint and an HDF5 file.
for _weights_path, _weights_format in (
    ('model_weights_real_v02', 'tf'),
    ('model_weights_real_v02.h5', 'h5'),
):
    model.save_weights(_weights_path, save_format=_weights_format)

# 사용법

In [12]:
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
from transformers import TextClassificationPipeline

# `model_max_length` is the tokenizer option that bounds truncation; the previous
# `max_seq_len=` / `truncation=` keyword arguments are not tokenizer constructor
# options and had no effect.
# 512 is the position-embedding limit of BERT-base checkpoints -- NOTE(review):
# confirm against the klue/bert-base config (`max_position_embeddings`).
max_seq_len = 512
tokenizer = BertTokenizer.from_pretrained('klue/bert-base', model_max_length=max_seq_len)

# Rebuild the architecture with the same number of labels used for training
# (requires `lbl_num2name` from the label-encoding cell), then restore the
# fine-tuned weights saved as a TensorFlow checkpoint.
model = TFBertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=len(lbl_num2name), from_pt=True)
model.load_weights('./model_weights_real_v02')

# return_all_scores=True: report a score for every label rather than only the best one.
# device=0 selects the first GPU -- NOTE(review): adjust on CPU-only machines.
text_classifier = TextClassificationPipeline(
    tokenizer=tokenizer,
    model=model,
    framework='tf',
    return_all_scores=True, device=0
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Run the classification pipeline on a short Korean sample sentence.
# The pipeline was built with return_all_scores=True, so `res` holds one
# {'label', 'score'} entry per class; labels are reported as generic 'LABEL_<id>'.
res = text_classifier('아버지가 방에 들어가신다')
# Bare last expression so the notebook displays the scores.
res

[{'label': 'LABEL_0', 'score': 0.03188885748386383},
 {'label': 'LABEL_1', 'score': 0.012798765674233437},
 {'label': 'LABEL_2', 'score': 0.011650760658085346},
 {'label': 'LABEL_3', 'score': 0.1136927604675293},
 {'label': 'LABEL_4', 'score': 0.019805721938610077},
 {'label': 'LABEL_5', 'score': 0.0225137360394001},
 {'label': 'LABEL_6', 'score': 0.01153398398309946},
 {'label': 'LABEL_7', 'score': 0.009569304995238781},
 {'label': 'LABEL_8', 'score': 0.010257290676236153},
 {'label': 'LABEL_9', 'score': 0.02755940891802311},
 {'label': 'LABEL_10', 'score': 0.08993496745824814},
 {'label': 'LABEL_11', 'score': 0.00900056678801775},
 {'label': 'LABEL_12', 'score': 0.005275185685604811},
 {'label': 'LABEL_13', 'score': 0.08223633468151093},
 {'label': 'LABEL_14', 'score': 0.03501689434051514},
 {'label': 'LABEL_15', 'score': 0.010377055034041405},
 {'label': 'LABEL_16', 'score': 0.00671127624809742},
 {'label': 'LABEL_17', 'score': 0.12466222792863846},
 {'label': 'LABEL_18', 'score': 0

