## BERT

In [19]:
import os
import numpy as np
import pandas as pd

In [20]:
train_df_path = "./data/train.csv"
train_df = pd.read_csv(train_df_path)
train_df.head()

Unnamed: 0,idx,class,conversation
0,0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


In [21]:
train_df.groupby(by=['class']).count()


Unnamed: 0_level_0,idx,conversation
class,Unnamed: 1_level_1,Unnamed: 2_level_1
갈취 대화,981,981
기타 괴롭힘 대화,1094,1094
직장 내 괴롭힘 대화,979,979
협박 대화,896,896


In [22]:
label_encode = {
    "협박 대화" : 0,
    "갈취 대화" : 1,
    "직장 내 괴롭힘 대화" : 2,
    "기타 괴롭힘 대화" : 3,   
}
train_df['encoded_label'] = train_df['class'].map(label_encode)

In [23]:
train_texts = train_df['conversation'].to_list()
train_labels = train_df['encoded_label'].to_list()

In [25]:
from sklearn.model_selection import train_test_split

# Stratified Split Train and Validation data 
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=1004, stratify=train_labels)

In [26]:
MODEL_PATH = "klue/bert-base"

In [27]:
from transformers import BertTokenizerFast

# Load Tokenizer 
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)

# Tokenizing
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask']) 이런식으로 
train_encodings = tokenizer(train_texts, truncation=True, padding=True) 
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [28]:
print(dict(val_encodings).keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [29]:
import tensorflow as tf

# trainset-set
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

# validation-set
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

## FineTunnig


In [37]:
from transformers import TFBertForSequenceClassification

num_labels = len(label_encode)
print(num_labels)
# TODO : from_pt=False 혹은 없이 해보기
# from_pt – (optional) boolean, default False: Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument).
model = TFBertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=num_labels, from_pt=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

4


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
model.compute_loss

<bound method TFSequenceClassificationLoss.compute_loss of <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x7ff72a0f5e50>>

In [39]:
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint

callback_earlystopping = EarlyStopping(
    monitor="val_accuracy", 
    min_delta=0.001, # the threshold that triggers the termination (acc should at least improve 0.001)
    patience=2)

callback_learningrate_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=10,
    verbose=0,
    mode='auto',
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)


callback_modelcheckpoint = ModelCheckpoint(
    filepath = "BERT_BestModel.keras",
    monitor="vall_accuracy",
    save_best_only=True,
)

callback_list = [callback_earlystopping, callback_learningrate_scheduler, callback_modelcheckpoint]

model.fit(
    train_dataset.shuffle(1000).batch(8), epochs=50, batch_size=8,
    validation_data=val_dataset.shuffle(1000).batch(16),
    callbacks = callback_list
)

Epoch 1/50
Epoch 2/50
Epoch 3/50


<keras.callbacks.History at 0x7ff72a039eb0>

## 모델, 토크나이저 저장

In [40]:
MODEL_NAME = 'fine-tuned-klue-bert-base'
MODEL_SAVE_PATH = os.path.join("_model", MODEL_NAME) # change this to your preferred location

if os.path.exists(MODEL_SAVE_PATH):
    print(f"{MODEL_SAVE_PATH} -- Folder already exists \n")
else:
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
    print(f"{MODEL_SAVE_PATH} -- Folder create complete \n")

# save tokenizer, model
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

_model/fine-tuned-klue-bert-base -- Folder create complete 



('_model/fine-tuned-klue-bert-base/tokenizer_config.json',
 '_model/fine-tuned-klue-bert-base/special_tokens_map.json',
 '_model/fine-tuned-klue-bert-base/vocab.txt',
 '_model/fine-tuned-klue-bert-base/added_tokens.json',
 '_model/fine-tuned-klue-bert-base/tokenizer.json')

In [41]:
from transformers import TextClassificationPipeline

# Load Fine-tuning model
loaded_tokenizer = BertTokenizerFast.from_pretrained(MODEL_SAVE_PATH)
loaded_model = TFBertForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)

text_classifier = TextClassificationPipeline(
    tokenizer=loaded_tokenizer, 
    model=loaded_model, 
    framework='tf',
    return_all_scores=True
)

Some layers from the model checkpoint at _model/fine-tuned-klue-bert-base were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at _model/fine-tuned-klue-bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [42]:
test_df = pd.DataFrame(['file_name', 'class'])
test_df.head()

Unnamed: 0,0
0,file_name
1,class


In [43]:
import json

with open('./data/test.json', 'r') as f:
    test_json = json.load(f)
    
# test_json

In [44]:
from tqdm.auto import tqdm
answer_dict = {}
for file_name, text in tqdm(test_json.items()):
    preds_list = text_classifier(text['text'])[0]
    best_label = int(sorted(preds_list, key=lambda x : x['score'])[-1]['label'].split('_')[-1])
    answer_dict[file_name] = best_label
          
answer_dict

  0%|          | 0/400 [00:00<?, ?it/s]

{'t_000': 1,
 't_001': 2,
 't_002': 2,
 't_004': 3,
 't_005': 0,
 't_006': 0,
 't_007': 1,
 't_009': 1,
 't_010': 0,
 't_012': 2,
 't_014': 2,
 't_015': 0,
 't_018': 0,
 't_019': 3,
 't_020': 0,
 't_021': 3,
 't_022': 3,
 't_023': 1,
 't_024': 1,
 't_025': 2,
 't_028': 2,
 't_030': 1,
 't_031': 1,
 't_033': 1,
 't_034': 3,
 't_035': 3,
 't_036': 3,
 't_037': 1,
 't_038': 0,
 't_039': 1,
 't_040': 0,
 't_041': 1,
 't_042': 2,
 't_043': 1,
 't_045': 2,
 't_046': 0,
 't_047': 0,
 't_049': 0,
 't_050': 1,
 't_051': 3,
 't_052': 1,
 't_053': 2,
 't_054': 2,
 't_055': 3,
 't_056': 0,
 't_058': 3,
 't_059': 3,
 't_060': 1,
 't_061': 3,
 't_062': 2,
 't_063': 3,
 't_064': 2,
 't_065': 0,
 't_066': 3,
 't_067': 3,
 't_069': 1,
 't_071': 2,
 't_072': 3,
 't_073': 2,
 't_074': 0,
 't_076': 2,
 't_077': 1,
 't_078': 2,
 't_081': 0,
 't_083': 0,
 't_084': 0,
 't_085': 2,
 't_086': 1,
 't_088': 1,
 't_089': 2,
 't_091': 2,
 't_092': 0,
 't_093': 3,
 't_095': 2,
 't_097': 2,
 't_098': 1,
 't_099': 0,

In [47]:
for key, value in answer_dict.items():
    test_df = test_df.append({'file_name': key, 'class': value}, ignore_index=True)

In [48]:
test_df.to_csv("BERT_0.8937.csv")

In [49]:
test_df

Unnamed: 0,0,class,file_name
0,file_name,,
1,class,,
2,,1.0,t_000
3,,2.0,t_001
4,,2.0,t_002
...,...,...,...
797,,2.0,t_495
798,,2.0,t_496
799,,1.0,t_497
800,,2.0,t_498


In [51]:
test_df = test_df[2:]

In [53]:
test_df

Unnamed: 0,0,class,file_name
2,,1.0,t_000
3,,2.0,t_001
4,,2.0,t_002
5,,3.0,t_004
6,,0.0,t_005
...,...,...,...
797,,2.0,t_495
798,,2.0,t_496
799,,1.0,t_497
800,,2.0,t_498


In [54]:
test_df = test_df[['class', 'file_name']]

In [57]:
test_df['class'] = test_df['class'].astype('int32')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['class'] = test_df['class'].astype('int32')


In [58]:
test_df

Unnamed: 0,class,file_name
2,1,t_000
3,2,t_001
4,2,t_002
5,3,t_004
6,0,t_005
...,...,...
797,2,t_495
798,2,t_496
799,1,t_497
800,2,t_498


In [59]:
test_df.set_index('file_name', inplace=True)


In [60]:
test_df

Unnamed: 0_level_0,class
file_name,Unnamed: 1_level_1
t_000,1
t_001,2
t_002,2
t_004,3
t_005,0
...,...
t_495,2
t_496,2
t_497,1
t_498,2


In [61]:
test_df.to_csv('last.csv', index="file_name")