<a href="https://colab.research.google.com/github/jerrydevcodex/KoBERTIntentClassifier/blob/main/%08intent_classifier_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m106.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [3]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from konlpy.tag import Okt
from google.colab import drive
# Mount Google Drive; the CSV files read by this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Initialize the Okt morphological analyzer (used by the preprocessing function for stemming)
okt = Okt()

In [5]:
# Data loading and preprocessing
def load_and_preprocess_data(filepath):
    """Read a CSV file and normalize its ``text`` column.

    Missing values become empty strings, every value is cast to ``str``,
    every character other than Hangul syllables, ASCII letters and
    whitespace is removed, and the remainder is rebuilt from Okt's stemmed
    morphemes joined by single spaces.

    Args:
        filepath: Path of the CSV file; it must contain a ``text`` column.

    Returns:
        The loaded DataFrame with ``text`` replaced by the normalized text.
    """
    disallowed = re.compile(r'[^가-힣a-zA-Z\s]')

    def normalize(sentence):
        # Drop digits/punctuation first, then stem what is left.
        letters_only = disallowed.sub('', sentence)
        return ' '.join(okt.morphs(letters_only, stem=True))

    frame = pd.read_csv(filepath)
    raw_text = frame['text'].fillna('').astype(str)
    frame['text'] = raw_text.apply(normalize)
    return frame

In [6]:
# Load the training CSV and apply the text preprocessing
TRAIN_CSV_PATH = '/content/drive/MyDrive/train_data.csv'
data = load_and_preprocess_data(TRAIN_CSV_PATH)

In [7]:
# Encode the intent labels as integer class ids
label_encoder = LabelEncoder().fit(data['intent'])
data['encoded_intent'] = label_encoder.transform(data['intent'])

In [8]:
# Split into training and validation sets (20% held out, fixed seed)
features, targets = data['text'], data['encoded_intent']
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Load the pretrained BERT tokenizer and a classification head sized to the label set
PRETRAINED_NAME = 'bert-base-multilingual-cased'
num_intents = len(label_encoder.classes_)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=num_intents)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Text tokenization helper
def encode_texts(texts, max_len=128):
    """Tokenize a collection of texts into fixed-length TensorFlow tensors.

    Args:
        texts: Iterable of strings; it is materialized into a list first.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer's output for the batch, requested as TensorFlow tensors.
    """
    tokenizer_options = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    sentences = list(texts)
    return tokenizer(sentences, **tokenizer_options)

In [11]:
# Build the tf.data datasets
def _make_dataset(encodings, labels):
    """Pair tokenizer encodings with their labels as a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_encodings, val_encodings = encode_texts(X_train), encode_texts(X_val)
train_dataset = _make_dataset(train_encodings, y_train)
val_dataset = _make_dataset(val_encodings, y_val)

In [12]:
# Optimizer configuration
optimizer = Adam(learning_rate=2e-5)

In [13]:
# Custom training loop
EPOCHS = 3
BATCH_SIZE = 16

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    # The shuffle buffer spans the whole training set so each epoch is a full
    # uniform reshuffle rather than a local shuffle inside a 100-element window.
    epoch_batches = train_dataset.shuffle(len(y_train)).batch(BATCH_SIZE)
    for step, (batch, labels) in enumerate(epoch_batches):
        with tf.GradientTape() as tape:
            logits = model(batch, training=True).logits
            per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True
            )
            # Reduce to a scalar inside the tape: differentiating the raw vector
            # would implicitly sum it, tying the gradient scale to the batch size
            # (including the smaller final batch).
            loss_value = tf.reduce_mean(per_example_loss)

        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        if step % 10 == 0:
            print(f"Step {step}: loss = {float(loss_value)}")

    # Score the held-out split after every epoch so over/under-fitting is visible.
    val_accuracy = SparseCategoricalAccuracy()
    for val_batch, val_labels in val_dataset.batch(BATCH_SIZE):
        val_logits = model(val_batch, training=False).logits
        val_accuracy.update_state(val_labels, val_logits)
    print(f"Validation accuracy: {float(val_accuracy.result()):.4f}")

Epoch 1/3
Step 0: loss = 3.101482629776001
Step 10: loss = 2.967036724090576
Step 20: loss = 2.9859557151794434
Step 30: loss = 3.0290231704711914
Step 40: loss = 2.5451059341430664
Step 50: loss = 2.4994442462921143
Step 60: loss = 2.750460624694824
Step 70: loss = 2.146371603012085
Step 80: loss = 1.911475419998169
Step 90: loss = 1.5493550300598145
Step 100: loss = 1.7236547470092773
Step 110: loss = 1.8610602617263794
Step 120: loss = 1.2341328859329224
Step 130: loss = 1.3657034635543823
Step 140: loss = 1.2977344989776611
Step 150: loss = 0.9650576114654541
Step 160: loss = 0.9897984266281128
Step 170: loss = 0.9407517910003662
Step 180: loss = 0.9261181950569153
Step 190: loss = 0.7549649477005005
Step 200: loss = 0.8220906257629395
Step 210: loss = 0.7918038368225098
Step 220: loss = 0.8271998167037964
Step 230: loss = 0.6666632890701294
Step 240: loss = 0.49123692512512207
Step 250: loss = 0.7720668911933899
Step 260: loss = 0.3212566077709198
Step 270: loss = 0.66092479228973

In [14]:
# Save the model, tokenizer and label classes
EXPORT_DIR = 'intent_classifier_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(EXPORT_DIR)
np.save('label_classes.npy', label_encoder.classes_)
print("Model, tokenizer, and label classes saved.")

Model, tokenizer, and label classes saved.


In [24]:
# Reload the label classes and rebuild the label encoder around them
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a
# label_classes.npy file that this notebook (or another trusted source) wrote.
label_classes = np.load('label_classes.npy', allow_pickle=True)
label_encoder = LabelEncoder()
label_encoder.classes_ = label_classes

In [25]:
# Reload the fine-tuned model and its tokenizer from the saved directory
from transformers import TFAutoModelForSequenceClassification, BertTokenizer
MODEL_DIR = 'intent_classifier_model'
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

Some layers from the model checkpoint at intent_classifier_model were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at intent_classifier_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [26]:
# Load the test data
test_data = pd.read_csv('/content/drive/MyDrive/test_data.csv')

In [27]:
# Prediction function
def predict_intent(texts, batch_size=32):
    """Predict an intent label for each input text.

    Inference runs in mini-batches so a large test set is never pushed
    through the model in a single forward pass.

    Args:
        texts: A single string or an iterable of strings to classify.
        batch_size: Number of texts sent to the model per forward pass.

    Returns:
        A list with one predicted label per input text, in input order.
        An empty input yields an empty list.
    """
    # list(str) would split a lone sentence into characters, so wrap it instead.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    predicted_labels = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]

        # Tokenize and encode this chunk (padded to its longest sequence).
        inputs = tokenizer(chunk, return_tensors='tf', padding=True, truncation=True, max_length=128)

        # Forward pass in inference mode.
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'], training=False)

        # Move the class ids to NumPy before indexing the NumPy label array.
        class_ids = tf.argmax(outputs.logits, axis=-1).numpy()
        predicted_labels.extend(label_classes[class_ids].tolist())

    return predicted_labels

In [28]:
# Run prediction
# The model was fine-tuned on text normalized by load_and_preprocess_data
# (NaN -> '', non-letter characters removed, Okt stemming), so the test sentences
# get the same normalization before classification. test_data itself is left
# untouched so the exported results still show the original sentences.
# Requires `re` and `okt` from the setup cells at the top of the notebook.
test_texts = (
    test_data['text']
    .fillna('')
    .astype(str)
    .apply(lambda x: re.sub(r'[^가-힣a-zA-Z\s]', '', x))
    .apply(lambda x: ' '.join(okt.morphs(x, stem=True)))
)
predicted_intents = predict_intent(test_texts.tolist())

In [29]:
# Build the result frame: the test data plus the prediction and a 0/1 match flag
result_df = test_data.assign(
    predicted_intent=predicted_intents,
    match=lambda frame: frame['intent'].eq(frame['predicted_intent']).astype(int),
)

In [30]:
# Save the results to a CSV file (without the index column)
result_df.to_csv('result.csv', index=False)

In [31]:
# Inspect the first few result rows
print(result_df.head())

  intent                    text predicted_intent  match
0   교통정보       지금 고속도로 상황이 어떤가요?             교통정보      1
1   교통정보  도로 막힘 없이 갈 수 있는 길 있나요?             교통정보      1
2   교통정보      실시간 교통 정보를 알고 싶어요.             교통정보      1
3   교통정보    오늘 도로 교통 상황 좀 알려주세요.             교통정보      1
4   교통정보     현재 고속도로가 막히는지 궁금해요.             교통정보      1
