In [None]:
! pip install tensorflow_datasets

In [None]:
import os
import pandas as pd
from konlpy.tag import Okt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from konlpy.tag import Mecab
import numpy as np
from collections import Counter
import re

## 전처리된 데이터 로드

In [None]:
# Shared Okt analyzer, created lazily by morph_analyze on first use.
_okt_analyzer = None


def morph_analyze(text):
    """Split ``text`` into morphemes with the Okt analyzer.

    The analyzer object is created once and reused. The previous version
    constructed a new ``Okt()`` for every sentence, repeating the
    constructor work for each row of the dataset.

    Args:
        text: Sentence to analyze.

    Returns:
        The value returned by ``Okt.morphs(text)``.
    """
    global _okt_analyzer
    if _okt_analyzer is None:
        _okt_analyzer = Okt()
    return _okt_analyzer.morphs(text)

In [None]:
# Clean one conversation string (masking, symbol removal, lowercasing,
# morpheme split, stopword removal) and return the cleaned sentence.
def sentence_analysis(sentence):
    """Return ``sentence`` cleaned and re-joined as space-separated morphemes.

    Steps, in order:
      1. replace ``@...@`` spans and the literals ``name1`` / ``company-name``
         with the word ``pronoun``; replace newlines with a space;
      2. delete every character that is not Hangul, a digit, a Latin letter,
         ``?``, ``.``, ``!`` or whitespace;
      3. lowercase the text;
      4. split it with ``morph_analyze`` and drop the stopword particles.

    Args:
        sentence: Raw conversation text (str).

    Returns:
        The remaining tokens joined by single spaces.
    """
    stopwords = ['은','는','이','가','을','를','에','이가','이는']

    # Substitutions are applied sequentially, in this exact order.
    substitutions = (
        (r'@[^@]+@', 'pronoun'),
        (r'name1', 'pronoun'),
        (r'company-name', 'pronoun'),
        (r'\n', " "),
        (r"[^ㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z?.!\s]", ""),
    )
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)

    # Lowercase (affects Latin letters only), then split into morphemes.
    tokens = morph_analyze(sentence.lower())

    return ' '.join(token for token in tokens if token not in stopwords)


In [None]:
# Load the training CSV, drop missing / duplicated conversations and clean the text.
def load_data(path):
    """Read the training CSV at ``path`` and return a cleaned DataFrame.

    Rows whose ``conversation`` is missing are dropped first: the cleaner is
    regex based and raises on NaN, which previously aborted the whole load.
    Duplicated conversations are then removed (first occurrence kept) and
    every remaining conversation is passed through ``sentence_analysis``.
    The original index labels are preserved.

    Args:
        path: Path of a CSV file with a ``conversation`` column.

    Returns:
        A new DataFrame with the cleaned ``conversation`` column.
    """
    data = (
        pd.read_csv(path)
        .dropna(subset=['conversation'])
        .drop_duplicates(subset=['conversation'])
    )
    # assign() builds a new frame, so no in-place write on a filtered view.
    return data.assign(conversation=data['conversation'].map(sentence_analysis))

In [None]:
# Clean the conversation column of an already-loaded test frame.
def load_test(data):
    """Apply ``sentence_analysis`` to ``data['conversation']`` in place.

    Missing-value and duplicate removal are disabled in this function (that
    code was commented out), so no rows are dropped.

    Args:
        data: DataFrame with a ``conversation`` column; it is modified.

    Returns:
        The same DataFrame object that was passed in.
    """
    cleaned = data['conversation'].map(sentence_analysis)
    data['conversation'] = cleaned
    return data

In [None]:
# Load the training CSV and run it through load_data (de-duplication + text cleaning).
train_data_path ="data/train_with_normal_nikl.csv"
train_data = load_data(train_data_path)

In [None]:
# Preview the first five rows of the cleaned training frame.
train_data[:5]

In [None]:
# Show the row(s) whose `idx` column equals 806.
train_data[train_data['idx'] == 806]

In [None]:
def get_encoded_sentence(sentence, word_to_index):
    """Encode a whitespace-separated sentence as a list of vocabulary ids.

    The result starts with the ``<BOS>`` id; every word that is not a key of
    ``word_to_index`` is encoded with the ``<UNK>`` id.

    Args:
        sentence: Text whose tokens are separated by whitespace.
        word_to_index: Mapping from token to integer id; must contain
            ``<BOS>`` (and ``<UNK>`` when an unknown word occurs).

    Returns:
        List of ids, one per token, preceded by the ``<BOS>`` id.
    """
    encoded = [word_to_index['<BOS>']]
    for word in sentence.split():
        key = word if word in word_to_index else '<UNK>'
        encoded.append(word_to_index[key])
    return encoded

In [None]:
# Encode one cleaned conversation as a quick sanity check.
# NOTE(review): `words_dict` is only assigned further down (from makeVocab), so on a
# fresh top-to-bottom run this cell raises NameError; it belongs below that cell.
# NOTE(review): `[806]` selects by index label, whereas the previous cell filters on
# the `idx` column; confirm both refer to the same row.
sentence = train_data['conversation'][806]
print(sentence)
encoded = get_encoded_sentence(sentence,words_dict)
print(encoded)

In [None]:
import json

# Location of the raw test set (a JSON object keyed by file name).
json_file_path = '/aiffel/aiffel/dktc/data/test.json'

with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# One record per test file. The text is stored as-is: wrapping it in
# json.dumps() (as before) added surrounding quotes and turned every real
# newline into the two characters "\" + "n", which the cleaning step cannot
# recognise as a line break (it strips the backslash and leaves a stray "n").
# Assumes value['text'] is a plain string, as load_test_df treats it.
converted_data = [
    {"file_name": key, "conversation": value['text']}
    for key, value in data.items()
]
df = pd.DataFrame(converted_data)

# Show the result.
print(df)

output_file_path = '~/aiffel/dktc/test.csv'
df.to_csv(output_file_path, index=False, encoding='utf-8-sig')

In [None]:
# Read back the test CSV and clean its conversation column.
# `test_df` was previously used here without being defined anywhere above
# (and `test_data_path` was never read), so a fresh run failed with NameError.
test_data_path ="~/aiffel/dktc/test.csv"
test_df = pd.read_csv(test_data_path)
test_data = load_test(test_df)

In [None]:
# Display the test frame (load_test modified it in place).
test_df

## 단어사전

In [None]:
# Build the vocabulary: returns ({token: index}, {index: token}).
def makeVocab(train, vocab_size=10000):
    """Build token<->id dictionaries from ``train['conversation']``.

    Ids are contiguous: 0-3 are the reserved tokens ``<PAD>``, ``<BOS>``,
    ``<UNK>``, ``<UNUSED>``; the most frequent words follow from id 4.

    The previous version enumerated a list that already held four empty
    placeholder slots and then shifted every id by 3. That left ids 4-5
    unused and made the largest id greater than ``len(word_to_index)``, so
    sizing an embedding with ``len(word_to_index)`` was too small.

    Args:
        train: Mapping/DataFrame whose ``'conversation'`` entry iterates over
            space-separated sentences.
        vocab_size: Maximum number of ids, reserved tokens included.

    Returns:
        ``(word_to_index, index_to_word)``. ``word_to_index`` also maps the
        empty string to the ``<UNK>`` id so that callers which look up ``''``
        for unknown words keep working.
    """
    special_tokens = ["<PAD>", "<BOS>", "<UNK>", "<UNUSED>"]

    counter = Counter()
    for sentence in train['conversation']:
        counter.update(sentence.split(" "))
    # Empty strings (from repeated spaces) and the reserved names are not words.
    for reserved in special_tokens + ['']:
        counter.pop(reserved, None)

    frequent = [word for word, _ in counter.most_common(vocab_size - len(special_tokens))]
    word_to_index = {word: index for index, word in enumerate(special_tokens + frequent)}
    index_to_word = {index: word for word, index in word_to_index.items()}

    # Legacy alias, added after index_to_word so that id 2 still decodes to "<UNK>".
    word_to_index[''] = word_to_index["<UNK>"]
    return word_to_index, index_to_word

In [None]:
# Build the token->id and id->token dictionaries from the cleaned training text.
words_dict, idx_dict = makeVocab(train_data)

## 토큰 정수화

In [None]:
# Convert a list of tokens into a list of vocabulary ids.
def wordlist_to_indexlist(wordlist, word_to_index):
    """Map every token of ``wordlist`` to its id in ``word_to_index``.

    Unknown tokens are encoded with the ``<UNK>`` id, the same fallback that
    ``get_encoded_sentence`` uses. Previously they were looked up under the
    empty-string key, which is a placeholder entry rather than the reserved
    unknown token. Dictionaries that only provide ``''`` are still supported.

    Args:
        wordlist: Iterable of tokens.
        word_to_index: Mapping from token to integer id.

    Returns:
        List of integer ids, one per token.

    Raises:
        KeyError: If a token is unknown and the mapping has neither
            ``<UNK>`` nor ``''``.
    """
    def lookup(word):
        if word in word_to_index:
            return word_to_index[word]
        if '<UNK>' in word_to_index:
            return word_to_index['<UNK>']
        return word_to_index['']

    return [lookup(word) for word in wordlist]

In [None]:
# Split every cleaned training conversation on single spaces into a token list.
X_train = [sen.split(" ") for sen in train_data['conversation']]

In [None]:
# Replace every training token list by its list of vocabulary ids.
X_train = [wordlist_to_indexlist(wordlist, words_dict) for wordlist in X_train]

In [None]:
# Tokenise the cleaned test conversations and encode them with the training
# vocabulary. The encoding step used to be commented out, which left X_test
# as lists of strings that the later padding / prediction cells cannot use.
X_test = [
    wordlist_to_indexlist(sen.split(" "), words_dict)
    for sen in test_df['conversation']
]

In [None]:
# Wrap the per-conversation lists in a NumPy array with object dtype.
X_test = np.array(X_test,dtype = 'object')

## class 한글 옵션 숫자 옵션으로 전환

In [None]:
def changeClassInt(data):
    """Replace the Korean class names in ``data['class']`` by integer ids.

    The frame is modified in place; values that match none of the five known
    class names are left untouched.

    Args:
        data: DataFrame with a ``class`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    label_ids = {
        '협박 대화': 0,
        '갈취 대화': 1,
        '직장 내 괴롭힘 대화': 2,
        '기타 괴롭힘 대화': 3,
        '일반 대화': 4,
    }
    for label, label_id in label_ids.items():
        data.loc[data['class'] == label, 'class'] = label_id

    return data

In [None]:
# Convert the class names of the training frame to integer ids.
train_data = changeClassInt(train_data)

In [None]:
# Build the target data: the (now integer) class column.
tar_data = train_data['class']

In [None]:
! pip install Keras-Preprocessing

In [None]:
from keras_preprocessing.sequence import pad_sequences

total_data_text = list(X_train)+list(X_test)
# Number of tokens in every train and test conversation.
num_tokens = [len(tokens) for tokens in total_data_text]
num_tokens = np.array(num_tokens)
# Mean, maximum and standard deviation of the sequence length.
print('문장길이 평균 : ', np.mean(num_tokens))
print('문장길이 최대 : ', np.max(num_tokens))
print('문장길이 표준편차 : ', np.std(num_tokens))

# Use (mean + 2 * std) as the padded length.
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_len = int(max_tokens)
print('pad_sequences maxlen : ', max_len)
# The message is labelled "%", so scale the fraction to a percentage
# (the fraction itself, e.g. 0.95, used to be printed).
print(f'전체 문장의 {np.sum(num_tokens < max_tokens) / len(num_tokens) * 100}%가 maxlen 설정값 이내에 포함됩니다. ')

In [None]:
# Pad every encoded sequence at the front with the <PAD> id up to max_len.
X_train = np.array(X_train, dtype = 'object')
X_train = pad_sequences(X_train, value = words_dict['<PAD>'], padding='pre', maxlen=max_len)

# Same padding for the test sequences.
X_test = np.array(X_test, dtype = 'object')
X_test = pad_sequences(X_test, value = words_dict['<PAD>'], padding='pre', maxlen=max_len)

## 데이터셋 분리

In [None]:
# 데이터 분리 function
from sklearn.model_selection import train_test_split

def makeDataset(cov_data, tar_data):
    """Split features and targets into train and validation parts.

    20 % of the rows go to the validation part; the split is stratified on
    ``tar_data`` and uses the fixed random state 928.

    Args:
        cov_data: Feature rows.
        tar_data: Targets, also used as the stratification key.

    Returns:
        ``(X_train, X_val, y_train, y_val)``.
    """
    split = train_test_split(
        cov_data,
        tar_data,
        test_size = 0.2,
        random_state = 928,
        stratify = tar_data,
    )
    X_train, X_val, y_train, y_val = split
    return X_train, X_val, y_train, y_val

In [None]:
# Split the padded training sequences and their targets (X_train is rebound to the training part).
X_train, X_val, y_train, y_val = makeDataset(X_train, tar_data)

In [None]:
# Convert every split (features and targets) to float32 NumPy arrays.
X_train = np.asarray(X_train).astype(np.float32)
X_val = np.asarray(X_val).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)
y_val = np.asarray(y_val).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
# Number of training rows.
print(len(X_train))

# Baseline

- Bidirectional LSTM

In [None]:
# Class name -> integer id (same ids as assigned by changeClassInt).
class_map = {
    "협박 대화": 0,
    "갈취 대화": 1,
    "직장 내 괴롭힘 대화": 2,
    "기타 괴롭힘 대화": 3,
    "일반 대화": 4,
}
# Inverse mapping: integer id -> class name.
label_to_class = {v: k for k, v in class_map.items()}

## Training Model

In [None]:
import pandas as pd
from datetime import datetime, timezone, timedelta

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

KST = timezone(timedelta(hours=9))
NOW = datetime.now(KST).strftime('%Y%m%d%H%M%S') # YYYYMMDDHHmmss

# parameters
RANDOM_SEED = 42
TRAIN_CSV_DATA_PATH = "data/train_with_normal_nikl.csv"
TEST_JSON_DATA_PATH = "data/test.json"
SUBMISSION_CSV_DATA_PATH = "data/submission.csv"
EPOCHS = 50
MODEL_PATH = f"models/baseline_{NOW}.keras"

# The embedding needs one row per id, i.e. (largest id + 1). len(words_dict)
# only equals that when the ids are contiguous from 0, which is not
# guaranteed, so the size is derived from the ids themselves.
VOCAB_SIZE = max(words_dict.values()) + 1
SEQ_LEN = 100

# fix random seed
tf.random.set_seed(RANDOM_SEED)

# # Data preprocessing
# data = pd.read_csv("data/train_with_normal_nikl.csv")
# data["class_label"] = data["class"].apply(lambda x: class_map[x]) # str -> int

# # Tokenization
# tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<UNK>")
# tokenizer.fit_on_texts(data["conversation"])
# train_sequences = tokenizer.texts_to_sequences(data["conversation"])
# train_sequences = pad_sequences(train_sequences, padding='post', maxlen=SEQ_LEN)

# # Split the data
# X_train, X_val, y_train, y_val = train_test_split(train_sequences, 
#                                                   data["class_label"], 
#                                                   test_size=0.2,
#                                                   random_state=RANDOM_SEED, 
#                                                   stratify=data["class_label"])

def get_model(X, vocab_size, num_classes):
    """Build and compile the bidirectional-LSTM baseline classifier.

    Layers: Embedding(64) -> Bidirectional(LSTM(64)) -> Dense(64, relu)
    -> Dense(num_classes, softmax). Compiled with Adam, sparse categorical
    cross-entropy and the accuracy metric.

    Args:
        X: Array of input sequences; only ``X.shape[-1]`` is read, as the
            embedding's ``input_length``.
        vocab_size: ``input_dim`` of the embedding.
        num_classes: Number of output units.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=X.shape[-1]))
    model.add(layers.Bidirectional(layers.LSTM(64)))
    model.add(layers.Dense(64, activation="relu"))
    model.add(layers.Dense(num_classes, activation="softmax"))

    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

# Build the baseline model with one output unit per entry of class_map.
model = get_model(X_train, VOCAB_SIZE, num_classes=len(class_map))
# Write the model to MODEL_PATH only when val_loss improves.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(MODEL_PATH,
                                       monitor="val_loss",
                                       save_best_only=True)
]
# Train for EPOCHS epochs, validating on the held-out split.
history = model.fit(X_train, y_train, 
                    epochs=EPOCHS,
                    validation_data=(X_val, y_val),
                    callbacks=callbacks)

In [None]:
import matplotlib.pyplot as plt

# Loss and accuracy curves of the baseline model (train vs. validation).
history_dict = history.history

loss = history_dict['loss']
val_loss = history_dict['val_loss']
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']
epochs = range(1, len(loss) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axis, train curve, validation curve, train label, validation label, title, y label)
panels = [
    (ax1, loss, val_loss, 'train_loss', 'val_loss', 'Train and Validation Loss', 'Loss'),
    (ax2, accuracy, val_accuracy, 'train_accuracy', 'val_accuracy', 'Train and Validation Accuracy', 'Accuracy'),
]
for ax, train_curve, val_curve, train_label, val_label, title, ylabel in panels:
    ax.plot(epochs, train_curve, color='blue', label=train_label)
    ax.plot(epochs, val_curve, color='red', label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)
    ax.grid()
    ax.legend()

plt.show()

## Submission

In [None]:
import json

def load_test_df(path="data/test.json"):
    """Load the test JSON into a DataFrame.

    The file is opened explicitly as UTF-8, like the other JSON read in this
    notebook; without it the platform default encoding is used, which is not
    guaranteed to decode the Korean text.

    Args:
        path: JSON file mapping each file name to an object with a ``text``
            field. Defaults to the location used so far.

    Returns:
        DataFrame with the columns ``file_name`` and ``conversation``, one
        row per entry, in file order.
    """
    with open(path, encoding="utf-8") as file:
        test_json = json.load(file)

    return pd.DataFrame({
        "file_name": list(test_json),
        "conversation": [entry["text"] for entry in test_json.values()],
    })
    
# Rebind test_df to a freshly loaded (uncleaned) copy of the test set and display it.
test_df = load_test_df()
test_df

In [None]:
import numpy as np

# # Tokenization
# test_sequences = tokenizer.texts_to_sequences(test_df["conversation"])
# test_sequences = pad_sequences(test_sequences, padding="post", maxlen=SEQ_LEN)

# Prediction
# Reload the checkpoint written by the ModelCheckpoint callback.
model = tf.keras.models.load_model(MODEL_PATH)
predictions_prob = model.predict(X_test)
# Most probable class id per row.
predictions = np.argmax(predictions_prob, axis=1)
test_df["class"] = predictions
# Human-readable class name next to the id.
test_df["class_str"] = test_df["class"].apply(lambda x: label_to_class[x])
test_df

In [None]:
# Fill the submission template with the predicted class ids.
submission_df = pd.read_csv("~/aiffel/dktc/data/new_submission.csv")
# The merge on file_name yields class_x / class_y; class_x is dropped and class_y
# (the prediction) is renamed to `class`.
# NOTE(review): presumably the template already has a `class` column — confirm.
test_submission_df = (submission_df
                      .merge(test_df[["file_name", "class"]], on="file_name")
                      .drop(columns=["class_x"])
                      .rename(columns={"class_y": "class"}))
test_submission_df.to_csv(f"~/aiffel/dktc/baseline_{NOW}.csv", index=False)
# Read the written file back to display it.
pd.read_csv(f"~/aiffel/dktc/baseline_{NOW}.csv")

## RNN - LSTM

In [None]:
# Minimal preprocessing for this experiment: no morpheme analysis.
# Masks names, removes special characters and lowercases; returns the cleaned sentence.
def sentence_organizer(sentence):
    """Return ``sentence`` with light, regex-only cleaning applied.

    ``@...@`` spans and the literals ``name1`` / ``company-name`` become
    ``pronoun``, newlines become spaces, every character that is not Hangul,
    a digit, a Latin letter, ``?``, ``.``, ``!`` or whitespace is removed and
    the result is lowercased. No stopwords are removed.

    Args:
        sentence: Raw conversation text (str).

    Returns:
        The cleaned string.
    """
    # Applied sequentially, in this exact order.
    substitutions = (
        (r'@[^@]+@', 'pronoun'),
        (r'name1', 'pronoun'),
        (r'company-name', 'pronoun'),
        (r'\n', " "),
        (r"[^ㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z?.!\s]", ""),
    )
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)

    return sentence.lower()

In [None]:
# Load the training CSV, drop missing / duplicated conversations and clean the text.
def load_data(path):
    """Read the training CSV at ``path`` and return a lightly cleaned DataFrame.

    Rows whose ``conversation`` is missing are dropped first: the cleaner is
    regex based and raises on NaN, which previously aborted the whole load.
    Duplicated conversations are then removed (first occurrence kept) and
    every remaining conversation is passed through ``sentence_organizer``.
    The original index labels are preserved.

    Args:
        path: Path of a CSV file with a ``conversation`` column.

    Returns:
        A new DataFrame with the cleaned ``conversation`` column.
    """
    data = (
        pd.read_csv(path)
        .dropna(subset=['conversation'])
        .drop_duplicates(subset=['conversation'])
    )
    # assign() builds a new frame, so no in-place write on a filtered view.
    return data.assign(conversation=data['conversation'].map(sentence_organizer))

In [None]:
# Load data/train.csv through the load_data defined for this section.
train_data_path ="data/train.csv"
train = load_data(train_data_path)

In [None]:
# Class name -> integer id (re-declares the mapping used in the baseline section).
class_map = {
    "협박 대화": 0,
    "갈취 대화": 1,
    "직장 내 괴롭힘 대화": 2,
    "기타 괴롭힘 대화": 3,
    "일반 대화": 4,
}
# Inverse mapping: integer id -> class name.
label_to_class = {v: k for k, v in class_map.items()}

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
import matplotlib.pyplot as plt

# Fix the random seeds.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Inputs (cleaned text) and integer labels.
texts = train['conversation']
train = changeClassInt(train)
labels = train['class']
labels = np.asarray(labels).astype('int')
texts = np.asarray(texts).astype('str')

# Train / validation split (stratified on the label).
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=RANDOM_SEED, stratify=labels)

# Fit the tokenizer on the training split only. It used to be fitted on all
# texts before the split, so the vocabulary already contained words that
# occur only in the validation set and the validation scores were optimistic.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

# Text -> sequences of word ids.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

# Pad (at the end) to one common length.
max_length = 535
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_val_padded = tf.keras.preprocessing.sequence.pad_sequences(X_val_seq, maxlen=max_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1  # +1: id 0 is not assigned to any word
num_classes = len(set(labels))  # number of distinct labels

In [None]:
def create_lstm_model(vocab_size, max_length, num_classes):
    """Build and compile a single-layer LSTM text classifier.

    Layers: Embedding(128) -> LSTM(128, dropout 0.2, recurrent dropout 0.2)
    -> Dense(num_classes, softmax). Compiled with Adam, sparse categorical
    cross-entropy and the accuracy metric.

    Args:
        vocab_size: ``input_dim`` of the embedding.
        max_length: ``input_length`` of the embedding.
        num_classes: Number of output units.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length))
    model.add(layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(layers.Dense(num_classes, activation='softmax'))

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Train the LSTM model (50 epochs, batch size 32, validating on the held-out split).
lstm_model = create_lstm_model(vocab_size, max_length, num_classes)
lstm_history = lstm_model.fit(X_train_padded, y_train, epochs=50, validation_data=(X_val_padded, y_val), batch_size=32)

# Save the trained model.
lstm_model.save('saved_models/lstm_model.h5')

In [None]:
import matplotlib.pyplot as plt

# Loss and accuracy curves of the LSTM model (train vs. validation).
history_dict = lstm_history.history

loss = history_dict['loss']
val_loss = history_dict['val_loss']
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']
epochs = range(1, len(loss) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axis, train curve, validation curve, train label, validation label, title, y label)
panels = [
    (ax1, loss, val_loss, 'train_loss', 'val_loss', 'Train and Validation Loss', 'Loss'),
    (ax2, accuracy, val_accuracy, 'train_accuracy', 'val_accuracy', 'Train and Validation Accuracy', 'Accuracy'),
]
for ax, train_curve, val_curve, train_label, val_label, title, ylabel in panels:
    ax.plot(epochs, train_curve, color='blue', label=train_label)
    ax.plot(epochs, val_curve, color='red', label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)
    ax.grid()
    ax.legend()

plt.show()

In [None]:
import json

def load_test_df(path="data/test.json"):
    """Load the test JSON into a DataFrame.

    The file is opened explicitly as UTF-8; without it the platform default
    encoding is used, which is not guaranteed to decode the Korean text.

    Args:
        path: JSON file mapping each file name to an object with a ``text``
            field. Defaults to the location used so far.

    Returns:
        DataFrame with the columns ``file_name`` and ``conversation``, one
        row per entry, in file order.
    """
    with open(path, encoding="utf-8") as file:
        test_json = json.load(file)

    return pd.DataFrame({
        "file_name": list(test_json),
        "conversation": [entry["text"] for entry in test_json.values()],
    })
    
# Load the raw test set into `test` and display it.
test = load_test_df()
test

In [None]:
# Clean the conversation column of a test frame for the tokenizer-based models.
def load_test(data):
    """Apply ``sentence_organizer`` to ``data['conversation']`` in place.

    The training text of this section is cleaned with ``sentence_organizer``
    (no morpheme analysis). The test text was previously cleaned with
    ``sentence_analysis`` instead, so the tokenizer saw differently
    segmented text at prediction time. Both now use the same cleaner.

    Args:
        data: DataFrame with a ``conversation`` column; it is modified.

    Returns:
        The same DataFrame object that was passed in. No rows are removed.
    """
    data['conversation'] = data['conversation'].map(sentence_organizer)

    return data

In [None]:
test = load_test(test)
# Encode the conversation column. Passing the whole DataFrame (as before)
# iterates over its column names, which produced two sequences for the
# strings "file_name" and "conversation" instead of one per test row.
X_test_seq = tokenizer.texts_to_sequences(test['conversation'])

# Pad (at the end) to the same length as the training data.
max_length = 535
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen=max_length, padding='post')


In [None]:
def evaluate_model(model, X_val_padded, threshold=0.7):
    """Predict class ids, sending low-confidence rows to an extra class.

    For every row the most probable class is taken; when its probability is
    below ``threshold`` the row is labelled with the value of the
    module-level ``num_classes`` instead (one past the trained class ids).

    Args:
        model: Object whose ``predict`` returns a 2-D array of per-class
            scores.
        X_val_padded: Input passed to ``model.predict``.
        threshold: Minimum top score required to keep the predicted class.

    Returns:
        List with one label per row.
    """
    scores = model.predict(X_val_padded)
    top_labels = np.argmax(scores, axis=1)
    top_scores = np.max(scores, axis=1)

    # Label used for rows the model is not confident about.
    fallback_label = num_classes

    final_predictions = []
    for label, score in zip(top_labels, top_scores):
        if score < threshold:
            final_predictions.append(fallback_label)
        else:
            final_predictions.append(label)
    return final_predictions

# # Prediction
# model = lstm_model
# accuracy, f1 = evaluate_model(model, X_val_padded, y_val, threshold = 0.8)

# predictions_prob = model.predict(X_test_padded)
# predictions = np.argmax(predictions_prob, axis=1)
# test["class"] = predictions
# test["class_str"] = test_df["class"].apply(lambda x: label_to_class[x])
# test

In [None]:
# Predict with the LSTM trained in this section. `model` still refers to the
# baseline network at this point, and the class names were read from
# `test_df` (the baseline's frame) instead of `test`.
predictions = evaluate_model(lstm_model, X_test_padded)
print(predictions)
test["class"] = predictions
test["class_str"] = test["class"].apply(lambda x: label_to_class[x])
test


In [None]:
# Fill the submission template with the LSTM predictions stored in `test`.
# `test_df` was merged before, which wrote the baseline's predictions into
# the file named lstm_*.csv.
submission_df = pd.read_csv("data/new_submission.csv")
test_submission_df = (submission_df
                      .merge(test[["file_name", "class"]], on="file_name")
                      .drop(columns=["class_x"])
                      .rename(columns={"class_y": "class"}))
test_submission_df.to_csv(f"results/lstm_{NOW}.csv", index=False)
pd.read_csv(f"results/lstm_{NOW}.csv")

## RNN - GRU

In [None]:
# GRU model definition.
def create_gru_model(vocab_size, max_length, num_classes):
    """Build and compile a single-layer GRU text classifier.

    Layers: Embedding(128) -> GRU(128, dropout 0.2, recurrent dropout 0.2)
    -> Dense(num_classes, softmax). Compiled with Adam, sparse categorical
    cross-entropy and the accuracy metric.

    Args:
        vocab_size: ``input_dim`` of the embedding.
        max_length: ``input_length`` of the embedding.
        num_classes: Number of output units.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length))
    model.add(layers.GRU(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(layers.Dense(num_classes, activation='softmax'))

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Train the GRU model on the same padded sequences as the LSTM (50 epochs, batch size 32).
gru_model = create_gru_model(vocab_size, max_length, num_classes)
gru_history = gru_model.fit(X_train_padded, y_train, epochs=50, validation_data=(X_val_padded, y_val), batch_size=32)

# Save the trained model.
# NOTE(review): written to the working directory, unlike the LSTM model (saved_models/).
gru_model.save('gru_model.h5')

In [None]:
import matplotlib.pyplot as plt

# Loss and accuracy curves of the GRU model (train vs. validation).
history_dict = gru_history.history

loss = history_dict['loss']
val_loss = history_dict['val_loss']
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']
epochs = range(1, len(loss) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axis, train curve, validation curve, train label, validation label, title, y label)
panels = [
    (ax1, loss, val_loss, 'train_loss', 'val_loss', 'Train and Validation Loss', 'Loss'),
    (ax2, accuracy, val_accuracy, 'train_accuracy', 'val_accuracy', 'Train and Validation Accuracy', 'Accuracy'),
]
for ax, train_curve, val_curve, train_label, val_label, title, ylabel in panels:
    ax.plot(epochs, train_curve, color='blue', label=train_label)
    ax.plot(epochs, val_curve, color='red', label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)
    ax.grid()
    ax.legend()

plt.show()

In [None]:
# Predict with the GRU trained in this section. `model` (the baseline network)
# was passed before, and the class names were read from `test_df` instead of `test`.
predictions = evaluate_model(gru_model, X_test_padded)
print(predictions)
test["class"] = predictions
test["class_str"] = test["class"].apply(lambda x: label_to_class[x])
test

In [None]:
# Fill the submission template with the GRU predictions stored in `test`.
# `test_df` was merged before, which wrote the baseline's predictions into
# the file named gru_*.csv.
submission_df = pd.read_csv("data/new_submission.csv")
test_submission_df = (submission_df
                      .merge(test[["file_name", "class"]], on="file_name")
                      .drop(columns=["class_x"])
                      .rename(columns={"class_y": "class"}))
test_submission_df.to_csv(f"results/gru_{NOW}.csv", index=False)
pd.read_csv(f"results/gru_{NOW}.csv")

In [None]:
! pip install tf-keras

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, TFElectraForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Settings for the transformer experiments.
EPOCHS = 5  # could be increased; kept at 5 for this exercise
BATCH_SIZE = 16
# NOTE(review): max_len was derived earlier from the morpheme-token length
# statistics, not from this section's tokenizer — confirm it is a suitable
# (and supported) sequence length for the pretrained model.
MAX_LENGTH = max_len  # maximum sequence length
MODEL_SAVE_PATH = "saved_models/"
NUM_CLASSES = 4  # number of classes
RANDOM_SEED = 42

# Fix the random seeds.
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

## KOBERT

In [None]:
# Load the training CSV, drop missing / duplicated conversations and clean the text.
def load_data(path):
    """Read the training CSV at ``path`` and return a lightly cleaned DataFrame.

    Rows whose ``conversation`` is missing are dropped first: the cleaner is
    regex based and raises on NaN, which previously aborted the whole load.
    Duplicated conversations are then removed (first occurrence kept) and
    every remaining conversation is passed through ``sentence_organizer``.
    The original index labels are preserved.

    Args:
        path: Path of a CSV file with a ``conversation`` column.

    Returns:
        A new DataFrame with the cleaned ``conversation`` column.
    """
    data = (
        pd.read_csv(path)
        .dropna(subset=['conversation'])
        .drop_duplicates(subset=['conversation'])
    )
    # assign() builds a new frame, so no in-place write on a filtered view.
    return data.assign(conversation=data['conversation'].map(sentence_organizer))

In [None]:
# Reload data/train.csv for the KoBERT experiment (this rebinds train_data and tar_data).
train_data_path ="data/train.csv"
train_data = load_data(train_data_path)
train_text = train_data['conversation']
train_data = changeClassInt(train_data)
tar_data = train_data['class']

# Plain Python lists of texts and integer labels.
train_text = list(train_text)
tar_data = list(tar_data)


# Train / validation split (80 / 20, stratified on the label).
X_train, X_val, y_train, y_val = train_test_split(train_text, tar_data, test_size=0.2, random_state=RANDOM_SEED, stratify=tar_data)



In [None]:
# Inspect the first two training texts.
print(X_train[:2])

In [None]:
# Tokenizer selection.
def get_tokenizer(model_name):
    """Return the pretrained tokenizer that ``AutoTokenizer`` resolves for ``model_name``."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer

# Tokenizing function.
def tokenize_sentences(tokenizer, sentences, max_length):
    """Tokenize ``sentences`` in one call, padded/truncated to ``max_length``.

    Args:
        tokenizer: Callable tokenizer; invoked with keyword arguments only.
        sentences: Texts to encode, passed as ``text``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns; TensorFlow tensors are requested via
        ``return_tensors='tf'``.
    """
    options = {
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
        'max_length': max_length,
    }
    return tokenizer(text=sentences, **options)


In [None]:
# Inspect the first two training texts again before tokenizing.
print(X_train[:2])

In [None]:
# KoBERT: load the tokenizer and encode both splits (this rebinds `tokenizer`).
model_name = 'monologg/kobert'
tokenizer = get_tokenizer(model_name)
X_train_tokenized = tokenize_sentences(tokenizer, X_train, MAX_LENGTH)
X_val_tokenized = tokenize_sentences(tokenizer, X_val, MAX_LENGTH)

# X_train_tokenized = []
# for s in X_train:
#     t = tokenizer(text = s,  padding= 'max_length', truncation = True, max_length = MAX_LENGTH, return_tensors='tf')
#     X_train_tokenized.append(t)


# X_val_tokenized = []
# for s in X_val:
#     t = tokenizer(text = s,  padding= 'max_length', truncation = True, max_length = MAX_LENGTH, return_tensors='tf')
#     X_val_tokenized.append(t)


In [None]:
# Inspect the tokenizer output for the training split.
print(len(X_train_tokenized))
print(X_train_tokenized)
# print(X_train_tokenized[0][0])

# print(X_val_tokenized[1])

In [None]:
# Pretrained BERT with a classification head of NUM_CLASSES outputs (this rebinds `model`).
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_CLASSES)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True: the loss is applied to raw scores, not probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# The split above yields plain Python lists of labels. Keras cannot combine
# such a list with tensor inputs, so the labels are converted to integer
# arrays before fitting.
y_train_array = np.asarray(y_train, dtype=np.int32)
y_val_array = np.asarray(y_val, dtype=np.int32)

# Train the model.
kobert_history = model.fit(
    [X_train_tokenized['input_ids'], X_train_tokenized['attention_mask']],
    y_train_array,
    validation_data=([X_val_tokenized['input_ids'], X_val_tokenized['attention_mask']], y_val_array),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)

# Save the model and its tokenizer.
model.save_pretrained(MODEL_SAVE_PATH + 'kobert')
tokenizer.save_pretrained(MODEL_SAVE_PATH + 'kobert')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


def tokenize(texts, labels, max_length):
    """Encode ``texts`` with the module-level ``tokenizer`` and pair them with ``labels``.

    Args:
        texts: Texts to encode (truncated to ``max_length``, padded with
            ``padding=True``).
        labels: Labels, one per text.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        ``tf.data.Dataset`` sliced from ``(dict(encodings), labels)``.
    """
    features = dict(tokenizer(texts, truncation=True, padding=True, max_length=max_length))
    return tf.data.Dataset.from_tensor_slices((features, labels))

# `TRAIN_DATA_PATH` is not defined anywhere in this notebook; the path used
# for this section's training data is `train_data_path`.
train_df = load_data(train_data_path)
# Class name -> integer id.
train_df["class_num"] = train_df["class"].map(class_map)

# Train / validation split (80 / 20, stratified on the label).
X_train, X_val, y_train, y_val = train_test_split(train_df["conversation"], train_df["class_num"], 
                                                  test_size=0.2, random_state=42, 
                                                  stratify=train_df["class_num"])

# Shuffled, batched and prefetched tf.data pipelines.
train_dataset = (tokenize(X_train.tolist(), y_train, MAX_LENGTH)
           .shuffle(len(X_train))
           .batch(BATCH_SIZE)
           .prefetch(tf.data.experimental.AUTOTUNE))

val_dataset = (tokenize(X_val.tolist(), y_val, MAX_LENGTH)
           .shuffle(len(X_val))
           .batch(BATCH_SIZE)
           .prefetch(tf.data.experimental.AUTOTUNE))

## KoELECTRA

## KLUE-BERT