In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

import os
import random
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, TFAutoModel, AdamWeightDecay, AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.callbacks import EarlyStopping

import time
import pickle


from torchvision import transforms
import random

from sklearn.metrics import f1_score 


import tensorflow_addons as tfa

In [None]:
# Make sure tf.function-decorated code (including Keras train/test steps)
# is executed as traced graphs rather than eagerly.
tf.config.run_functions_eagerly(False)

class SAMModel(tf.keras.Model):
    """Keras wrapper that trains a wrapped model with Sharpness-Aware Minimization.

    Each training step does two forward/backward passes: the first computes
    the gradient at the current weights and uses it to move the weights by a
    step of norm ``rho``; the second computes the gradient at those perturbed
    weights. The weights are then restored and the optimizer applies the
    second ("SAM") gradient.
    """

    def __init__(self, my_model, rho=0.05):
        """
        p, q = 2 for optimal results as suggested in the paper
        (Section 2)

        Args:
          my_model: the Keras model whose variables are trained.
          rho: norm of the weight perturbation applied before the second pass.
        """
        super(SAMModel, self).__init__()
        self.my_model = my_model
        self.rho = rho

    def train_step(self, data):
        """Run one SAM update on a ``(inputs, labels)`` batch.

        Returns:
          A dict mapping each metric name to its current result.
        """
        (text, labels) = data

        # First pass: gradient at the current weights. `training=True` is
        # passed explicitly; without it the wrapped model's dropout layers
        # stay in inference mode during training.
        with tf.GradientTape() as tape:
            predictions = self.my_model(text, training=True)
            loss = self.compiled_loss(labels, predictions)
        # Read the variable list only after the first forward pass so that
        # lazily-built layers have already created their weights.
        trainable_params = self.my_model.trainable_variables
        gradients = tape.gradient(loss, trainable_params)
        grad_norm = self._grad_norm(gradients)
        scale = self.rho / (grad_norm + 1e-10)  # epsilon avoids division by zero

        # Move every weight along its gradient. Variables without a gradient
        # are skipped, consistent with `_grad_norm`, instead of failing on
        # `None * scale`.
        perturbations = []
        for (grad, param) in zip(gradients, trainable_params):
            if grad is None:
                continue
            e_w = grad * scale
            param.assign_add(e_w)
            perturbations.append((param, e_w))

        # Second pass: gradient at the perturbed weights.
        with tf.GradientTape() as tape:
            predictions = self.my_model(text, training=True)
            loss = self.compiled_loss(labels, predictions)

        sam_gradients = tape.gradient(loss, trainable_params)

        # Undo the perturbation before the optimizer update.
        for (param, e_w) in perturbations:
            param.assign_sub(e_w)

        self.optimizer.apply_gradients(
            zip(sam_gradients, trainable_params))

        self.compiled_metrics.update_state(labels, predictions)
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        """Evaluate one ``(inputs, labels)`` batch in inference mode.

        Returns:
          A dict mapping each metric name to its current result.
        """
        (text, labels) = data
        predictions = self.my_model(text, training=False)
        # Called for its side effect of updating the tracked loss.
        self.compiled_loss(labels, predictions)
        self.compiled_metrics.update_state(labels, predictions)
        return {m.name: m.result() for m in self.metrics}

    def _grad_norm(self, gradients):
        """Return the L2 norm over all non-None gradients."""
        norm = tf.norm(
            tf.stack([
                tf.norm(grad) for grad in gradients if grad is not None
            ])
        )
        return norm

    def call(self, inputs):
        """Forward pass of SAM.
        SAM delegates the forward pass call to the wrapped model.
        Args:
          inputs: Tensor. The model inputs.
        Returns:
          A Tensor, the outputs of the wrapped model for given `inputs`.
        """
        return self.my_model(inputs)

In [None]:
# Preprocessing for the text model
def preprocessing(train,val,test, modelname, max_seq_len, add_token=False, emphasize_token=False, token_change = False):
    """Encode labels and tokenize the 'overview' text of the three splits.

    Args:
        train, val: DataFrames with an 'overview' text column and a 'cat3'
            label column.
        test: DataFrame with an 'overview' text column.
        modelname: name/path handed to ``AutoTokenizer.from_pretrained``.
        max_seq_len: every sequence is padded/truncated to this length.
        add_token: if True, add the whitespace-separated tokens found in
            'add_token.txt' to the tokenizer vocabulary.
        emphasize_token: if True, repeat each occurrence of a word from the
            emphasis list three times.
        token_change: if True, rewrite a keyword in texts whose class name
            matches one of the rewrite rules (train and val only, because the
            test split has no labels).

    Returns:
        ``x_train, y_train, x_val, y_val, x_test`` where each ``x_*`` is a
        tuple ``(input_ids, attention_masks, token_type_ids)`` of integer
        arrays and each ``y_*`` is an int32 array of encoded labels.
    """
    encoder = LabelEncoder()
    y_train_data = pd.Series(encoder.fit_transform(train['cat3']))
    y_val_data = pd.Series(encoder.transform(val['cat3']))

    # Words that are emphasized by repetition
    emphasize_token_ls =  ['상설시장','채식주의','채식주의자','비건','비거니즘','고택','펜션','관아','팔각','주심포','건물','오일시장']
    # (class name, substring to look for, replacement text)
    token_change_rules = [
        ('뮤지컬', '뮤지컬', '뮤지컬(뮤지컬공연)'),
        ('분수', '분수쇼', '분수(분수쇼)'),
        ('채식전문점', '채식', '채식(채식전문점)'),
        ('게스트하우스', '게스트하우스', '게스트하우스(게하)'),
    ]
    tokenizer = AutoTokenizer.from_pretrained(modelname)

    if add_token:
        # Read the custom vocabulary only when it is needed and close the file.
        with open('add_token.txt') as f:
            add_token_ls = f.read().split()
        # NOTE: added tokens receive ids beyond the pretrained vocabulary, so
        # the model consuming these ids needs a matching embedding table.
        tokenizer.add_tokens(add_token_ls)

    def convert_examples_to_features(examples, labels, max_seq_len, tokenizer, label_names=None):
        """Tokenize `examples` and pair them with `labels`.

        `label_names` optionally carries the original (string) class name of
        each example; the token_change rules are matched against it because
        `labels` holds integer codes.
        """
        input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

        if label_names is None:
            label_names = [None] * len(examples)

        for example, label, label_name in tqdm(zip(examples, labels, label_names), total=len(examples)):
            # For specific classes, slightly rewrite a keyword in the text.
            # Models with and without this rewrite are ensembled later.
            # NOTE(review): the rewrite is label-dependent, so it also leaks
            # the label into the validation texts - confirm this is intended.
            if token_change and label_name is not None:
                for target_label, needle, replacement in token_change_rules:
                    if label_name == target_label:
                        example = example.replace(needle, replacement)

            # Emphasize specific words by repeating them
            if emphasize_token:
                for em in emphasize_token_ls :
                    if example.find(em) != -1 :
                        example = example.replace(em,'{} {} {}'.format(em,em,em))

            # Encode with the tokenizer; padding and truncation are requested
            # explicitly so every sequence is exactly max_seq_len long.
            input_id = tokenizer.encode(example, max_length=max_seq_len, padding='max_length', truncation=True)
            # Padding sits at the end, so the mask is ones followed by zeros.
            padding_count = input_id.count(tokenizer.pad_token_id)
            attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
            token_type_id = [0] * max_seq_len

            assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
            assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
            assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

            input_ids.append(input_id)
            attention_masks.append(attention_mask)
            token_type_ids.append(token_type_id)
            data_labels.append(label)

        input_ids = np.array(input_ids, dtype=int)
        attention_masks = np.array(attention_masks, dtype=int)
        token_type_ids = np.array(token_type_ids, dtype=int)

        data_labels = np.asarray(data_labels, dtype=np.int32)

        return (input_ids, attention_masks, token_type_ids), data_labels

    # Placeholder labels for the test split (it has none)
    y_test_data = [i for i in range(len(test))]
    x_train, y_train = convert_examples_to_features(train['overview'], y_train_data, max_seq_len=max_seq_len, tokenizer=tokenizer, label_names=train['cat3'])
    x_val, y_val = convert_examples_to_features(val['overview'], y_val_data, max_seq_len=max_seq_len, tokenizer=tokenizer, label_names=val['cat3'])
    x_test, _ = convert_examples_to_features(test['overview'], y_test_data, max_seq_len=max_seq_len, tokenizer=tokenizer)
    return x_train, y_train, x_val, y_val, x_test

In [None]:
def train_inference(x_train, y_train, x_val, y_val, x_test, modelname, optimizer='Adam', dropout=0.1, TruncatedNormal=0.02, epocs=4, batch_size=16, sam = False, name=None, num_classes=128):
    """Fine-tune a transformer classifier and return its test predictions.

    Args:
        x_train, x_val, x_test: ``(input_ids, attention_mask, token_type_ids)``
            tuples of integer arrays.
        y_train, y_val: integer class labels.
        modelname: name/path handed to ``TFAutoModel.from_pretrained``.
        optimizer: optimizer (name or instance) passed to ``compile``.
        dropout: dropout rate applied to the pooled output before the
            classifier.
        TruncatedNormal: standard deviation of the truncated-normal
            initializer of the classifier kernel.
        epocs: number of training epochs.
        batch_size: batch size for training and prediction.
        sam: if True, wrap the classifier in ``SAMModel``.
        name: optional label for the run. It is accepted so that call sites
            passing ``name=...`` work; it does not affect training.
        num_classes: size of the softmax output layer.

    Returns:
        The array returned by ``model.predict(x_test)``: one row of class
        probabilities per test example.
    """
    # The parameter shadows the imported initializer class, so give the value
    # an unambiguous local name.
    init_stddev = TruncatedNormal

    # Largest token id present in any split; the embedding table must cover it.
    max_token_id = max(
        (int(np.max(split[0])) for split in (x_train, x_val, x_test) if np.size(split[0])),
        default=-1,
    )

    class TFBertForSequenceClassification(tf.keras.Model):
        """Pretrained encoder followed by dropout and a softmax classifier."""

        def __init__(self, model_name):
            super(TFBertForSequenceClassification, self).__init__()
            self.bert = TFAutoModel.from_pretrained(model_name, from_pt=True)
            self.dropout = tf.keras.layers.Dropout(dropout)
            # `stddev` is passed by keyword: the first positional argument of
            # the initializer is the mean, not the standard deviation.
            self.classifier = tf.keras.layers.Dense(num_classes,
                                                    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=init_stddev),
                                                    activation='softmax',
                                                    name='classifier')

        def call(self, inputs, training=None):
            input_ids, attention_mask, token_type_ids = inputs
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            cls_token = outputs[1]  # pooled output
            cls_token = self.dropout(cls_token, training=training)
            prediction = self.classifier(cls_token)

            return prediction


    model = TFBertForSequenceClassification(modelname)

    # Tokens added to the tokenizer get ids past the pretrained vocabulary;
    # grow the embedding table so those ids are valid lookups.
    if max_token_id >= model.bert.config.vocab_size:
        model.bert.resize_token_embeddings(max_token_id + 1)

    if sam:
        model = SAMModel(model)

    loss = tf.keras.losses.sparse_categorical_crossentropy
    model.compile(optimizer=optimizer, loss=loss, metrics = ['accuracy'])
    model.fit(x_train, y_train, epochs=epocs, batch_size=batch_size, validation_data=(x_val,y_val))

    pred = model.predict(x_test, batch_size=batch_size)

    # Drop the model and reset Keras state before the next run.
    del model
    tf.keras.backend.clear_session()

    return pred

In [None]:
# Create the output directory; exist_ok keeps the cell re-runnable.
os.makedirs('final_result', exist_ok=True)

In [None]:
#Text Model

lr = 5e-6
wd = 1e-2 * lr


def make_optimizer():
    """Build a fresh AdamW optimizer so no state is shared between runs."""
    return tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)


# Module-level instance kept for cells that reference `optimizer` directly.
optimizer = make_optimizer()

# optimizer = tf.keras.optimizers.Nadam(learning_rate=5e-6)

train = pd.read_csv('train.csv')
val= pd.read_csv('val_set.csv')
test = pd.read_csv('test.csv')

# Original data; custom tokens / emphasis / token rewriting OFF
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200)
# Each run gets its own optimizer: a shared instance would carry its slot
# variables and step count from one model to the next.
pred1 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=7,dropout=0.1,TruncatedNormal=0.01,optimizer=make_optimizer(),batch_size=8,sam=True)

with open('final_result/pred1.pickle', 'wb') as f:
    pickle.dump(pred1, f, pickle.HIGHEST_PROTOCOL)



# Original data; custom tokens / emphasis / token rewriting ON
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200,add_token=True,emphasize_token=True,token_change=True)
pred2 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=8,dropout=0.1,TruncatedNormal=0.02,optimizer=make_optimizer(),batch_size=8, sam=True)
with open('final_result/pred2.pickle', 'wb') as f:
    pickle.dump(pred2, f, pickle.HIGHEST_PROTOCOL)

In [None]:
train = pd.read_csv('train(adj,50).csv')   # augmented using adjectives only


# Adjective augmentation; custom tokens / emphasis / token rewriting OFF
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200)
# A new optimizer is built for each run so no optimizer state is shared
# between models.
pred3 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=8,dropout=0.1,TruncatedNormal=0.01,optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),batch_size=8,sam=False)
with open('final_result/pred3.pickle', 'wb') as f:
    pickle.dump(pred3, f, pickle.HIGHEST_PROTOCOL)


# Adjective augmentation; custom tokens / emphasis / token rewriting ON
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200,add_token=True,emphasize_token=True,token_change=True)
pred4 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=8,dropout=0.1,TruncatedNormal=0.02,optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),batch_size=8, sam=True)
with open('final_result/pred4.pickle', 'wb') as f:
    pickle.dump(pred4, f, pickle.HIGHEST_PROTOCOL)

In [None]:
train = pd.read_csv('train(ad,50).csv')   # augmented using adverbs only

# Adverb augmentation; custom tokens / emphasis / token rewriting OFF
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200)
# A new optimizer is built for each run so no optimizer state is shared
# between models.
pred5 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=6,dropout=0.1,TruncatedNormal=0.01,optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),batch_size=8,sam=False)
with open('final_result/pred5.pickle', 'wb') as f:
    pickle.dump(pred5, f, pickle.HIGHEST_PROTOCOL)


# Adverb augmentation; custom tokens / emphasis / token rewriting ON
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200,add_token=True,emphasize_token=True,token_change=True)
pred6 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=6,dropout=0.1,TruncatedNormal=0.02,optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),batch_size=8, sam=True)
with open('final_result/pred6.pickle', 'wb') as f:
    pickle.dump(pred6, f, pickle.HIGHEST_PROTOCOL)

In [None]:
train = pd.read_csv('train(adj,ad,50).csv')   # augmented using adjectives and adverbs

# Adjective & adverb augmentation; custom tokens / emphasis / token rewriting OFF
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200)
# A new optimizer is built for each run so no optimizer state is shared
# between models.
pred7 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=6,dropout=0.1,TruncatedNormal=0.01,optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),batch_size=8,sam=True)
with open('final_result/pred7.pickle', 'wb') as f:
    pickle.dump(pred7, f, pickle.HIGHEST_PROTOCOL)


# Adjective & adverb augmentation; custom tokens and token rewriting ON
# NOTE(review): emphasize_token is False here, unlike the other "ON" runs - confirm this is intentional.
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200,add_token=True,emphasize_token=False,token_change=True)
pred8 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=6,dropout=0.1,TruncatedNormal=0.02,optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),batch_size=8, sam=True)
with open('final_result/pred8.pickle', 'wb') as f:
    pickle.dump(pred8, f, pickle.HIGHEST_PROTOCOL)

In [None]:
train = pd.read_csv('train(adj,ad,sim,100).csv')   # augmented using adjectives, adverbs and synonyms

# Adjective, adverb & synonym augmentation; custom tokens / emphasis / token rewriting OFF
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200)
# A new optimizer is built for each run so no optimizer state is shared
# between models.
pred9 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=6,dropout=0.1,TruncatedNormal=0.01,optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),batch_size=8,sam=True)
with open('final_result/pred9.pickle', 'wb') as f:
    pickle.dump(pred9, f, pickle.HIGHEST_PROTOCOL)


# Adjective, adverb & synonym augmentation; custom tokens / emphasis / token rewriting ON
x_train, y_train, x_val, y_val, x_test = preprocessing(train,val,test,'klue/roberta-large',200,add_token=True,emphasize_token=True,token_change=True)
pred10 = train_inference(x_train, y_train, x_val, y_val, x_test,'klue/roberta-large',epocs=6,dropout=0.1,TruncatedNormal=0.02,optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),batch_size=8, sam=True)
with open('final_result/pred10.pickle', 'wb') as f:
    pickle.dump(pred10, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Fit a label encoder on the 'cat3' column of the currently loaded training
# frame so predicted class indices can be mapped back to class names.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train['cat3'])

In [None]:
# Ensemble
def mode(list):
    """Return the most frequent element of `list`.

    When several elements share the highest count, the one that occurs first
    in `list` wins. An empty input returns 0.
    """
    from collections import Counter

    if len(list) == 0:
        return 0
    # Counter keeps first-seen order, and most_common(1) returns the first
    # element that reaches the maximum count - one pass instead of calling
    # list.count() for every element.
    return Counter(list).most_common(1)[0][0]



# Hard-voting ensemble: for every test row take the argmax class of each
# model and keep the most frequent one. The member order matters because
# `mode` resolves ties in favour of the earliest vote.
ensemble_members = [pred2, pred1, pred3, pred4, pred5,
                    pred6, pred7, pred8, pred9, pred10]

pred_ls = []
for i in range(len(pred1)):
    index = mode([member[i].argmax() for member in ensemble_members])
    pred_ls.append(encoder.classes_[index])
# Class names are converted back to the encoder's integer codes.
y_pre = encoder.transform(pred_ls)

In [None]:
# Store the ensemble predictions in the test frame's 'cat3' column.
# NOTE(review): y_pre is the output of encoder.transform, i.e. integer codes
# rather than class names - confirm that is what the submission expects.
test['cat3'] = y_pre

In [None]:
# Keep only the id and predicted-category columns and write them to CSV.
submission_columns = ['id', 'cat3']
submission = test[submission_columns]
submission.to_csv('submission_text.csv', encoding='utf-8', index=False)