# Anxiety labeling

## Import Libraries

In [1]:
import os
import pickle

import pandas as pd
import re

from konlpy.tag import Komoran, Okt

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from tqdm import tqdm

## Hyperparameters

In [2]:
# Colab
# DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/miso'

# Local
# DATA_DIR: folder the input CSV (label_anxious.csv) is read from.
DATA_DIR = '/Users/inseoklee/Desktop/miso/data'
# SAVE_DIR: folder the relabeled CSV (relabel_anxious.csv) is written to.
SAVE_DIR = '/Users/inseoklee/Desktop/miso/data'
# LOAD_DIR: folder holding the trained model and the pickled tokenizer.
LOAD_DIR = '/Users/inseoklee/Desktop/miso/model/CNN'

# MAX_LEN -> set at the padding step

## Load Data

In [3]:
# Read label_anxious.csv from DATA_DIR into a DataFrame.
df_anxious = pd.read_csv(DATA_DIR + '/label_anxious.csv')

In [4]:
# Print a summary of the frame: row count, columns, non-null counts and dtypes.
df_anxious.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15799 entries, 0 to 15798
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    15799 non-null  object
 1   label   15799 non-null  object
dtypes: object(2)
memory usage: 247.0+ KB


## Tokenize

In [5]:
def preprocessing(text, okt, remove_stopwords=False, stop_words=None):
    """Strip non-Korean characters from ``text`` and split it into morphemes.

    Args:
        text: The string to preprocess.
        okt: An already-constructed analyzer exposing
            ``morphs(text, stem=True)``. It is passed in so that the object
            is not re-created on every call.
        remove_stopwords: When True, tokens found in ``stop_words`` are
            dropped. Defaults to False.
        stop_words: Caller-supplied collection of tokens to drop. ``None``
            (the default) means "no stop words".

    Returns:
        The tokens produced by ``okt.morphs``, filtered when
        ``remove_stopwords`` is True.
    """
    # 1. Remove every character that is not Hangul (syllables or jamo)
    #    or whitespace.
    text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", text)

    # 2. Split the cleaned text into stemmed morphemes.
    word_text = okt.morphs(text, stem=True)

    if remove_stopwords:
        # A None default avoids sharing one mutable list across calls.
        excluded = stop_words if stop_words is not None else ()
        word_text = [token for token in word_text if token not in excluded]

    return word_text

In [6]:
# Tokens that are discarded when stop-word removal is enabled.
stop_words = {
    '은', '는', '이', '가', '하', '아', '것', '들',
    '의', '있', '되', '수', '보', '주', '등', '한',
}
# One analyzer instance, created once and shared by every preprocessing call.
okt = Okt()



## Load Model and Tokenizer

In [7]:
def load_tokenizer(path):
    """Return the object unpickled from the file at ``path``.

    Args:
        path: Location of the pickle file to read.

    Returns:
        Whatever object was stored in the pickle file.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, mode='rb') as pickle_file:
        return pickle.load(pickle_file)

# File names of the saved model and of the pickled tokenizer inside LOAD_DIR.
model_name = 'trained_model.h5'
tokenizer_name = 'tokenizer.pickle'
model_path = os.path.join(LOAD_DIR, model_name)
tokenizer_path = os.path.join(LOAD_DIR, tokenizer_name)

# Restore both artifacts; predict_sentiment reads `tokenizer` as a global.
model = load_model(model_path)
tokenizer = load_tokenizer(tokenizer_path)

Metal device set to: Apple M1


2022-07-13 16:24:11.280973: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-13 16:24:11.281072: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Labeling

In [8]:
def getResult(predict):
    """Return the index of the largest strictly positive score in ``predict``.

    The running maximum starts at 0, so index 0 is returned when ``predict``
    is empty or when no score is greater than 0. On ties the earliest index
    wins.

    Args:
        predict: Indexable sequence of numeric scores.

    Returns:
        The integer index of the winning score.
    """
    best_idx, best_score = 0, 0
    for position, score in enumerate(predict):
        if score > best_score:
            best_idx, best_score = position, score
    return best_idx

angry
sad
fear
disgusting
neutral
happy
surprised

In [13]:
# Emotion names, positioned by the class index returned from getResult.
_EMOTION_LABELS = (
    'angry',
    'sad',
    'fear',
    'disgusting',
    'neutral',
    'happy',
    'surprised',
)


def predict_sentiment(text, model, max_len=15):
    """Predict the emotion label of a single text.

    Relies on the module-level ``okt``, ``stop_words`` and ``tokenizer``
    objects and on the ``preprocessing`` and ``getResult`` helpers.

    Args:
        text: The text to classify. Non-string values are treated as empty.
        model: Object exposing ``predict(padded_sequences, verbose=0)``.
        max_len: Length the token sequence is padded/truncated to before
            prediction. Defaults to 15.

    Returns:
        One of the seven emotion names, ``"너무 짧아"`` when the text yields
        no encodable tokens, or ``None`` if the predicted index falls outside
        the known labels.
    """
    if isinstance(text, str):
        tokens = preprocessing(
            text, okt, remove_stopwords=True, stop_words=stop_words
        )
    else:
        # Non-string input contributes an empty token list.
        tokens = []

    token_sequences = tokenizer.texts_to_sequences([tokens])
    # Exactly one text is encoded, so the outer list is presumably never
    # empty (assumes a Keras-style tokenizer returning one sequence per text
    # -- TODO confirm). The "too short" case must therefore be detected on
    # the inner sequence.
    if len(token_sequences) == 0 or len(token_sequences[0]) == 0:
        return "너무 짧아"

    padded_sequences = pad_sequences(token_sequences, maxlen=max_len)
    predict = model.predict(padded_sequences, verbose=0)
    result = getResult(predict[0])

    if 0 <= result < len(_EMOTION_LABELS):
        return _EMOTION_LABELS[result]
    return None

In [14]:
# Predict an emotion label for every text, showing a progress bar.
label_anxious = [
    predict_sentiment(text, model)
    for text in tqdm(df_anxious['text'], desc="Labeling")
]

label_anxious

Labeling: 100%|██████████| 15799/15799 [06:05<00:00, 43.20it/s]


['disgusting',
 'angry',
 'sad',
 'sad',
 'happy',
 'sad',
 'sad',
 'angry',
 'sad',
 'happy',
 'fear',
 'sad',
 'sad',
 'happy',
 'fear',
 'sad',
 'neutral',
 'sad',
 'sad',
 'happy',
 'sad',
 'sad',
 'sad',
 'happy',
 'disgusting',
 'fear',
 'sad',
 'sad',
 'happy',
 'angry',
 'angry',
 'angry',
 'sad',
 'angry',
 'sad',
 'angry',
 'angry',
 'sad',
 'sad',
 'angry',
 'happy',
 'angry',
 'angry',
 'sad',
 'angry',
 'neutral',
 'sad',
 'sad',
 'fear',
 'angry',
 'sad',
 'angry',
 'sad',
 'happy',
 'sad',
 'sad',
 'sad',
 'sad',
 'sad',
 'sad',
 'sad',
 'sad',
 'happy',
 'sad',
 'sad',
 'disgusting',
 'sad',
 'sad',
 'sad',
 'angry',
 'sad',
 'sad',
 'angry',
 'angry',
 'sad',
 'sad',
 'sad',
 'angry',
 'sad',
 'happy',
 'angry',
 'sad',
 'sad',
 'sad',
 'sad',
 'sad',
 'angry',
 'sad',
 'sad',
 'sad',
 'fear',
 'sad',
 'angry',
 'sad',
 'angry',
 'sad',
 'sad',
 'angry',
 'angry',
 'sad',
 'angry',
 'angry',
 'sad',
 'angry',
 'angry',
 'angry',
 'sad',
 'sad',
 'sad',
 'angry',
 'angr

In [None]:
# Overwrite the 'label' column with the predicted labels, then preview
# the first 20 rows.
df_anxious['label'] = label_anxious
df_anxious[:20]

## Relabeling

### '불안', '걱정' 이 들어가는 텍스트는 '두려움'으로 레이블링

In [None]:
# Display the text stored under index label 0.
df_anxious['text'][0]

In [None]:
# Relabel every text that mentions "불안" or "걱정" as "fear".
# Each keyword has to be tested on its own: an expression such as
# `"불안" and "걱정" in text` only checks for "걱정", because the non-empty
# literal "불안" is always truthy.
# na=False makes missing/non-string texts count as "no match".
anxiety_mask = df_anxious['text'].str.contains("불안|걱정", na=False)
# A single .loc assignment writes to the frame itself; chained indexing
# (df['label'][idx] = ...) may write to a temporary copy instead.
df_anxious.loc[anxiety_mask, 'label'] = "fear"

In [None]:
# Display the single row at position 2 to spot-check the relabeling.
df_anxious[2:3]

### '행복'으로 레이블링 된 샘플의 레이블을 "두려움"으로 레이블링

angry
sad
fear
disgusting
neutral
happy
surprised

In [None]:
# Relabel samples predicted as happy to "fear". The classifier emits the
# label 'happy' (not 'happiness'), so that is the value to match.
df_anxious.loc[df_anxious['label'] == 'happy', 'label'] = 'fear'

In [None]:
# List the distinct label values present after relabeling.
df_anxious['label'].unique()

## Save data

In [None]:
# Write the relabeled frame to SAVE_DIR without the index column.
relabel_path = SAVE_DIR + '/relabel_anxious.csv'
df_anxious.to_csv(relabel_path, index=False, encoding='utf-8-sig')