In [None]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[K     |████████████████████████████████| 49.1 MB 193 kB/s 
Collecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.9.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 11.7 MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp37-cp37

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import train_test_split
#from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import traceback



In [None]:
!pip install --upgrade -q pyproj

[K     |████████████████████████████████| 6.3 MB 34.0 MB/s 
[?25h

In [None]:
def mount_drive():
    from google.colab import drive
    drive.mount('/content/gdrive')
    %cd /content/gdrive/MyDrive/NLP

In [None]:
if __name__ == '__main__':
  mount_drive()

Mounted at /content/gdrive
/content/gdrive/MyDrive/NLP


In [None]:
if __name__ == '__main__':
  whole_dataset = pd.read_excel('data/chat_data.xlsx')
  whole_dataset.head()

In [None]:
whole_dataset = pd.read_excel('data/chat_data.xlsx')

#transformed_data = whole_dataset['Emotion']
#transformed_data.head()
transformed_data = whole_dataset.loc[(whole_dataset['Emotion']=='공포'), 'Emotion']='0'
whole_dataset.head()

Unnamed: 0,Sentence,Emotion
0,첨부터 그사람이 어떤사람인지도 모르고 결혼전제로 일단 만날수가 있나요?,0
1,기분이 묘하내요,0
2,벌써 4시를 향해가고 있음...,0
3,근데 엊그제 차단 풀고 친구추가까지 했더라구요.,0
4,분명 새벽 2시30분부터 9시30분까지 잤는데..,0


In [None]:
def data_processing(raw_data):    

    # 0: '공포', 1: '놀람', 2: '분노', 3: '슬픔', 4: '중립', 5: '행복, 6: '혐오'
    # Emotion 열에 있는 문자열을 대응되는 정수로 변환해주자
    # Hint : loc 함수를 활용하여 바꾸어 보자.
    ## 여기에 코드 작성
  
    transformed_data = raw_data.loc[(raw_data['Emotion']=='공포'), 'Emotion']='0'
    transformed_data = raw_data.loc[(raw_data['Emotion']=='놀람'), 'Emotion']='1'
    transformed_data = raw_data.loc[(raw_data['Emotion']=='분노'), 'Emotion']='2'
    transformed_data = raw_data.loc[(raw_data['Emotion']=='슬픔'), 'Emotion']='3'
    transformed_data = raw_data.loc[(raw_data['Emotion']=='중립'), 'Emotion']='4'
    transformed_data = raw_data.loc[(raw_data['Emotion']=='행복'), 'Emotion']='5'
    transformed_data = raw_data.loc[(raw_data['Emotion']=='혐오'), 'Emotion']='6'

    # 판다스의 concat을 활용하여 'document' 데이터와 'label' 데이터를 연결해보자.
    # 연결한 데이터의 이름은 processed_data라고 하자.
    ## 여기에 코드 작성
    emotion_data = raw_data['Emotion']
    document_data = raw_data['Sentence']
    processed_data = pd.concat([document_data, emotion_data], axis=1)
    processed_data.columns = ['sentence', 'label']

    return processed_data

In [None]:
def data_to_token_ids(tokenizer, single_sentence):
    # CLS 토큰과 SEP 토큰을 문장의 시작과 끝에 붙여보자.
    special_token_added = "[CLS] " + str(single_sentence) + " [SEP]"
    
    # KoBERTTokenizer의 tokenize 함수를 활용하여 문장을 토큰화해보자.
    tokenized_text = tokenizer.tokenize(special_token_added)

    # KoBERTTokenizer의 convert_tokens_to_ids 함수를 활용하여 생성된 토큰을 숫자 형태로 바꿔주자.
    token_ids = [tokenizer.convert_tokens_to_ids(tokenized_text)]

    MAX_LEN = 128
    # pad_sequences 함수를 활용하여 문장의 빈 칸에 padding을 넣어주자.
    token_ids_padded = pad_sequences(token_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    token_ids_flatten = token_ids_padded.flatten()
    return token_ids_flatten

In [None]:
def token_ids_to_mask(token_ids):
    
    # token_id에서 0보다 큰 숫자만 유효하도록 하는 'mask' 리스트를 만들자.
    mask = [float(i>0) for i in token_ids]
    
    return mask

In [None]:
def tokenize_processed_data(tokenizer, processed_dataset):
    labels = processed_dataset['label'].to_numpy()

    labels = labels.astype(np.int)
    
    # list comprehension을 활용하여 processed_dataset의 'sentence' 데이터를 id값으로 토큰화하자.
    tokenized_data = [data_to_token_ids(tokenizer, processed_data) for processed_data in processed_dataset['sentence']]

    # list comprehension을 활용하여 앞서 토큰화한 데이터 id를 mask로 변환하자.
    attention_masks = [token_ids_to_mask(token_ids) for token_ids in tokenized_data]
    
    return tokenized_data, labels, attention_masks

In [None]:
def split_into_train_validation(whole_data, whole_label, whole_masks):
    print("length of whole_data : " + str(len(whole_data)))
    
    # split_into_train_test의 코드를 참조하여 data와 mask를  train을 위한 것과 validation을 위한 것으로 나누자.
    
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(whole_data,
                                                                                    whole_label, 
                                                                                    random_state=2022, 
                                                                                    test_size=0.1)
    train_masks, validation_masks, _, _ = train_test_split(whole_masks, 
                                                       whole_data,
                                                       random_state=2022, 
                                                       test_size=0.1)
    
    
   # print("length of train_data : " + str(len(train_data)))
    
    return train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks

In [None]:
def data_to_tensor(inputs, labels, masks):
    inputs_tensor = torch.tensor(inputs)
    labels_tensor = torch.tensor(labels)
    masks_tensor = torch.tensor(masks)
    return inputs_tensor, labels_tensor, masks_tensor

In [None]:
def tensor_to_dataloader(inputs, labels, masks, mode):
    from torch.utils.data import RandomSampler, SequentialSampler
    
    batch_size=32
    data = TensorDataset(inputs, masks, labels)
    
    if mode == "train":
        # train에 적합한 sampler을 지정하자.
        sampler = RandomSampler(data)
    else:
        # test에 적합한 sampler을 지정하자.
        sampler = SequentialSampler(data)
    
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    
    return dataloader

In [None]:
def preproc(tokenizer, whole_dataset):
    # whole_dataset을 전처리하자.
    processed_dataset = data_processing(whole_dataset)
    
    # 전처리한 전체 데이터를 토큰화하자.
    tokenized_dataset, labels, attention_masks = tokenize_processed_data(tokenizer, processed_dataset)

    # 토큰화한 train용 데이터를 train용과 validation용으로 분리하자.
    train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = split_into_train_validation(tokenized_dataset, labels, attention_masks)

    # train용, validation용 데이터 각각을 텐서로 변환하자.
    train_inputs, train_labels, train_masks = data_to_tensor(train_inputs, train_labels, train_masks)
    validation_inputs, validation_labels, validation_masks = data_to_tensor(validation_inputs, validation_labels, validation_masks)

    # train용, validation용 텐서를 dataloader로 변환하자. 
    train_dataloader = tensor_to_dataloader(train_inputs, train_labels, train_masks, "train")
    validation_dataloader = tensor_to_dataloader(validation_inputs, validation_labels, validation_masks, "validation")

    return train_dataloader, validation_dataloader

In [None]:
def main():
    from nlp_tokenization import KoBertTokenizer

    # 전체 데이터를 불러오자.
    whole_dataset = pd.read_excel('/content/gdrive/MyDrive/NLP/data/chat_data.xlsx')        

    # KoBERTTokenizer를 불러오자.
    tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
  
    train, valid = preproc(tokenizer, whole_dataset)        

In [None]:
if __name__ == '__main__':
    main()

Downloading:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


length of whole_data : 34388


  
