# Data Processing  
* Intent Dataset을 corpus로 활용하여 Word2vec학습을 위한 데이터 처리  

In [3]:
import os
import sys
import json
import torch
import pandas as pd
import numpy as np

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec



In [5]:
!pip install torch

Collecting pandas
  Downloading pandas-1.3.5-cp38-cp38-win_amd64.whl (10.2 MB)
Collecting pytz>=2017.3
  Using cached pytz-2021.3-py2.py3-none-any.whl (503 kB)
Collecting numpy>=1.17.3; platform_machine != "aarch64" and platform_machine != "arm64" and python_version < "3.10"
  Downloading numpy-1.22.0-cp38-cp38-win_amd64.whl (14.7 MB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.22.0 pandas-1.3.5 pytz-2021.3
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\khj_a\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [2]:
!pip install --upgrade gensim==3.4.0

Collecting gensim==3.4.0
  Using cached gensim-3.4.0-cp38-cp38-win_amd64.whl
Collecting smart-open>=1.2.1
  Using cached smart_open-5.2.1-py3-none-any.whl (58 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.4.0 smart-open-5.2.1


In [4]:
class Preprocessing:
    
    def __init__(self, max_len = 20):
        self.max_len = max_len
        self.PAD = 0
    
    def pad_idx_sequencing(self, q_vec):
        q_len = len(q_vec)
        diff_len = q_len - self.max_len
        if(diff_len>0):
            q_vec = q_vec[:self.max_len]
            q_len = self.max_len
        else:
            pad_vac = [0] * abs(diff_len)
            q_vec += pad_vac

        return q_vec
    
class MakeDataset:
    
    def __init__(self):
        
        self.intent_data_dir = "./data/dataset/intent_data.csv"
        self.prep = Preprocessing()
    
    def tokenize(self, sentence):
        ''' 띄어쓰기 단위로 tokenize 적용'''
        return sentence.split()
    
    def tokenize_dataset(self, dataset):
        ''' Dataset에 tokenize 적용'''
        token_dataset = []
        for data in dataset:
            token_dataset.append(self.tokenize(data))
        return token_dataset
    
    def make_embed_dataset(self, ood = False):
        embed_dataset = pd.read_csv(self.intent_data_dir)
        embed_dataset = embed_dataset["question"].to_list()
        embed_dataset = self.tokenize_dataset(embed_dataset)

        return embed_dataset  

In [5]:
dataset = MakeDataset()
embed_dataset = dataset.make_embed_dataset()

In [6]:
embed_dataset

[['야', '먼지', '알려주겠니'],
 ['아니', '먼지', '정보', '알려주세요'],
 ['그', '때', '미세먼지', '어떨까'],
 ['그', '때', '먼지', '좋으려나'],
 ['미세먼지', '어떨', '것', '같은데'],
 ['그러면', '미세먼지', '어때'],
 ['미세먼지', '말해줄래'],
 ['먼지', '미세먼지'],
 ['참', '먼지', '많이', '꼈나'],
 ['참', '먼지', '어떠냐'],
 ['야', '미세먼지', '어때'],
 ['토요일', '먼지', '알려줘'],
 ['헐', '먼지', '어떠려나'],
 ['전주', '미세먼지', '어떤지', '말해줘'],
 ['미세먼지', '궁금해서', '물어봤어'],
 ['아', '그러면', '먼지', '알려주세요'],
 ['아', '맞다', '먼지', '어떠려나'],
 ['저기', '현재', '먼지', '어떠려나'],
 ['토요일', '먼지', '알려주겠니'],
 ['먼지', '궁금해'],
 ['지금', '먼지', '어떤데'],
 ['저기', '미세먼지', '어떨까'],
 ['먼지', '완전', '문젠데'],
 ['세종', '먼지', '한', '번', '알려줘'],
 ['모레', '먼지', '궁금해'],
 ['먼지', '궁금해요'],
 ['미세먼지', '어떨까'],
 ['지금', '미세먼지', '어떨까', '심해지네'],
 ['현재', '미세먼지', '어떨까'],
 ['공기', '미세먼지', '좀', '알려줄래'],
 ['미세먼지', '먼지'],
 ['미세먼지', '좀', '먼지'],
 ['월요일', '근데', '먼지', '어떠려나'],
 ['미세먼지', '없니', '현재'],
 ['지금', '미세먼지', '없니', '현재'],
 ['공기', '미세먼지', '현재'],
 ['공기상태', '알려줘'],
 ['아', '먼지', '심하네'],
 ['어떤데', '먼지'],
 ['먼지', '알려줬으면', '졸겠어'],
 ['미세먼지', '없나', '하'],
 ['지금', '먼지', 

### Embedding  

In [7]:
class EpochLogger(CallbackAny2Vec):
    # 학습 중간 프린트를 하기 위한 logger
    def __init__(self):
        self.epoch = 0
        
    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

# https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#online-training-resuming-training
class MakeEmbed:
    
    def __init__(self):
        self.model_dir = "./"
        self.vector_size = 300 # 임베딩 사이즈
        self.window_size = 3 # 몇개의 단어로 예측을 할것인지
        self.workers = 8 # 학습 스레드의 수
        self.min_count = 2 # 단어의 최소 빈도수
        self.iter = 1000 # 1epoch당 학습 수
        self.sg = 1 # 1: skip-gram, 0: CBOW
        self.model_file = "./data/pretraining/word2vec_skipgram_{}_{}_{}".format(self.vector_size, self.window_size, self.min_count)
        self.epoch_logger = EpochLogger()
        
    def word2vec_init(self): # word2vec 초기화 및 세팅
        self.word2vec = Word2Vec(size=self.vector_size,
                         window=self.window_size,
                         workers=self.workers,
                         min_count=self.min_count,
                         compute_loss=True,
                         iter=self.iter)
        
    def word2vec_build_vocab(self, dataset): # 단어장 만들기
        self.word2vec.build_vocab(dataset)
        
    def word2vec_most_similar(self, query): # 비슷한 단어 계산
        print(self.word2vec.most_similar(query))
        
    def word2vec_train(self,embed_dataset, epoch = 0): # 학습
        if(epoch == 0):
            epoch = self.word2vec.epochs + 1
        self.word2vec.train(
            sentences=embed_dataset,
            total_examples=self.word2vec.corpus_count,
            epochs=epoch,
            callbacks=[self.epoch_logger]
        )

        self.word2vec.save(self.model_file + '.gensim')
        self.vocab = self.word2vec.wv.index2word
        self.vocab = {word: i for i, word in enumerate(self.vocab)}

    def load_word2vec(self):

        if not os.path.exists(self.model_file+'.gensim'):
            raise Exception("모델 로딩 실패 "+ self.model_file+'.gensim')

  
        self.word2vec = Word2Vec.load(self.model_file+'.gensim')
        self.vocab = self.word2vec.wv.index2word
        self.vocab.insert(0,"<UNK>") # vocal에 없는 토큰이 등장할 경우를 대비한 <UNK>
        self.vocab.insert(0,"<PAD>") # 길이를 맞추기 위한 padding <PAD>
        self.vocab = {word: i for i, word in enumerate(self.vocab)}
        
    def query2idx(self, query):
        sent_idx = []

        for word in query:
            if(self.vocab.get(word)):
                idx = self.vocab[word]
            else:
                idx = 1

            sent_idx.append(idx)

        return sent_idx

In [8]:
embed = MakeEmbed()

embed.word2vec_init()

embed.word2vec.build_vocab(embed_dataset)



In [9]:
embed.word2vec.wv.most_similar('미세먼지')

[('만두', 0.17288994789123535),
 ('카레', 0.1723318248987198),
 ('눈올까', 0.1624746024608612),
 ('해남', 0.1547224521636963),
 ('속초', 0.15429911017417908),
 ('그럼', 0.14867854118347168),
 ('아구찜', 0.14808298647403717),
 ('수제', 0.14792512357234955),
 ('베이커리', 0.14602509140968323),
 ('안되는데', 0.14533887803554535)]

In [10]:
embed.word2vec_train(embed_dataset,10)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end


In [11]:
embed.word2vec.wv.most_similar('미세먼지')

[('먼지', 0.8919802308082581),
 ('날씨', 0.8596293926239014),
 ('내일모레', 0.8261765837669373),
 ('초미세먼지', 0.7969179153442383),
 ('그니까', 0.7905692458152771),
 ('화요일', 0.7827087640762329),
 ('황사', 0.775054931640625),
 ('같은데', 0.7727575302124023),
 ('오늘', 0.7715519666671753),
 ('가려는데', 0.7663064002990723)]