In [1]:
# Mount Google Drive at /content/gdrive; later cells read their data files from
# paths under '/content/gdrive/My Drive/'.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Fetch the helper repository whose install script is run two cells below.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 115, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 115 (delta 11), reused 10 (delta 3), pack-reused 91[K
Receiving objects: 100% (115/115), 1.27 MiB | 9.36 MiB/s, done.
Resolving deltas: 100% (50/50), done.


In [3]:
# Enter the cloned repository so the install script can be invoked by its relative name.
cd Mecab-ko-for-Google-Colab

/content/Mecab-ko-for-Google-Colab


In [4]:
# Run the repository's install script; per its log it installs konlpy and mecab-0.996-ko-0.9.2.
!bash install_mecab-ko_on_colab_light_220429.sh 

Installing konlpy.....
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.2 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 58.4 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.0 konlpy-0.6.0
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
--2022-09-25 13:24:18--  https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
Resolving bitbucket.org (bitbucket.org)... 104.192.141.1, 2406:da00:ff00::22e9:9f55, 2406:da00:ff00::3403:4be7, ...
Connecting to bitbucket.org (bitbucket.org)|104.192.141.1|:443... co

In [5]:
def mecabsplit(mecab_tagger, inputs, pos):
    """Split ``inputs`` into morphemes using a MeCab-style tagger.

    The tagger's ``parse`` output is read as one ``surface<TAB>features`` row
    per line, where ``features`` is a comma-separated list. When the last
    feature is ``'*'`` the row contributes ``(surface, first_feature)``.
    Otherwise the last feature is treated as a ``'+'``-joined list of
    ``form/tag/...`` parts and each part contributes ``(form, tag)``.

    Args:
        mecab_tagger: Object exposing ``parse(str) -> str``.
        inputs: Text to analyse.
        pos: If truthy, return ``(form, tag)`` tuples; otherwise return only
            the forms.

    Returns:
        A list of ``(form, tag)`` tuples when ``pos`` is truthy, else a list
        of form strings.

    Raises:
        IndexError: If a row has no tab-separated feature column, or a
            decomposed part has no ``/`` separator.
    """
    morphemes = []
    # The last two entries of the split output are dropped; presumably these
    # are the 'EOS' marker and the empty string after the final newline.
    for row in mecab_tagger.parse(inputs).split('\n')[:-2]:
        field = row.split('\t')
        features = field[1].split(',')
        expression = features[-1]
        # Compare by value. The previous `is not '*'` tested object identity,
        # which only worked through CPython string interning and raises a
        # SyntaxWarning on Python 3.8+.
        if expression != '*':
            for part in expression.split('+'):
                pieces = part.split('/')
                morphemes.append((pieces[0], pieces[1]))
        else:
            morphemes.append((field[0], features[0]))
    if pos:
        return morphemes
    return [form for form, _tag in morphemes]

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import pickle
import MeCab

# Shared MeCab tagger, created with no arguments (library defaults); it is passed
# to mecabsplit() by the vocabulary, training and inference cells.
tagger = MeCab.Tagger()


In [7]:
class CNN_Text(nn.Module):
    """Convolutional text classifier over token-id sequences.

    Token ids are embedded, passed through one 2-D convolution per kernel
    size (each kernel spans the full embedding width), max-pooled over the
    sequence axis, concatenated, and fed through dropout and a linear layer
    that produces one logit per class.
    """

    def __init__(self, embed_num, class_num, embed_dim=100, kernel_num=20,
                 kernel_sizes=(1, 2, 3), dropout=0.2):
        """Build the layers.

        Args:
            embed_num: Vocabulary size (number of rows in the embedding table).
            class_num: Number of output classes.
            embed_dim: Embedding vector size. Defaults to 100.
            kernel_num: Output channels per convolution. Defaults to 20.
            kernel_sizes: Heights (in tokens) of the convolution kernels.
                Defaults to ``(1, 2, 3)``.
            dropout: Dropout probability applied before the final linear
                layer. Defaults to 0.2.
        """
        super(CNN_Text, self).__init__()

        # Vocabulary size
        V = embed_num
        # Embedding vector size
        D = embed_dim
        # Number of classes to predict
        C = class_num
        # Input channels (the embedded sentence is treated as a 1-channel image)
        Ci = 1
        # Output channels per kernel size
        Co = kernel_num
        # Kernel (filter) heights
        Ks = list(kernel_sizes)
        # Pad the sequence axis so that even a one-token input is at least as
        # long as the largest kernel. For the default sizes this is 2, the
        # value that was previously hard-coded.
        pad = max(Ks) - 1

        self.embed = nn.Embedding(V, D)
        # padding is given as (height, width) = (sequence axis, embedding axis)
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D), padding=(pad, 0)) for K in Ks])
        # Dropout before the classifier
        self.dropout = nn.Dropout(dropout)
        # Fully connected layer over the concatenated pooled features
        self.fc1 = nn.Linear(len(Ks) * Co, C)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Integer tensor of token ids; the code below treats it as
                ``(B, W)`` (batch, sequence length).

        Returns:
            Tensor of shape ``(B, class_num)`` holding unnormalised logits.
        """
        x = self.embed(x)  # (B, W, D)

        # Add a channel axis so Conv2d accepts the input: (B, Ci=1, W, D)
        x = x.unsqueeze(1)
        # Each conv spans the whole embedding width, so its output is
        # (B, Co, W', 1) with W' depending on kernel height and padding;
        # drop the trailing size-1 axis because max_pool1d expects 3-D input.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(B, Co, W'), ...] * len(Ks)

        # Max over the entire sequence axis -> (B, Co, 1), then squeeze -> (B, Co)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(B, Co), ...] * len(Ks)

        # Concatenate the per-kernel features
        x = torch.cat(x, 1)  # (B, len(Ks) * Co)

        x = self.dropout(x)
        logit = self.fc1(x)  # (B, C)
        return logit

In [8]:
# Vocabulary of tokens seen in the training data; index 0 is reserved for the
# 'unk' entry that the inference cell uses for unseen tokens.
content_vocab = {'unk':0}
# Maps intent label -> class index
intent_vocab={}
# Maps class index -> intent label (inverse of intent_vocab)
intent_list=[]

data_intent=''
intent_idx=0
vocab_idx=1

# Lines that contain a tab carry an intent label in their second column; all
# other lines are utterances whose tokens are added to the vocabulary.
# The file is opened in a `with` block so the handle is closed deterministically
# (previously it was opened inline in the `for` header and never closed).
with open('/content/gdrive/My Drive/Colab Notebooks/aivle/data/sonny/mydata.txt','r',encoding='utf-8') as train_file:
    for line in train_file:
        line = line.strip().split('\t')
        if len(line)>1:
            intent=line[1]
            if intent not in intent_vocab:
                intent_vocab[intent]=intent_idx
                intent_list.append(intent)
                intent_idx +=1
        else:
            line = mecabsplit(tagger,line[0],False)
            for it in line:
                if it not in content_vocab:
                    content_vocab[it] = vocab_idx
                    vocab_idx +=1
                
                

In [9]:
# Size the model from the tables built from the training file: vocab_idx counts
# the vocabulary entries (including 'unk'), intent_idx counts the intent classes.
cnn = CNN_Text(embed_num=vocab_idx, class_num=intent_idx)
print(vocab_idx, intent_idx)  # vocabulary size, number of intents (classes)
optimizer = torch.optim.Adam(params=cnn.parameters())
# Switch to training mode; as the cell's last expression this also echoes the model summary.
cnn.train()

31 3


CNN_Text(
  (embed): Embedding(31, 100)
  (convs1): ModuleList(
    (0): Conv2d(1, 20, kernel_size=(1, 100), stride=(1, 1), padding=(2, 0))
    (1): Conv2d(1, 20, kernel_size=(2, 100), stride=(1, 1), padding=(2, 0))
    (2): Conv2d(1, 20, kernel_size=(3, 100), stride=(1, 1), padding=(2, 0))
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=60, out_features=3, bias=True)
)

In [10]:
epoch = 10

# Tokenise the training file once, up front. The samples do not change between
# epochs, so re-reading and re-running MeCab every epoch was loop-invariant work
# (and leaked one open file handle per epoch).
samples = []
target = None
with open('/content/gdrive/My Drive/Colab Notebooks/aivle/data/sonny/mydata.txt','r',encoding='utf-8') as train_file:
    for line in train_file:
        line = line.strip().split('\t')

        # A line with a tab is an intent header: it sets the label applied to
        # the utterance lines that follow it.
        if len(line) > 1:
            target = torch.tensor([intent_vocab[line[1]]], dtype=torch.long)
            continue

        tokens = mecabsplit(tagger, line[0], False)
        if not tokens:
            # Blank lines yield no tokens; skip them instead of training on
            # a padding-only sample.
            continue
        if target is None:
            raise ValueError('utterance found before any intent header line: {!r}'.format(line[0]))

        # view(1, -1): reshape the id list into a batch of one sequence
        cont = torch.tensor([content_vocab[it] for it in tokens], dtype=torch.long).view(1, -1)
        samples.append((cont, target))

# Make sure dropout is active even if this cell is re-run after the eval cell.
cnn.train()
for e in range(epoch):
    # Accumulate as a Python float so the report below also works when there
    # are no samples (the old `totalloss.numpy()` failed on the initial int 0).
    totalloss = 0.0
    for cont, target in samples:
        optimizer.zero_grad()
        pred = cnn(cont)

        loss = F.cross_entropy(pred, target)
        totalloss += loss.item()
        loss.backward()
        optimizer.step()
    print (e, 'epoch')
    print('loss : {:.3f}'.format(totalloss))

0 epoch
loss : 65.657
1 epoch
loss : 14.887
2 epoch
loss : 2.616
3 epoch
loss : 1.419
4 epoch
loss : 1.063
5 epoch
loss : 0.687
6 epoch
loss : 0.628
7 epoch
loss : 0.470
8 epoch
loss : 0.251
9 epoch
loss : 0.218


In [11]:
# One canned reply per line; the inference cell prints response[k] for the
# predicted class index k, so line order must match the intent class order.
# Every line is kept (only stripped) so that indices stay aligned.
with open('/content/gdrive/My Drive/Colab Notebooks/aivle/data/sonny/response.txt','r',encoding='utf-8') as response_file:
    response = [line.strip() for line in response_file]

In [13]:
cnn.eval()
# Inference needs no gradients, so run the loop under no_grad; the `with` block
# also closes the test file deterministically.
with torch.no_grad(), open('/content/gdrive/My Drive/Colab Notebooks/aivle/data/sonny/testdata.txt','r',encoding='utf-8') as test_file:
    for line in test_file:
        line = line.strip()

        line = mecabsplit(tagger,line,False)
        if not line:
            # Blank lines yield no tokens; skip them rather than predicting
            # from padding alone.
            continue

        # cont holds the vocabulary index of each input token; tokens unseen
        # during training map to the 'unk' index.
        unk_idx = content_vocab['unk']
        cont = [content_vocab.get(it, unk_idx) for it in line]
        cont = torch.tensor(cont, dtype=torch.long).view(1,-1)
        pred = cnn(cont)
        # pred holds one raw logit per class (not probabilities);
        # i is the index of the largest logit.
        _, i = torch.max(pred,1)

        print('input : ',line)
        # Per-class probabilities. Print however many classes the model has
        # instead of hard-coding exactly three entries.
        probs = F.softmax(pred,dim=-1).numpy()[0]
        print(list(probs))
        print('intent : ',intent_list[int(i)])
        print(response[int(i)])
        print()

input :  ['손흥민', '하이라이트', '영상', '보이', '어', '주', '어']
[0.00025377495, 0.00024150661, 0.9995047]
intent :  ASK_GOALS_VIDEO
손흥민 선수의 최신 하이라이트 영상입니다.

input :  ['손흥민', '생일', '은', '?']
[0.01654511, 0.97843105, 0.0050238403]
intent :  ASK_BIRTHDATE
손흥민 선수의 생일은 1992년 7월 8일입니다.

input :  ['Sonny', '뉴스', '알리', '어', '주', '어']
[0.999684, 0.0002768042, 3.9158935e-05]
intent :  ASK_NEWS
손흥민 선수에 대한 최신 뉴스입니다.

