### [자연어처리] 텍스트 분류 모델 구현
- 데이터셋 : 한국어 뉴스 기사 총 1,600개 (8개 카테고리, 카테고리별 200개)
- 라벨 : 정치(0), 경제(1), 사회(2), 생활/문화(3), 세계(4), 기술/IT(5), 연예(6), 스포츠(7)

In [1]:
# 모듈 로딩
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch import optim
from konlpy.tag import *

from collections import Counter

import os
import re

import pandas as pd
import numpy as np

In [2]:
# Load the data: root folder of the dataset (its entries are listed and read below)
DATA_PATH = r"C:\Users\kdp\Desktop\DATA\NLP_work_dataset"

In [3]:
# Every entry directly under DATA_PATH is treated as a class-label folder.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# row order (and therefore the seeded train/test split) reproducible.
label_list = sorted(os.listdir(DATA_PATH))
print(label_list)

['0', '1', '2', '3', '4', '5', '6', '7']


In [4]:
# Read every article under DATA_PATH/<label>/ and collect (text, label) pairs.
dict1 = {'text': [], 'label': []}
for num in label_list:
    folder_path = os.path.join(DATA_PATH, num)
    for file in os.listdir(folder_path):
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            data = f.read()

        # Keep only Hangul syllables and whitespace.
        data = re.sub(r'[^가-힣\s]', '', data)
        # Collapse each run of whitespace (spaces, tabs, newlines) into one
        # space. The pattern must be `\s+`: a bare `\s` replaces whitespace
        # character by character and leaves the runs in place.
        data = re.sub(r'\s+', ' ', data)

        dict1['text'].append(data)
        # The folder name is the integer class label.
        dict1['label'].append(int(num))
print(len(dict1['text']), len(dict1['label']))

1600 1600


In [5]:
# Wrap the collected texts and labels in a DataFrame and preview the first rows
textDF = pd.DataFrame(dict1)
textDF.head()

Unnamed: 0,text,label
0,동남아 담당 최희철 부상 베이징 도착싱가포르행 주목 최 부상 행선지방문 목적 질문...,0
1,예결위 추경 막바지 심사 진통여야 충돌 서울연합뉴스 김남권 기자 국회 예산결산특별...,0
2,외압 논란항명 사태산 넘고 물 건넌 권성동 영장 청구 안미현 검사 외압 폭로 파문으...,0
3,친문 홍영표 문빠에 찍혔다특검 합의에 문자폭탄 공격 대표적인 친문재인계인 홍영표 더...,0
4,연일 비난정상회담 전 경고성 메시지 발신 맥스선더태영호 등 불만 표시 최종타켓...,0


In [6]:
# Split into training and test data: a seeded 90% random sample of the rows
# is used for training, and the rows not in that sample are used for testing.
# NOTE: the names `train` and `test` are rebound to functions further down in
# this file, so cells that read these DataFrames must run before those defs.
train = textDF.sample(frac=0.9, random_state=18)
test = textDF.drop(train.index)

print(train.head(5).to_markdown())
print(len(train), len(test))

|      | text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [7]:
# Build the stopword list: path of the stopword text file
stop_path = '../data/stopwords_1009.txt'

In [8]:
# Read the stopword file line by line; readlines() keeps the trailing '\n'
# on each entry (visible in the output), which is removed in a later cell.
with open(stop_path, 'r', encoding='utf-8') as f:
    wordlist = f.readlines()
wordlist

['가\n',
 '가까스로\n',
 '가령\n',
 '각\n',
 '각각\n',
 '각자\n',
 '각종\n',
 '갖고말하자면\n',
 '같다\n',
 '같이\n',
 '개의치않고\n',
 '거니와\n',
 '거바\n',
 '거의\n',
 '것\n',
 '것과 같이\n',
 '것들\n',
 '게다가\n',
 '게우다\n',
 '겨우\n',
 '견지에서\n',
 '결과에 이르다\n',
 '결국\n',
 '결론을 낼 수 있다\n',
 '겸사겸사\n',
 '고려하면\n',
 '고로\n',
 '곧\n',
 '공동으로\n',
 '과\n',
 '과연\n',
 '관계가 있다\n',
 '관계없이\n',
 '관련이 있다\n',
 '관하여\n',
 '관한\n',
 '관해서는\n',
 '구\n',
 '구체적으로\n',
 '구토하다\n',
 '그\n',
 '그들\n',
 '그때\n',
 '그래\n',
 '그래도\n',
 '그래서\n',
 '그러나\n',
 '그러니\n',
 '그러니까\n',
 '그러면\n',
 '그러므로\n',
 '그러한즉\n',
 '그런 까닭에\n',
 '그런데\n',
 '그런즉\n',
 '그럼\n',
 '그럼에도 불구하고\n',
 '그렇게 함으로써\n',
 '그렇지\n',
 '그렇지 않다면\n',
 '그렇지 않으면\n',
 '그렇지만\n',
 '그렇지않으면\n',
 '그리고\n',
 '그리하여\n',
 '그만이다\n',
 '그에 따르는\n',
 '그위에\n',
 '그저\n',
 '그중에서\n',
 '그치지 않다\n',
 '근거로\n',
 '근거하여\n',
 '기대여\n',
 '기점으로\n',
 '기준으로\n',
 '기타\n',
 '까닭으로\n',
 '까악\n',
 '까지\n',
 '까지 미치다\n',
 '까지도\n',
 '꽈당\n',
 '끙끙\n',
 '끼익\n',
 '나\n',
 '나머지는\n',
 '남들\n',
 '남짓\n',
 '너\n',
 '너희\n',
 '너희들\n',
 '네\n',
 '넷\n',
 '년\n',
 '논하지 않다\n',
 '놀라다\n',

In [9]:
# Drop the newline characters that readlines() left in every stopword entry.
stopwords = [word.replace('\n', '') for word in wordlist]
print(len(wordlist), len(stopwords))
print(stopwords[-10:])

637 637
['대해', '밝혔다', '못', '안', '때문', '위해', '통해', '대', '된다', '더']


In [10]:
# Tokenize the data and build the vocabulary
counter = Counter()  # Counter object intended for token frequencies
okt = Okt()  # Okt tagger from konlpy.tag; its morphs() is used for tokenization

In [11]:
# Tokenize each document with Okt morphemes and drop stopwords.
# Testing membership against the stopword *list* scans it for every token;
# a set built once gives the same result with O(1) lookups.
stopword_set = set(stopwords)
train_tokens = [[token for token in okt.morphs(sentence) if token not in stopword_set] for sentence in train.text]
test_tokens = [[token for token in okt.morphs(sentence) if token not in stopword_set] for sentence in test.text]

In [12]:
def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list from a tokenized corpus.

    Args:
        corpus: iterable of token sequences (one sequence per document).
        n_vocab: maximum number of corpus tokens to keep, most frequent first.
        special_tokens: tokens placed at the front of the vocabulary.

    Returns:
        A new list: the special tokens followed by up to ``n_vocab`` of the
        most frequent tokens in ``corpus``.
    """
    # Count in a fresh, local Counter. Accumulating into a module-level
    # counter made the result depend on earlier calls (re-running the cell
    # double-counted every token).
    token_counts = Counter()
    for tokens in corpus:
        token_counts.update(tokens)

    # Copy so the caller's list is not mutated through aliasing.
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in token_counts.most_common(n_vocab))
    return vocab

In [13]:
vocab = build_vocab(corpus=train_tokens, n_vocab=5000, special_tokens=['<PAD>', '<UNK>'])

# Lookup tables in both directions: position in `vocab` <-> token.
id_to_token = dict(enumerate(vocab))
token_to_id = {token: idx for idx, token in id_to_token.items()}

print(vocab[:20])
print(len(vocab))

print(token_to_id)
print(id_to_token)

['<PAD>', '<UNK>', '한국', '서울', '북한', '미국', '성', '시', '대한', '당', '중', '대통령', '지', '원', '사람', '중국', '관련', '서', '장', '분']
5002
{'<PAD>': 0, '<UNK>': 1, '한국': 2, '서울': 3, '북한': 4, '미국': 5, '성': 6, '시': 7, '대한': 8, '당': 9, '중': 10, '대통령': 11, '지': 12, '원': 13, '사람': 14, '중국': 15, '관련': 16, '서': 17, '장': 18, '분': 19, '될': 20, '경찰': 21, '정부': 22, '이후': 23, '내': 24, '하지': 25, '대표': 26, '보다': 27, '같은': 28, '문제': 29, '개': 30, '사실': 31, '에서는': 32, '있다고': 33, '후보': 34, '두': 35, '경우': 36, '조사': 37, '게': 38, '되는': 39, '지난해': 40, '뒤': 41, '김': 42, '대회': 43, '관계자': 44, '점': 45, '했다고': 46, '상황': 47, '주': 48, '세': 49, '이어': 50, '따르면': 51, '최근': 52, '연': 53, '의원': 54, '혐의': 55, '에도': 56, '진행': 57, '후': 58, '달': 59, '입니다': 60, '시장': 61, '하기': 62, '현재': 63, '확인': 64, '지역': 65, '결과': 66, '하며': 67, '기': 68, '라고': 69, '간': 70, '예정': 71, '주장': 72, '돼': 73, '상': 74, '하면': 75, '해야': 76, '설명': 77, '시작': 78, '팀': 79, '뉴스': 80, '조': 81, '세계': 82, '이나': 83, '문': 84, '경기': 85, '한다고': 86, '사건': 87, '곳': 88, '생각': 89, 

In [14]:
# Integer encoding and padding
def pad_sequences(sequences, max_length, pad_value):
    """Truncate or right-pad every sequence to exactly ``max_length`` items.

    Args:
        sequences: iterable of lists.
        max_length: target length of every returned row.
        pad_value: value appended to sequences shorter than ``max_length``.

    Returns:
        ``np.asarray`` of the fixed-length rows.
    """
    def fit(seq):
        # Keep at most max_length items, then fill the remainder with padding.
        head = seq[:max_length]
        return head + [pad_value] * (max_length - len(head))

    return np.asarray([fit(seq) for seq in sequences])

In [15]:
# Ids of the two special tokens and the fixed sequence length.
unk_id = token_to_id['<UNK>']
pad_id = token_to_id['<PAD>']
max_length = 200

# Map every token to its id; tokens missing from the vocabulary become <UNK>.
train_ids, test_ids = (
    [[token_to_id.get(token, unk_id) for token in tokens] for tokens in corpus]
    for corpus in (train_tokens, test_tokens)
)

# Truncate / pad every document to max_length ids.
train_ids = pad_sequences(train_ids, max_length, pad_id)
test_ids = pad_sequences(test_ids, max_length, pad_id)

print(train_ids[0])
print(test_ids[0])

[   1 4802 1354    1 1355    1 1354 3661    1 2968   80   42    1  722
    1    1 4802 1354 3347 2303 4803    1    1    1 1354 3661  315    1
  436  262 4802 1354   52    1 1464   46  429   50 4802 1354  173 1313
    1  146 1072  861  629 2443    1  481  202    1  513 4802 1354  209
  114    1    1 2969    1  155  630    1    1  816 3843 1877 2532  366
 2969    1   67 1072 2743    1 2625 4257  573 1010  694 1072  155   31
    1 1149    1    1 1438  114    1    1  213  178  166 1753    1    1
    1 2444    1  383 4802 1354 1010  301 1194  114   27  147  396    1
 2844 2626    1  258  213    1 1043 4804  250    1  617 2101  678 4802
 1354    1 3844    1    1 1354 3661    1    1    1  500    1 3845    1
 4805  825    1  918 2532 2970    1 3490  192 2038    1  189 4802 1354
 1355  959  617    1  384 4806    1    1 1072  210    1    1    1    1
    1    1    1    1    1    1  101  190  352 4507 3662 4032 1173    1
   18    1 1465  161  262    1    1 2744 4507  132  115  178    1    1
    1 

In [16]:
# 분류 클래스 생성
class SentenceClassifier(nn.Module):
    def __init__(self, n_vocab, hidden_dim, embedding_dim, n_layers, dropout=0.5, bidirectional=True, model_type = "lstm"):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embedding_dim, padding_idx=0)

        if model_type == 'rnn':
            self.model = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        elif model_type == 'lstm':
            self.model = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        
        if bidirectional:
            self.classifier = nn.Linear(hidden_dim*2, 1)
        else :
            self.classifier = nn.Linear(hidden_dim, 1)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output,_ = self.model(embeddings)
        last_output = output[:,-1,:]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [17]:
# 데이터로더 적용
train_ids = torch.tensor(train_ids)
test_ids = torch.tensor(test_ids)

train_labels = torch.tensor(train['label'].values, dtype=torch.float32)
test_labels = torch.tensor(test['label'].values, dtype=torch.float32)

train_dataset = TensorDataset(train_ids, train_labels)
test_dataset = TensorDataset(test_ids, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [18]:
# 손실함수와 최적화 함수 정의
n_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layers = 2

device = 'cuda' if torch.cuda.is_available() else 'cpu'

classifier = SentenceClassifier(n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layers=n_layers).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

In [19]:
# 모델 학습 및 테스트
def train(model, datasets, criterion, optimizer, device, interval):
    model.train
    losses = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step% interval == 0:
            print(f'Train Loss {step} : {np.mean(losses)}')

In [20]:
def test(model, datasets, criterion, device):
    model.eval()
    losses = list()
    corrects = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        target = list(range(7))

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.nn.Softmax(logits)
        corrects.extend(torch.eq(yhat, labels).cpu().tolist())

    print(f"Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}")    

In [21]:
epochs = 5  # number of train + evaluate rounds
interval = 50  # passed to train(), which prints the loss when step % interval == 0

# Each epoch: one training pass over train_loader, then one evaluation on test_loader.
for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0 : 0.0
Val Loss : 0.0, Val Accuracy : 0.0
Train Loss 0 : 0.0
Val Loss : 0.0, Val Accuracy : 0.0
Train Loss 0 : 0.0
Val Loss : 0.0, Val Accuracy : 0.0
Train Loss 0 : 0.0
Val Loss : 0.0, Val Accuracy : 0.0
Train Loss 0 : 0.0
Val Loss : 0.0, Val Accuracy : 0.0


In [22]:
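# This is multi-class classification, so softmax is the appropriate output function.
# The predicted class is the index with the highest value, which should be
# compared against the target label to measure accuracy.
# TODO: revisit and improve the evaluation logic.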