- Preprocessing

In [1]:
import os
from collections import defaultdict
import re
import gensim
import numpy as np
import sqlite3
import pandas as pd

max_length = 800

pat = re.compile(r"[\w]+|[.,!?;|]")
codes = {'두산퓨얼셀':'336260', '한화솔루션':'009830', '한화케미칼':'009830','에코프로':'086520', '신성이엔지':'011930', '유니슨':'018000', '카카오':'035720', '네이버':'035420', 'NAVER':'035420', '엔씨소프트':'036570', '넥슨지티':'041140', 'SDS':'018260', '삼성에스디에스':'018260', '삼성전자':'005930', 'SK하이닉스':'000660', '하이닉스':'000660', 'SKC':'011790', '한솔케미칼':'014680', 'DB하이텍':'000990', '하이텍':'000990', '롯데쇼핑':'023530', '신세계':'004170', '호텔신라':'008770', '삼성물산':'028260', '동서':'026960', 'KB금융':'105560', '금융':'105560', '한국금융지주':'071050', '신한지주':'055550', '우리금융지주':'316140', '하나금융지주':'086790', '한빛소프트':'047080', '코세스':'089890', '드래곤플라이':'030350', '에이트원':'230980', '엔텔스':'069410'}
ko_model = gensim.models.fasttext.load_facebook_model('cc.ko.300.bin.gz') # https://fasttext.cc/docs/en/crawl-vectors.html 에서 다운받고, 같은 폴더에 넣기

# file list 불러오기
os.chdir('C:\\Users\\yungi\\Desktop\\AYOA\\preprocess\\PDFtoTEXT')
total_list = dict()
for cat in os.listdir():
    total_list[cat] = [(i, i.split('_')[1].split('(')[0]) for i in os.listdir(cat)]
        
# 각 파일을 열어서, 안의 내용 저장 (stopwords 처리 및 tokenizing)
total = defaultdict(lambda: defaultdict(lambda : []))
total_date = defaultdict(lambda: defaultdict(lambda : []))
for k, v in total_list.items():
    for i, j in v:
        with open(os.path.join(k, i), 'r', encoding='utf-8') as f:
            line = f.readlines()
        total[k][codes[j]].append(pat.findall(re.sub('[-_0-9.,]', ' ', ''.join(line))))
        total_date[k][codes[j]].append(re.sub('-', '', i).split('_')[0])
        
# 단어 목록 생성
vocab_candidate = []
for k in total.keys():
    for k_ in total[k].keys():
        for v_ in total[k][k_]:
            vocab_candidate.extend(v_)
vocab_candidate = list(set(vocab_candidate))

# 단어 목록 중 pre-trained embedding matrix에 있는 단어 추출 및 embedding matrix 추출
vocab = [i for i in vocab_candidate if ko_model.wv.vocab.get(i)]
vocab_size = len(vocab)

embedding_matrix = np.zeros((vocab_size + 1, 300))

word2idx = dict()
for i, v in enumerate(vocab):
    idx = ko_model.wv.index2word.index(v)
    word2idx[v] = i + 1
    embedding_matrix[i + 1] = ko_model.wv.vectors[idx]
    
# OOV words 처리
for k in total.keys():
    for k_ in total[k].keys():
        for e, v_ in enumerate(total[k][k_]):
            total[k][k_][e] = [word2idx[j] for j in v_ if word2idx.get(j)]
            
# 최대 길이 (300)로 padding
for k in total.keys():
    for k_ in total[k].keys():
        for e, v_ in enumerate(total[k][k_]):
            if len(v_) < max_length:
                total[k][k_][e] = np.array(v_ + [0] * (max_length - len(v_)))
            else:
                total[k][k_][e] = np.array(v_[:max_length])
            
# 불필요한 label 제거
tmp = dict()
for v in total.values():
    tmp.update(v)
total = tmp

tmp = dict()
for v in total_date.values():
    tmp.update(v)
total_date = tmp

# 날짜에 맞는 label 불러와서, 하나의 dataset으로 만들기
os.chdir('C:\\Users\\yungi\\Desktop')

conn = sqlite3.connect('Stock.db')
cur = conn.cursor()

first_price = cur.execute("SELECT code, date, price FROM (SELECT code, date, price, ROW_NUMBER() OVER (PARTITION BY code, date ORDER BY time ASC) AS RankNo FROM stock) T WHERE RankNo = 1").fetchall()
last_price = cur.execute("SELECT code, date, price FROM (SELECT code, date, price, ROW_NUMBER() OVER (PARTITION BY code, date ORDER BY time DESC) AS RankNo FROM stock) T WHERE RankNo = 1").fetchall()

first_price = pd.DataFrame(first_price, columns=['code', 'date', 'price'])
last_price = pd.DataFrame(last_price, columns=['code', 'date', 'price'])

first_price = first_price[1:].reset_index(drop=True)
last_price = last_price[:-1].reset_index(drop=True)
first_price['delta_price'] = first_price['price'].apply(lambda x : int(re.sub(',', '', x))) - last_price['price'].apply(lambda x: int(re.sub(',', '',x)))

for (k, v1), v2 in zip(total.items(), total_date.values()):
    v1_tmp = []
    for v1_, v2_ in zip(v1, v2):
        if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
            v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
    total[k] = np.array(v1_tmp)
    
# mask 생성
total_mask = dict()
for k, v in total.items():
    if v.any():
        total_mask[k] = np.cast[np.int32](v[:,:-1] != 0)

  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price

  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price

  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(fir

  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(fir

  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price['code'] == k][first_price['date'] == v2_]['delta_price'] > 0)))
  if len(first_price[first_price['code'] == k][first_price['date'] == v2_]):
  v1_tmp.append(np.append(v1_, np.cast['int32'](first_price[first_price

- Model

In [149]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import math

class MultiHeadAttention(nn.Module):
    def __init__(self, h: int, d_model: int, len_q: int, len_k: int, d_k: int, d_v: int):
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.d_model = d_model
        self.len_q = len_q
        self.len_k = len_k
        self.d_k = d_k
        self.d_v = d_v
        self.out_dim = self.h * self.d_v
        self.attention_scalar = math.sqrt(float(self.d_k))
        self.W_Q = nn.Linear(in_features=d_model, out_features=self.h*self.d_k, bias=True)
        self.W_K = nn.Linear(in_features=d_model, out_features=self.h*self.d_k, bias=True)
        self.W_V = nn.Linear(in_features=d_model, out_features=self.h*self.d_v, bias=True)

    def initialize(self):
        nn.init.xavier_uniform_(self.W_Q.weight)
        nn.init.zeros_(self.W_Q.bias)
        nn.init.xavier_uniform_(self.W_K.weight)
        nn.init.zeros_(self.W_K.bias)
        nn.init.xavier_uniform_(self.W_V.weight)
        nn.init.zeros_(self.W_V.bias)

    # Input
    # Q    : [batch_size, len_q, d_model]
    # K    : [batch_size, len_k, d_model]
    # V    : [batch_size, len_k, d_model]
    # mask : [batch_size, len_k]
    # Output
    # out  : [batch_size, len_q, h * d_v]
    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_Q(Q).view([batch_size, self.len_q, self.h, self.d_k])                                           # [batch_size, len_q, h, d_k]
        K = self.W_K(K).view([batch_size, self.len_k, self.h, self.d_k])                                           # [batch_size, len_k, h, d_k]
        V = self.W_V(V).view([batch_size, self.len_k, self.h, self.d_v])                                           # [batch_size, len_k, h, d_v]
        Q = Q.permute(0, 2, 1, 3).contiguous().view([batch_size * self.h, self.len_q, self.d_k])                   # [batch_size * h, len_q, d_k]
        K = K.permute(0, 2, 1, 3).contiguous().view([batch_size * self.h, self.len_k, self.d_k])                   # [batch_size * h, len_k, d_k]
        V = V.permute(0, 2, 1, 3).contiguous().view([batch_size * self.h, self.len_k, self.d_v])                   # [batch_size * h, len_k, d_v]
        A = torch.bmm(Q, K.permute(0, 2, 1).contiguous()) / self.attention_scalar                                  # [batch_size * h, len_q, len_k]
        if mask != None:
            _mask = mask.repeat([1, self.h]).view([batch_size * self.h, 1, self.len_k]).repeat([1, self.len_q, 1]) # [batch_size * h, len_q, len_k]
            alpha = F.softmax(A.masked_fill(_mask == 0, -1e9), dim=2)                                              # [batch_size * h, len_q, len_k]
        else:
            alpha = F.softmax(A, dim=2)                                                                            # [batch_size * h, len_q, len_k]
        out = torch.bmm(alpha, V).view([batch_size, self.h, self.len_q, self.d_v])                                 # [batch_size, h, len_q, d_v]
        out = out.permute([0, 2, 1, 3]).contiguous().view([batch_size, self.len_q, self.out_dim])                  # [batch_size, len_q, h * d_v]
        return out
    
class Attention(nn.Module):
    def __init__(self, feature_dim: int, attention_dim: int):
        super(Attention, self).__init__()
        self.affine1 = nn.Linear(in_features=feature_dim, out_features=attention_dim, bias=True)
        self.affine2 = nn.Linear(in_features=attention_dim, out_features=1, bias=False)

    def initialize(self):
        nn.init.xavier_uniform_(self.affine1.weight, gain=nn.init.calculate_gain('tanh'))
        nn.init.zeros_(self.affine1.bias)
        nn.init.xavier_uniform_(self.affine2.weight)

    # Input
    # feature : [batch_size, length, feature_dim]
    # mask    : [batch_size, length]
    # Output
    # out     : [batch_size, feature_dim]
    def forward(self, feature, mask=None):
        attention = torch.tanh(self.affine1(feature))                                 # [batch_size, length, attention_dim]
        a = self.affine2(attention).squeeze(dim=2)                                    # [batch_size, length]
        if mask is not None:
            alpha = F.softmax(a.masked_fill(mask == 0, -1e9), dim=1).unsqueeze(dim=1) # [batch_size, 1, length]
        else:
            alpha = F.softmax(a, dim=1).unsqueeze(dim=1)                              # [batch_size, 1, length]
        out = torch.bmm(alpha, feature).squeeze(dim=1)                                # [batch_size, feature_dim]
        return out
    
class MHSA(nn.Module):
    def __init__(self, embedding_matrix, vocab_size, word_dim=300, max_length=800, head_num=20, head_dim=20, dropout_rate=0.2, att_dim=200):
        super(MHSA, self).__init__()
        self.word_dim = word_dim
        self.max_length = max_length
        self.head_num = head_num
        self.head_dim = head_dim
        self.output_dim = self.head_num * self.head_dim
        self.dropout_rate = dropout_rate
        self.att_dim = att_dim
        self.word_embedding = nn.Embedding(num_embeddings=vocab_size+1, embedding_dim=self.word_dim)
        self.word_embedding.weight.data.copy_(torch.tensor(embedding_matrix))
        self.dropout = nn.Dropout(p=self.dropout_rate, inplace=True)
        self.multiheadAttention = MultiHeadAttention(self.head_num, self.word_dim, self.max_length, self.max_length, self.head_dim, self.head_dim)
        self.attention = Attention(self.output_dim, self.att_dim)
        self.classifier = nn.Linear(in_features=self.output_dim, out_features=1, bias=True)

    def initialize(self):
        super().initialize()
        self.multiheadAttention.initialize()
        self.attention.initialize()
        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, text, mask):
        text = text.long()
        batch_size = text.size(0)
        
        # 1. word embedding
        w = self.dropout(self.word_embedding(text))
        # 2. multi-head self-attention
        c = self.dropout(self.multiheadAttention(w, w, w, mask))
        # 3. attention layer
        rep = self.attention(c, mask=mask)
        # 4. classification layer
        output = self.classifier(rep).view([batch_size,])
        
        return output

- Training

In [168]:
import torch.utils.data as data
from torch.utils.data import DataLoader
import platform
import torch.optim as optim
from tqdm import tqdm

EPOCHS = 10

class Train_Dataset(data.Dataset):
    def __init__(self, text, mask):
        self.text = text[:, :-1]
        self.mask = mask
        self.label = text[:, -1]
        self.num = len(self.text)

    def __getitem__(self, index):
        return self.text[index], self.mask[index], self.label[index]
    
    def __len__(self):
        return self.num
    
batch_size = 32
text_tmp = np.array([])
mask_tmp = np.array([])
for k, v in total.items():
    if v.any():
        if text_tmp.any():
            text_tmp = np.append(text_tmp, v, axis=0)
            mask_tmp = np.append(mask_tmp, total_mask[k], axis=0)
        else:
            text_tmp = v
            mask_tmp = total_mask[k]
total = text_tmp
total_mask = mask_tmp
            
train_dataset = Train_Dataset(total, total_mask)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=batch_size // 8 if platform.system() == 'Linux' else 0, pin_memory=True)

model = MHSA(embedding_matrix=embedding_matrix, vocab_size=vocab_size)
model.train()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
sigmoid = nn.Sigmoid()
loss_obj = nn.BCELoss()
for epoch in tqdm(range(EPOCHS)):
    for text, mask, label in train_dataloader:
        pred = model(text, mask)
        loss = loss_obj(sigmoid(pred), label.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_acc = (sigmoid(pred).reshape(-1).detach().numpy().round() == np.array(label)).mean()
        print('Epoch: {}, training loss: {}, traing acc: {}'.format(epoch+1, loss, train_acc))

  0%|                                                                                                                              | 0/10 [00:00<?, ?it/s]

Epoch: 1, training loss: 0.698244035243988, traing acc: 0.40625
Epoch: 1, training loss: 0.693142294883728, traing acc: 0.46875
Epoch: 1, training loss: 0.6815590858459473, traing acc: 0.625
Epoch: 1, training loss: 0.7077873349189758, traing acc: 0.40625


 10%|███████████▊                                                                                                          | 1/10 [00:19<02:53, 19.29s/it]

Epoch: 1, training loss: 0.6761325597763062, traing acc: 0.5925925925925926
Epoch: 2, training loss: 0.6683357954025269, traing acc: 0.625
Epoch: 2, training loss: 0.6651118397712708, traing acc: 0.59375
Epoch: 2, training loss: 0.6720547676086426, traing acc: 0.5625
Epoch: 2, training loss: 0.7151299715042114, traing acc: 0.40625


 20%|███████████████████████▌                                                                                              | 2/10 [00:37<02:27, 18.44s/it]

Epoch: 2, training loss: 0.6983191967010498, traing acc: 0.48148148148148145
Epoch: 3, training loss: 0.6550689339637756, traing acc: 0.59375
Epoch: 3, training loss: 0.7379382252693176, traing acc: 0.34375
Epoch: 3, training loss: 0.6412619352340698, traing acc: 0.65625
Epoch: 3, training loss: 0.6516938209533691, traing acc: 0.65625


 30%|███████████████████████████████████▍                                                                                  | 3/10 [00:55<02:07, 18.22s/it]

Epoch: 3, training loss: 0.6553714275360107, traing acc: 0.5555555555555556
Epoch: 4, training loss: 0.6751400232315063, traing acc: 0.53125
Epoch: 4, training loss: 0.6359124183654785, traing acc: 0.625
Epoch: 4, training loss: 0.6668784618377686, traing acc: 0.53125
Epoch: 4, training loss: 0.6583999991416931, traing acc: 0.625


 40%|███████████████████████████████████████████████▏                                                                      | 4/10 [01:13<01:49, 18.17s/it]

Epoch: 4, training loss: 0.6164798736572266, traing acc: 0.6666666666666666
Epoch: 5, training loss: 0.6277587413787842, traing acc: 0.6875
Epoch: 5, training loss: 0.6706944704055786, traing acc: 0.5625
Epoch: 5, training loss: 0.655133843421936, traing acc: 0.625
Epoch: 5, training loss: 0.5858749747276306, traing acc: 0.8125


 50%|███████████████████████████████████████████████████████████                                                           | 5/10 [01:31<01:30, 18.19s/it]

Epoch: 5, training loss: 0.5670214295387268, traing acc: 0.8148148148148148
Epoch: 6, training loss: 0.6111074686050415, traing acc: 0.6875
Epoch: 6, training loss: 0.6432942152023315, traing acc: 0.65625
Epoch: 6, training loss: 0.544790506362915, traing acc: 0.78125
Epoch: 6, training loss: 0.5705369114875793, traing acc: 0.75


 60%|██████████████████████████████████████████████████████████████████████▊                                               | 6/10 [01:49<01:12, 18.05s/it]

Epoch: 6, training loss: 0.5195763111114502, traing acc: 0.8148148148148148
Epoch: 7, training loss: 0.5195821523666382, traing acc: 0.78125
Epoch: 7, training loss: 0.5727844834327698, traing acc: 0.65625
Epoch: 7, training loss: 0.566159188747406, traing acc: 0.71875
Epoch: 7, training loss: 0.5014485120773315, traing acc: 0.78125


 70%|██████████████████████████████████████████████████████████████████████████████████▌                                   | 7/10 [02:07<00:54, 18.02s/it]

Epoch: 7, training loss: 0.4556007385253906, traing acc: 0.8148148148148148
Epoch: 8, training loss: 0.4047117233276367, traing acc: 0.875
Epoch: 8, training loss: 0.5200387835502625, traing acc: 0.6875
Epoch: 8, training loss: 0.45303037762641907, traing acc: 0.8125
Epoch: 8, training loss: 0.45555445551872253, traing acc: 0.75


 80%|██████████████████████████████████████████████████████████████████████████████████████████████▍                       | 8/10 [02:25<00:36, 18.13s/it]

Epoch: 8, training loss: 0.4460126757621765, traing acc: 0.8518518518518519
Epoch: 9, training loss: 0.4163234531879425, traing acc: 0.8125
Epoch: 9, training loss: 0.40408802032470703, traing acc: 0.8125
Epoch: 9, training loss: 0.31004786491394043, traing acc: 0.9375
Epoch: 9, training loss: 0.4204442501068115, traing acc: 0.8125


 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 9/10 [02:43<00:18, 18.09s/it]

Epoch: 9, training loss: 0.3041757047176361, traing acc: 0.8888888888888888
Epoch: 10, training loss: 0.2799375057220459, traing acc: 0.90625
Epoch: 10, training loss: 0.36211445927619934, traing acc: 0.90625
Epoch: 10, training loss: 0.3197527229785919, traing acc: 0.84375
Epoch: 10, training loss: 0.2376508116722107, traing acc: 0.96875


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:01<00:00, 18.15s/it]

Epoch: 10, training loss: 0.25086861848831177, traing acc: 0.9629629629629629



