In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# References

- [SKTBrain/KoBERT](https://github.com/SKTBrain/KoBERT)
- [eagle705/pytorch-bert-crf-ner](https://github.com/eagle705/pytorch-bert-crf-ner/blob/master/Visualization_BERT_NER.ipynb)
- [BERT to the rescue!](https://towardsdatascience.com/bert-to-the-rescue-17671379687f)
- [eagle705/pytorch-bert-crf-ner](https://github.com/eagle705/pytorch-bert-crf-ner/blob/master/data_utils/vocab_tokenizer.py)
- [pytorch-pretrained-bert](https://pypi.org/project/pytorch-pretrained-bert/#usage)

# Load pre-trained model

In [2]:
import io
import time

import pandas as pd
import numpy as np

import tqdm
import multiprocessing
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
font_dirs = ['/usr/share/fonts/truetype/nanum']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
font_list = font_manager.createFontList(font_files)
font_manager.fontManager.ttflist.extend(font_list)
plt.rcParams['font.family'] = 'NanumGothic'

import torch
import torch.nn as nn
from kobert.pytorch_kobert import get_pytorch_kobert_model
from gluonnlp.data import SentencepieceTokenizer
from kobert.utils import get_tokenizer
from keras.preprocessing.sequence import pad_sequences
import sentencepiece as spm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, vocab = get_pytorch_kobert_model()
model.to(device)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


using cached model
using cached model


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Line

In [3]:
# Load pre-trained model tokenizer
tok_path = get_tokenizer()
print(tok_path)
sp = spm.SentencePieceProcessor()
print(sp)
sp.Load(tok_path)

using cached model
/root/kobert/tokenizer_78b3253a26.model
<sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7f684992fcf0> >


True

In [4]:
# Tokenized input
text = ["누가 기침소리를 내었는가 ? 누구인가 ?"]
train_tokens = list(map(lambda t: ['[CLS]'] + sp.EncodeAsPieces(t) + ['[SEP]'], text))

print(text[0])
print(train_tokens[0])

train_tokens_ids = pad_sequences(list(map(vocab.to_indices, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
print(train_tokens_ids[0][:20])

누가 기침소리를 내었는가 ? 누구인가 ?
['[CLS]', '▁누가', '▁기', '침', '소리', '를', '▁내', '었', '는', '가', '▁?', '▁누구', '인', '가', '▁?', '[SEP]']
[   2 1527 1258 7491 6609 6116 1434 6885 5760 5330  633 1528 7119 5330
  633    3    0    0    0    0]


# Load Data

In [5]:
num_of_cores = 8

In [6]:
# Load and preprocess
price = pd.read_csv('data/example/price_005930.csv', index_col=0)
print(price.shape)

'''
# convert timestamp of price dataframe as datetime object
pool = multiprocessing.pool.ThreadPool(num_of_cores)
args = list(range(price.shape[0]))
def price_to_datetime(i) : 
    price['time'].iloc[i] = pd.to_datetime(str(price['date'].iloc[i])+str(price['time'].iloc[i]), format='%Y%m%d%H%M')
    pbar.update(1)
    return
with tqdm.tqdm(total=len(args)) as pbar:
    for i in range(len(args)):
        pool.apply_async(price_to_datetime, [args[i]])
    pool.close()
    pool.join()       
'''
# convert timestamp of price dataframe as datetime object
with tqdm.tqdm(total=price.shape[0]) as pbar :
    for i in range(price.shape[0]) : 
        price['time'].iloc[i] = pd.to_datetime(str(price['date'].iloc[i])+str(price['time'].iloc[i]), format='%Y%m%d%H%M')
        pbar.update(1)   
# sort in ascending order
price = price.sort_values(by='time').reset_index(drop=True)
price.index = price['time']
price = price.drop(['date', 'time'], axis=1)
display(price)

  0%|          | 0/3429 [00:00<?, ?it/s]

(3429, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
100%|██████████| 3429/3429 [00:03<00:00, 887.67it/s] 


Unnamed: 0_level_0,close,high,low,open,vol
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-10-31 09:01:00,50900,51100,50900,51000,580976
2019-10-31 09:02:00,51100,51100,50900,50900,46699
2019-10-31 09:03:00,51100,51100,51000,51100,13198
2019-10-31 09:04:00,51000,51100,51000,51100,37929
2019-10-31 09:05:00,51000,51100,50900,51000,83816
...,...,...,...,...,...
2019-11-12 15:17:00,52400,52500,52400,52500,12420
2019-11-12 15:18:00,52500,52500,52400,52400,20269
2019-11-12 15:19:00,52500,52500,52400,52500,18480
2019-11-12 15:20:00,52500,52500,52400,52500,47526


In [None]:
# Load and preprocess
reply = pd.read_csv('data/example/reply_005930.csv', index_col=0).reset_index(drop=True)
print(reply.shape)

# wrong data in the dataframe
#display(pd.DataFrame(reply.iloc[209]).T)
#display(pd.DataFrame(reply.iloc[210]).T)

'''
# convert timestamp of reply dataframe as datetime object, drop wrong rows
list_to_drop = []
pool = multiprocessing.pool.ThreadPool(num_of_cores)
args = list(range(reply.shape[0]))
def reply_to_datetime(i) : 
    try : 
        reply['Date'].iloc[i] = pd.to_datetime(reply['Date'].iloc[i], format='%Y.%m.%d %H:%M')
        pbar.update(1)
    except Exception as e :
        list_to_drop.append(i)
        pbar.update(1)
    return
with tqdm.tqdm(total=len(args)) as pbar:
    for i in range(len(args)):
        pool.apply_async(reply_to_datetime, [args[i]])
    pool.close()
    pool.join() 
'''
# convert timestamp of reply dataframe as datetime object, drop wrong rows
list_to_drop = []
with tqdm.tqdm(total=reply.shape[0]) as pbar : 
    for i in range(reply.shape[0]) : 
        try : 
            reply['Date'].iloc[i] = pd.to_datetime(reply['Date'].iloc[i], format='%Y.%m.%d %H:%M')
            pbar.update(1)
        except Exception as e :
            list_to_drop.append(i)
            pbar.update(1)
reply = reply.drop(list_to_drop, axis=0)
      
# sort in ascending order
reply = reply.sort_values(by='Date').reset_index(drop=True)
reply.index = reply['Date']
reply = reply.drop(['Date'], axis=1)

display(reply[0:3])

  0%|          | 75/20129 [00:00<00:27, 742.31it/s]

(20129, 7)


 88%|████████▊ | 17614/20129 [00:24<00:03, 706.60it/s]

# Merge Dataset

In [None]:
day_start = max(price.index[0], reply.index[0])
day_end = min(price.index[-1], reply.index[-1])
print(day_start)
print(day_end)

In [None]:
# merge dataframe
reply_common = reply[(reply.index >= day_start) & (reply.index <= day_end)]
price_common = price[(price.index >= day_start) & (price.index <= day_end)]
df = reply_common.merge(price_common, how='outer', left_on=reply_common.index, right_on=price_common.index)
df.index = df['key_0']
df = df.drop(['key_0'], axis=1)
display(df)

# check intersection of timestamp
plt.figure(figsize=(10, 4))
plt.plot(price.index, np.zeros(price.shape[0]), label='price')
plt.plot(reply.index, np.ones(reply.shape[0]), label='reply')
plt.plot(df.index, np.full(df.shape[0], 2), label='merged df')
plt.xticks(rotation=90)
plt.legend()
plt.show()

# Preprocess Functions

### 부탁드릴 내용
- 제목 미리보기?
    - '오늘 실적발표 일...'
    - 페이지에 들어가서 잘리지 않은 제목으로

In [None]:
from urlextract import URLExtract
import re

In [None]:
df[0:1]

In [None]:
def replace_tag(content) : 
    return content.replace('\r', '').replace('\n', '').replace('\\', '')

In [None]:
def replace_link(content): 
    extractor = URLExtract()
    urls = extractor.find_urls(content)
    for url in urls : 
        content = content.replace(url, '링크')

    return content

In [None]:
def replace_punctuation(content) : 
    return content.replace(',', '')

In [None]:
def get_numbers(content) : 
    content = content.replace('만원', '0000')
    return content, [int(x) for x in re.findall('\d+', content)]

In [None]:
band = 0.1
# price in txt가 close와 얼마나 비슷한지 확인하고, 너무 차이나면 관계없는 숫자이기때문에 걸러야 함
# price in txt의 dimension도 확인하고, 너무 많을 경우 의미없는 숫자들일 가능성이 큼
def get_price_in_txt(numbers_in_txt, close) : 
    if len(numbers_in_txt) == 1 :
        number_in_txt = numbers_in_txt[0]
        if (number_in_txt <= close * (1+band)) and (number_in_txt >= close * (1-band)) :
            return number_in_txt
        else : 
            return -1
    else :
        return -1

In [None]:
def preprocess_pipeline(text, close) : 
    bypass = replace_tag(text)
    bypass = replace_link(bypass)
    bypass = replace_punctuation(bypass)
    bypass, numbers_in_txt = get_numbers(bypass)
    price_in_txt = get_price_in_txt(numbers_in_txt, close)
    
    return bypass, price_in_txt

### Examples

In [None]:
idx = 10
close = df['close'][idx]
text = df['Title'][idx]
bypass, price_in_txt = preprocess_pipeline(text, close)
print(bypass)
print(close, price_in_txt)

# Tokenized input
bypass = [bypass]
train_tokens = list(map(lambda t: ['[CLS]'] + sp.EncodeAsPieces(t) + ['[SEP]'], bypass))
print(train_tokens[0])
train_tokens_ids = pad_sequences(list(map(vocab.to_indices, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
print(train_tokens_ids[0][:20])

In [None]:
idx = 31
close = df['close'][idx]
text = df['Content'][idx]
bypass, price_in_txt = preprocess_pipeline(text, close)
print(bypass)
print(close, price_in_txt)

# Tokenized input
bypass = [bypass]
train_tokens = list(map(lambda t: ['[CLS]'] + sp.EncodeAsPieces(t) + ['[SEP]'], bypass))
print(train_tokens[0])
train_tokens_ids = pad_sequences(list(map(vocab.to_indices, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
print(train_tokens_ids[0][:20])

# Apply Preprocessing Functions

In [None]:
def preprocess_content(idx) : 
    close = df['close'][idx]
    title = df['Title'][idx]
    content = df['Content'][idx]
    
    bypass_title, price_in_title = preprocess_pipeline(title, close)
    bypass_content, price_in_content = preprocess_pipeline(content, close)
    
    df['Title'].iloc[idx] = bypass_title
    df['Content'].iloc[idx] = bypass_content
    
    if df['PriceInTxt'].iloc[idx] == -1 : 
        df['PriceInTxt'].iloc[idx] = price_in_title
    if df['PriceInTxt'].iloc[idx] == -1 : 
        df['PriceInTxt'].iloc[idx] = price_in_content
    pbar.update(1)
    return

df['PriceInTxt'] = -1
preprocess_content(0)
display(df[0:1])

In [None]:
# convert timestamp of price dataframe as datetime object
pool = multiprocessing.pool.ThreadPool(num_of_cores)
args = list(range(df.shape[0]))
df['PriceInTxt'] = -1

with tqdm.tqdm(total=len(args)) as pbar:
    for i in range(len(args)):
        pool.apply_async(preprocess_content, [args[i]])
    pool.close()
    pool.join()       
    
df.to_pickle('data/example/preprocessed_005930.pkl')

# Handle Numeric Values

In [None]:
df

In [None]:
columns = df.select_dtypes('float64').columns
for column in columns : 
    df[column] = df[column].astype('Int64')

df.to_pickle('data/example/preprocessed_005930.pkl')
df.info()

# Prepare Dataset

In [None]:
df = pd.read_pickle('data/example/preprocessed_005930.pkl')
df

# Model

In [None]:
class BertRegressor(nn.Module) : 
    def __init__(self) : 
        super(BertRegressor, self).__init__()
        self.bert, self.vocab = get_pytorch_kobert_model()
        self.relu = nn.ReLU()
        self.linear = nn.Linear(768, 1)
    
    def forward(self, tokens) : 
        _, pooled_output = self.bert(tokens, utput_all=False)
        linear_output = self.relu(pooled_output)
        predicted_price = self.linear(linear_output)

        return predicted_price

In [None]:
bert_reg = BertRegressor()
bert_reg.to(device)
optimizer = torch.optim.Adam(bert_reg.parameters(), lr=3e-6)
bert_reg.train()

# Training