In [1]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [2]:
pwd

'C:\\Users\\jeong\\proj\\nlp_paper'

In [3]:
import glob

In [4]:
glob.glob('data/*.csv')

['data\\CBOE Volatility Index Historical Data.csv',
 'data\\href.csv',
 'data\\lkdf.csv',
 'data\\newslink.csv',
 'data\\news_vix.csv',
 'data\\news_vix1.csv',
 'data\\news_vix2.csv']

In [5]:
df = pd.read_csv('data/news_vix2.csv')

In [6]:
df.shape

(241, 3)

In [7]:
df.head()

Unnamed: 0,date,title,news
0,"Jun 23, 2019",Energy & Precious Metals - Weekly Review and C...,© Reuters.\n\nBy Barani Krishnan\n\nInvesting....
1,"Jun 23, 2019",3 Things Under the Radar This Week,© Reuters.\n\nInvesting.com - Here’s a look at...
2,"Jun 23, 2019",Economic Calendar - Top 5 Things to Watch This...,© Reuters.\n\nInvesting.com - Market watchers ...
3,"Jun 23, 2019",U.S. stocks lower at close of trade; Dow Jones...,© Reuters. U.S. stocks lower at close of trade...
4,"June 21, 2019 10:00am","Boeing Company (The) (NYSE:BA), Caterpillar, I...",It’s been a quick comeback for the S&P 500 Ind...


In [12]:
df['dat'] = pd.to_datetime(df.date)

In [13]:
df['ymd'] = df.dat.dt.strftime('%Y%m%d')

In [14]:
df.head()

Unnamed: 0,date,title,news,dat,ymd
0,"Jun 23, 2019",Energy & Precious Metals - Weekly Review and C...,© Reuters.\n\nBy Barani Krishnan\n\nInvesting....,2019-06-23 00:00:00,20190623
1,"Jun 23, 2019",3 Things Under the Radar This Week,© Reuters.\n\nInvesting.com - Here’s a look at...,2019-06-23 00:00:00,20190623
2,"Jun 23, 2019",Economic Calendar - Top 5 Things to Watch This...,© Reuters.\n\nInvesting.com - Market watchers ...,2019-06-23 00:00:00,20190623
3,"Jun 23, 2019",U.S. stocks lower at close of trade; Dow Jones...,© Reuters. U.S. stocks lower at close of trade...,2019-06-23 00:00:00,20190623
4,"June 21, 2019 10:00am","Boeing Company (The) (NYSE:BA), Caterpillar, I...",It’s been a quick comeback for the S&P 500 Ind...,2019-06-21 10:00:00,20190621


In [15]:
df.ymd.value_counts(dropna=False)

20190623    240
20190621      1
Name: ymd, dtype: int64

각 기사별로 doc2vec을 해보자 

In [16]:
df1 = df.loc[:,['ymd','title','news']]

In [17]:
df1.head()

Unnamed: 0,ymd,title,news
0,20190623,Energy & Precious Metals - Weekly Review and C...,© Reuters.\n\nBy Barani Krishnan\n\nInvesting....
1,20190623,3 Things Under the Radar This Week,© Reuters.\n\nInvesting.com - Here’s a look at...
2,20190623,Economic Calendar - Top 5 Things to Watch This...,© Reuters.\n\nInvesting.com - Market watchers ...
3,20190623,U.S. stocks lower at close of trade; Dow Jones...,© Reuters. U.S. stocks lower at close of trade...
4,20190621,"Boeing Company (The) (NYSE:BA), Caterpillar, I...",It’s been a quick comeback for the S&P 500 Ind...


In [1]:
# tokenize 및 stopwords 제거 
from nltk.corpus import stopwords
import nltk
import string
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kulib015\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
stopwz = set(stopwords.words('english')+['reuters','investing.com']+[p for p in string.punctuation]) # reuters,investing.com, 구두점 추가 

In [56]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # alphanumeric characters (영문자와 숫자만 가져옴)
# word_tokenize 대신 쓸 수 있음 

In [72]:
data = df1.news.tolist()
token_news = [word_tokenize(d.lower()) for d in data]
#token_news = [tokenizer.tokenize(d.lower()) for d in data]

In [73]:
token_news_filtered = []
for d in token_news:
    _d = [w for w in d if w not in stopwz]
    token_news_filtered.append(_d)

In [79]:
# tagged data 만들기 
tagged_news = [TaggedDocument(words = d, tags = [str(i)]) for i, _d in enumerate(token_news_filtered)]

In [80]:
# modeling
max_epochs = 100
vec_size = 100
alpha = 0.25

model = Doc2Vec(vector_size = vec_size,
               alpha = alpha,
               min_alpha = 0.00025,
               min_count = 1,
               dm = 1) 

In [81]:
model.build_vocab(tagged_news)

In [82]:
%%time
for epoch in range(max_epochs):
    if epoch % 10 ==0:
        print('iteration {0}'.format(epoch))
    model.train(tagged_news,
               total_examples = model.corpus_count,
               epochs = model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

iteration 0
iteration 10
iteration 20
iteration 30
iteration 40
iteration 50
iteration 60
iteration 70
iteration 80
iteration 90
Wall time: 37.2 s


In [83]:
model.save('data/d2v.model')
print('Model Saved')

# dm은 training 알고리즘을 결정함 
# dm=1: distributed memory(PV-DM)
# dm=0: distributed bag of words(PV_DBOW)
# distributed memory: 문서안의 단어순서를 보존한다 
# distributed bag of words: bag of approach를 이용한다(단어순서 보존안함) 

Model Saved


In [99]:
# vector 생성 
vectors = [model.infer_vector(x) for x in token_news_filtered]

In [102]:
dvec = pd.DataFrame(vectors, columns=['dv'+str(i) for i in range(100)])

In [101]:
df1['token_news'] = token_news_filtered
df1['tagged_news'] = tagged_news

In [105]:
df2 = pd.concat([df1,dvec],axis=1)

In [106]:
df2.head()

Unnamed: 0,ymd,title,news,tagged_news,token_news,dv0,dv1,dv2,dv3,dv4,...,dv90,dv91,dv92,dv93,dv94,dv95,dv96,dv97,dv98,dv99
0,20190623,Energy & Precious Metals - Weekly Review and C...,© Reuters.\n\nBy Barani Krishnan\n\nInvesting....,"([investing.com, -, u.s., stocks, traded, larg...","[©, ., barani, krishnan, -, one, week, ago, ,,...",3.681059,-5.299938,-14.765718,-3.964301,-4.60542,...,3.318588,9.164207,-9.478999,-1.168177,0.837244,-2.437616,-7.246206,-2.925656,0.212761,-7.314521
1,20190623,3 Things Under the Radar This Week,© Reuters.\n\nInvesting.com - Here’s a look at...,"([investing.com, -, u.s., stocks, traded, larg...","[©, ., -, ’, look, three, things, radar, past,...",0.003017,-0.00231,0.002924,0.004959,-0.002456,...,0.00027,0.00415,-0.002946,-0.001289,0.000254,0.002503,-0.003372,0.004024,0.001015,0.004009
2,20190623,Economic Calendar - Top 5 Things to Watch This...,© Reuters.\n\nInvesting.com - Market watchers ...,"([investing.com, -, u.s., stocks, traded, larg...","[©, ., -, market, watchers, looking, ahead, me...",-0.004053,0.003615,-0.003987,0.000428,-0.002788,...,0.000225,-0.004779,-0.001789,0.003122,-0.000413,0.001716,0.004466,0.001656,-0.004091,0.001994
3,20190623,U.S. stocks lower at close of trade; Dow Jones...,© Reuters. U.S. stocks lower at close of trade...,"([investing.com, -, u.s., stocks, traded, larg...","[©, ., u.s., stocks, lower, close, trade, ;, d...",0.004065,-0.058798,0.017575,-0.056746,-0.028863,...,0.023865,0.044793,-0.076246,0.063003,-0.007543,0.019365,-0.012899,0.028552,-0.04466,0.076078
4,20190621,"Boeing Company (The) (NYSE:BA), Caterpillar, I...",It’s been a quick comeback for the S&P 500 Ind...,"([investing.com, -, u.s., stocks, traded, larg...","[’, quick, comeback, &, p, 500, index, (, spx,...",-0.00327,0.001146,-0.003354,-0.004367,0.003661,...,-0.004649,-0.001999,-0.003349,-0.004594,0.004348,-0.001614,-0.003387,-0.000753,-0.002382,0.003448


In [84]:
# 모델을 이용할 차례 
model = Doc2Vec.load('data/d2v.model')
test_data = word_tokenize('I love chatbots'.lower())
v1 = model.infer_vector(test_data)
print('V1_infer',v1)

V1_infer [-1.78228260e-03 -3.45725333e-03 -3.76688666e-04  5.96411119e-04
  2.83469446e-03 -3.83976474e-03  1.49447843e-03 -1.87169004e-03
  4.96226409e-03  1.11447810e-03 -4.46231943e-03 -1.24006602e-03
  1.71593647e-03  4.98096133e-03  4.56714397e-03 -4.16751439e-03
  3.52493883e-03 -2.56393128e-03 -1.97161012e-03 -4.11430327e-03
  1.98350335e-03  4.68551088e-03  3.30670690e-03 -2.18727742e-03
 -2.70389416e-03 -2.91808555e-03  2.54630053e-04 -1.10330584e-04
 -1.77926151e-03 -3.74003430e-03  4.41862177e-03 -2.37182854e-03
 -1.31218252e-03  4.43235226e-03  4.43795789e-03  1.32272742e-03
 -1.75345095e-03  9.85375489e-04 -1.33362133e-03 -2.83664383e-04
  1.15636957e-03  1.38969335e-03  9.68040258e-04 -2.29854006e-04
 -3.85182025e-03  1.79697608e-03  1.27739005e-03 -1.89364527e-03
 -3.51843541e-03 -1.60779187e-03 -1.79216370e-03 -1.81992154e-03
 -1.70126616e-04  3.59108672e-03  3.62395006e-03 -3.96952173e-03
  4.86313691e-03  2.27208482e-03  3.55108059e-03 -3.95667925e-03
 -1.80476950e-03