In [1]:
import os

import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [2]:
# Show the kernel's working directory; the relative 'data/...' paths used in this
# notebook resolve against it. Uses os.getcwd() rather than the bare `pwd` automagic,
# which stops working as soon as a variable named `pwd` exists.
os.getcwd()

'C:\\Users\\jeong\\proj\\nlp_paper'

In [3]:
import glob

In [4]:
# List the CSV files available in the local data/ directory.
glob.glob('data/*.csv')

['data\\CBOE Volatility Index Historical Data.csv',
 'data\\href.csv',
 'data\\lkdf.csv',
 'data\\newslink.csv',
 'data\\news_vix.csv',
 'data\\news_vix1.csv',
 'data\\news_vix2.csv']

In [5]:
# Load the news dataset; the path is relative to the kernel's working directory.
df = pd.read_csv('data/news_vix2.csv')

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(241, 3)

In [7]:
# Peek at the first rows to check the columns that were read.
df.head()

Unnamed: 0,date,title,news
0,"Jun 23, 2019",Energy & Precious Metals - Weekly Review and C...,© Reuters.\n\nBy Barani Krishnan\n\nInvesting....
1,"Jun 23, 2019",3 Things Under the Radar This Week,© Reuters.\n\nInvesting.com - Here’s a look at...
2,"Jun 23, 2019",Economic Calendar - Top 5 Things to Watch This...,© Reuters.\n\nInvesting.com - Market watchers ...
3,"Jun 23, 2019",U.S. stocks lower at close of trade; Dow Jones...,© Reuters. U.S. stocks lower at close of trade...
4,"June 21, 2019 10:00am","Boeing Company (The) (NYSE:BA), Caterpillar, I...",It’s been a quick comeback for the S&P 500 Ind...


In [12]:
# Parse the free-form 'date' strings into pandas timestamps, stored in a new 'dat' column.
# NOTE(review): the column mixes layouts (e.g. "Jun 23, 2019" and "June 21, 2019 10:00am");
# presumably newer pandas releases need an explicit mixed-format setting here — confirm.
df['dat'] = pd.to_datetime(df['date'])

In [13]:
# Collapse each timestamp to a compact YYYYMMDD string (the time of day is dropped).
df['ymd'] = df['dat'].dt.strftime('%Y%m%d')

In [14]:
# Check the two derived columns ('dat', 'ymd') next to the raw 'date' strings.
df.head()

Unnamed: 0,date,title,news,dat,ymd
0,"Jun 23, 2019",Energy & Precious Metals - Weekly Review and C...,© Reuters.\n\nBy Barani Krishnan\n\nInvesting....,2019-06-23 00:00:00,20190623
1,"Jun 23, 2019",3 Things Under the Radar This Week,© Reuters.\n\nInvesting.com - Here’s a look at...,2019-06-23 00:00:00,20190623
2,"Jun 23, 2019",Economic Calendar - Top 5 Things to Watch This...,© Reuters.\n\nInvesting.com - Market watchers ...,2019-06-23 00:00:00,20190623
3,"Jun 23, 2019",U.S. stocks lower at close of trade; Dow Jones...,© Reuters. U.S. stocks lower at close of trade...,2019-06-23 00:00:00,20190623
4,"June 21, 2019 10:00am","Boeing Company (The) (NYSE:BA), Caterpillar, I...",It’s been a quick comeback for the S&P 500 Ind...,2019-06-21 10:00:00,20190621


In [15]:
# Count articles per day; dropna=False adds a NaN bucket if any date is missing.
df['ymd'].value_counts(dropna=False)

20190623    240
20190621      1
Name: ymd, dtype: int64

각 기사별로 doc2vec을 해보자 

In [16]:
# Keep only the columns needed for the document-embedding step.
df1 = df[['ymd', 'title', 'news']]

In [17]:
# Preview the trimmed frame.
df1.head()

Unnamed: 0,ymd,title,news
0,20190623,Energy & Precious Metals - Weekly Review and C...,© Reuters.\n\nBy Barani Krishnan\n\nInvesting....
1,20190623,3 Things Under the Radar This Week,© Reuters.\n\nInvesting.com - Here’s a look at...
2,20190623,Economic Calendar - Top 5 Things to Watch This...,© Reuters.\n\nInvesting.com - Market watchers ...
3,20190623,U.S. stocks lower at close of trade; Dow Jones...,© Reuters. U.S. stocks lower at close of trade...
4,20190621,"Boeing Company (The) (NYSE:BA), Caterpillar, I...",It’s been a quick comeback for the S&P 500 Ind...


In [38]:
# Tokenization and stopword removal: set up NLTK's stopword corpus.
from nltk.corpus import stopwords
import nltk
# Fetch the 'stopwords' corpus so that stopwords.words() can be used in later cells.
nltk.download('stopwords')

In [55]:
# English stopwords plus two source names ('reuters', 'investing.com') to filter out as well.
stopwz = set(stopwords.words('english')) | {'reuters', 'investing.com'}

In [56]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # matches runs of word characters (letters, digits, underscore), so punctuation is dropped
# Can be used as an alternative to word_tokenize.

In [72]:
# Lower-case every article body and split it into tokens with NLTK's word_tokenize.
data = df1['news'].tolist()
token_news = [word_tokenize(article.lower()) for article in data]
# Alternative that uses the regex tokenizer instead:
#token_news = [tokenizer.tokenize(d.lower()) for d in data]

In [73]:
# Drop stopwords from every tokenized article, preserving article order.
# Note: `d` and `_d` are ordinary loop variables, so they stay bound at notebook
# scope (to the last article's tokens) after the loop finishes.
token_news_filtered = []
for d in token_news:
    # Keep only the tokens that are not in the stopword set.
    _d = [w for w in d if w not in stopwz]
    token_news_filtered.append(_d)

In [79]:
# Build the tagged corpus: one TaggedDocument per article, tagged with its
# position in token_news_filtered (as a string). The words come from the
# comprehension's own loop variable so each document carries its own filtered tokens.
tagged_news = [
    TaggedDocument(words=tokens, tags=[str(i)])
    for i, tokens in enumerate(token_news_filtered)
]

In [80]:
# Doc2Vec hyper-parameters
max_epochs = 100   # iteration budget consumed by the training cell
vec_size = 100     # dimensionality of each document vector
alpha = 0.25       # initial learning rate
# NOTE(review): 0.25 is unusually high for an initial learning rate — confirm it is
# not a typo for 0.025.

model = Doc2Vec(
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.00025,  # floor of the learning-rate schedule
    min_count=1,        # keep every word, even those seen only once
    dm=1,               # distributed-memory (PV-DM) training mode
)

In [81]:
# Build the model's vocabulary from the tagged corpus; must run before training.
model.build_vocab(tagged_news)

In [82]:
%%time
for epoch in range(max_epochs):
    if epoch % 10 ==0:
        print('iteration {0}'.format(epoch))
    model.train(tagged_news,
               total_examples = model.corpus_count,
               epochs = model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

iteration 0
iteration 10
iteration 20
iteration 30
iteration 40
iteration 50
iteration 60
iteration 70
iteration 80
iteration 90
Wall time: 37.2 s
