In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import requests
from datetime import datetime as dt, timedelta, date, time
import datetime
from tqdm import tqdm
from bs4 import BeautifulSoup
import joblib
import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from spacy.lang.pt import PortugueseDefaults
from spacy.symbols import IS_PUNCT
from spacy.tokenizer import Tokenizer
from nltk.util import ngrams
import multiprocessing

In [2]:
def get_stock_data():
    pass

def update_stock_data():
    pass

In [3]:
def load_stock_data(ticker='VALE3'):
    '''Load csv file to a pandas dataframe.
    
    Args:
        symbol: company's symbol
        
    Returns:
        A pandas dataframe containing the stock's data.
    '''
    
    df = pd.read_csv(f'data/{ticker}.csv')
    df.columns = ['date', 'close', 'open', 'high', 'low', 'volume', 'pct_change']
    
    return df

In [4]:
def clean_stock_data(df):
    '''Clean dataframe: change variables to appropriate dtypes.
    
    Args:
        df: pandas dataframe
        
    Returns:
        A cleaned dataframe ready to be modeled.
    '''
    
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
    
    cols_num = ['close', 'open', 'high', 'low']
    df[cols_num] = df[cols_num].apply(lambda col: col.str.replace(',', '.').astype(float), axis=1)
    
    df['volume'] = df['volume'].str.replace(',', '').str.replace('M', '000000')
    df['volume'] = df['volume'].str.replace(',', '').str.replace('K', '000')
    df['volume'] = df['volume'].astype(int)
    
    df['pct_change'] = df['pct_change'].str.replace(',', '.').str.rstrip('%')
    df['pct_change'] = df['pct_change'].astype(float) / 100
    
    return df

In [5]:
def scraper(url):
    '''Scrape news data on stocks from a given url.
    
    Args:
        url: a string containing the news' url
        
    Returns:
        datetime: a datetime object containing date and time when news was released to the public
        text: the contents of the news
    '''
    
    page = requests.get(url, headers={'User-Agent': 'Chrome/79.0.3945.130'})
    soup = BeautifulSoup(page.content, 'html.parser')
    
    content_section = soup.find('div', {'class': 'contentSectionDetails'})
    datetime = dt.strptime(re.search(r'(\d{2}.\d{2}.\d{4} \d{2}:\d{2})',
                                     content_section.find('span').text).group(),
                           '%d.%m.%Y %H:%M')
    
    text_section = soup.find('div', {'class': 'WYSIWYG articlePage'}).find_all('p')
    text = ' '.join([text_section[i].text for i in range(1, len(text_section))])
    
    return datetime, text

In [6]:
# def get_news_data():
#     '''Get news on stocks from Investing.com website.
    
#     Returns:
#         A pandas dataframe containing datetime, title and contents from the news.
#     '''
    
#     url_investing = 'https://br.investing.com'
#     title_ls = []
#     dt_ls = []
#     text_ls = []
#     for page in tqdm(range(1, 321)):
#         url = f'/equities/vale-on-n1-news/{page}'
#         page = requests.get(url_investing + url,
#                             headers={'User-Agent': 'Chrome/79.0.3945.130'})
#         soup = BeautifulSoup(page.content, 'html.parser')
        
#         anchors_ls = [art.find('a') for art in soup.find_all('article')]
#         for a in anchors_ls:
#             if a['href'].startswith('/news'):
#                 title_ls.append(a.find('img').get('alt'))
#                 url_contents = scraper(url_investing + a['href'])
#                 dt_ls.append(url_contents[0])
#                 text_ls.append(url_contents[1])
        
#     news_data = {'datetime': dt_ls, 'title': title_ls, 'text': text_ls}
#     news_df = pd.DataFrame(news_data)
        
#     return news_df

In [7]:
def get_news_data(page):
    '''Get news on stocks from the page in Investing.com website.
    
    Args:
        page: the page from which the news will be extracted
    
    Returns:
        A pandas dataframe for the page containing datetime, title and contents from the news.
    '''
    
    url_investing = 'https://br.investing.com'
    title_ls = []
    dt_ls = []
    text_ls = []
    url = f'/equities/vale-on-n1-news/{page}'
    result = requests.get(url_investing + url,
                          headers={'User-Agent': 'Chrome/79.0.3945.130'})
    soup = BeautifulSoup(result.content, 'html.parser')

    anchors_ls = [art.find('a') for art in soup.find_all('article')]
    for a in anchors_ls:
        if a['href'].startswith('/news'):
            title_ls.append(a.find('img').get('alt'))
            url_contents = scraper(url_investing + a['href'])
            dt_ls.append(url_contents[0])
            text_ls.append(url_contents[1])

    news_data = {'datetime': dt_ls, 'title': title_ls, 'text': text_ls}
    news_df = pd.DataFrame(news_data)
    
    joblib.dump(news_df, f'data/news/news_df{page}.pkl')
    
    return news_df

In [8]:
def output_news_df():
    '''Parallelizes the extraction of news data using the get_news_data function.
    
    Returns:
        A pandas dataframe containing all the news from 2016 until today.
    '''
    
    pool = multiprocessing.Pool()
    df_ls = pool.map(get_news_data, range(1, 322))
    pool.terminate()
    pool.join()
    
    news_df = (pd.concat(df_ls, axis=0)
               .drop_duplicates()
               .sort_values('datetime', ascending=False)
               .reset_index(drop=True))
    
    joblib.dump(news_df, 'data/news_df.pkl')
    
    return news_df

In [9]:
# news_df = output_news_df()

In [10]:
news_df = joblib.load('data/news_df.pkl')

In [11]:
def clean_news_df(df):
    '''Clean and aggregate news by date. Given a date, only news realeased before market closure are aggregated.
    
    Args:
        df: pandas dataframe containing all the news ordered by datetime
        
    Returns:
        A pandas dataframe with all the news aggregated by date.
    '''
    
    df = (df[(df['datetime'] <= dt(2020, 1, 31, 18)) & (df['datetime'] >= dt(2016, 1, 4))]
          .reset_index(drop=True))
    
    start = date(2016, 1, 3)
    end = date(2020, 1, 31)
    date_span = [end - timedelta(days=d) for d in range(0, (end - start).days)]

    date_ls = []
    news_ls = []
    for d in date_span:
        date_ls.append(d)

        window = ((df['datetime'] < dt.combine(d, time(18))) &\
                  (df['datetime'] >= dt.combine(d - timedelta(days=1), time(18))))
        news_ls.append(list(df.loc[window, 'text'].values))

    agg_news = {'date': date_ls, 'news': news_ls}
    agg_news_df = pd.DataFrame(agg_news)
    agg_news_df['date'] = pd.to_datetime(agg_news_df['date'], infer_datetime_format=True)
    
    return agg_news_df

In [12]:
vale3_df = clean_stock_data(load_stock_data())
agg_news_df = clean_news_df(news_df)
full_df = vale3_df.merge(agg_news_df, on='date')

In [13]:
full_df.head()

Unnamed: 0,date,close,open,high,low,volume,pct_change,news
0,2020-01-31,50.27,50.96,51.06,49.8,2999000000,-0.0239,[SÃO PAULO (Reuters) - A Agência Nacional de T...
1,2020-01-30,51.5,49.97,51.5,49.8,2308000000,0.0148,[RIO DE JANEIRO (Reuters) - A Petrobras (SA:PE...
2,2020-01-29,50.75,51.7,51.89,50.46,2332000000,-0.0088,"[Entretanto, a empresa pontuou que não houve a..."
3,2020-01-28,51.2,51.2,52.09,51.08,2305000000,0.0137,[Os empregados da Vale na China estão em regim...
4,2020-01-27,50.51,51.66,51.82,50.48,3878000000,-0.0612,[Investing.com - Na parte final da tarde de on...


In [37]:
nlp = spacy.load('pt_core_news_sm')

with open('data/stopwords-pt.txt') as f:
    stopwords_pt = [line.rstrip() for line in f]
    
for w in stopwords_pt:
    nlp.vocab[w].is_stop = True
    
# tokens_clean_lemma = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
# def append_ngrams(tokens):
#     bigrams = [' '.join(bigram) for bigram in list(ngrams(tokens, 2))]
#     trigrams = [' '.join(trigram) for trigram in list(ngrams(tokens, 3))]
#     return tokens + bigrams + trigrams

In [38]:
def news_sent_tokenizer(news):
    doc = nlp(news)
    sentences = [sent for sent in doc.sents]
    news_sent_tokens = []
    for sent in sentences:
        sent_tokens = [token.lemma_ for token in sent if not (token.is_stop or token.is_punct)]
        news_sent_tokens.append(sent_tokens)
        
    return news_sent_tokens

In [44]:
print(full_df.loc[0, 'news'][5])

Segundo dados preliminares de fechamento, o Ibovespa teve oscilação negativa de 0,04%, a 115.341,52​ pontos. O giro financeiro da sessão somou 22,6 bilhões de reais. Nas pontas, Gerdau (SA:GGBR4) ​ subiu 2,8%, enquanto Braskem (SA:BRKM5) perdeu 3%. Entre as ações mais importantes do índice, Itaú Unibanco avançou 1,15%, enquanto Bradesco teve ganho de 1,06%. ​ Banco do Brasil (SA:BBAS3) teve valorização de 0,69% e Santander Brasil evoluiu 0,65%. Vale (SA:VALE3) fechou em alta de 0,95% e Petrobras PN teve  teve ganho de 0,35%, enquanto Petrobras (SA:PETR4) ON subiu 1,55%.  Com o desempenho da sessão, o Ibovespa acumula baixa de 1,7% no ano. O índice está 9,2% acima da média dos últimos 200 dias de negócios. Nas últimas 52 semanas, o Ibovespa acumula 17,2% de ganho. ​​ Veja o fechamento de outros índices da B3 nesta quinta-feira: - IBrX 100:-0,11%, 48.735,45​ pontos. - IBrX 50:0,02%, 18.796,78 pontos. - IBrA:-0,18%, 4.587,15 pontos. - Índice Small Cap (SMLL):-1,18%, 2.904,09 pontos. - Índ

In [42]:
news = full_df.loc[0, 'news'][5]
news_sent_tokenizer(news)

[['dar',
  'preliminar',
  'fechamento',
  'Ibovespa',
  'oscilação',
  'negativo',
  '0,04',
  '  ',
  '115.341,52\u200b'],
 ['girar', 'financeiro', 'sessão', 'somar', '22,6', 'bilião', 'real'],
 ['pontar',
  'Gerdau',
  'SA',
  'GGBR4',
  '\u200b',
  'subir',
  '2,8',
  'Braskem',
  'SA',
  'BRKM5',
  'perder',
  '3'],
 ['ações',
  'importante',
  'índice',
  'Itaú',
  'Unibanco',
  'avançar',
  '1,15',
  '  ',
  'Bradesco',
  '  ',
  'ganhar',
  '1,06',
  '  '],
 ['\u200b',
  'Banco',
  'Brasil',
  'SA',
  'BBAS3',
  '  ',
  'valorização',
  '0,69',
  '  ',
  'Santander',
  'Brasil',
  'evoluir',
  '0,65'],
 ['Vale',
  'SA',
  'VALE3',
  'fechar',
  '  ',
  'alto',
  '0,95',
  '  ',
  'Petrobras',
  'PN',
  '  ',
  'ganhar',
  '0,35',
  '  ',
  'Petrobras',
  'SA',
  'PETR4'],
 ['ON', '  ', 'subir', '1,55', '\xa0 '],
 ['desempenhar',
  '  ',
  'sessão',
  '  ',
  'Ibovespa',
  'acumular',
  '  ',
  'baixo',
  '1,7',
  '  '],
 ['índice',
  '  ',
  '9,2',
  'acima',
  '  ',
  'médio',

In [61]:
PortugueseDefaults.lex_attr_getters[IS_PUNCT]

<function spacy.lang.lex_attrs.is_punct(text)>