In [7]:
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse
from urllib.request import Request, urlopen
import urllib
import re
import requests
import csv
import pandas as pd
from datetime import datetime, date
import yfinance as yf

## Website with directions
https://blog.jovian.ai/web-scraping-yahoo-finance-using-python-7c4612fab70c

### Main Yahoo Finance Webscraping

In [24]:
def get_page(url, timeout=30):
    """Download a webpage and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The response text parsed with the ``html.parser`` backend. A non-OK
        response is still parsed; only its status code is printed.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Best effort on purpose: report the failure but keep going so one
        # bad page does not abort a multi-article scrape.
        print('Status code:', response.status_code)
    return bs(response.text, 'html.parser')

In [25]:
def get_news_tags(doc):
    """Return every ``div`` in ``doc`` that carries the news-entry class."""

    # Class string of the div tags that hold the news information.
    return doc.find_all('div', {'class': "Ov(h) Pend(44px) Pstart(25px)"})

In [26]:
def parse_news(news_tag):
    """Extract source, headline and absolute URL from one news tag.

    Args:
        news_tag: Tag-like object for one news entry. Its first ``div`` is
            read for the source name and its first ``a`` for the headline
            text and the ``href``.

    Returns:
        dict with the keys ``'source'``, ``'headline'`` and ``'url'``.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    base_url = 'https://ca.finance.yahoo.com'
    link = news_tag.find('a')  # looked up once; carries both headline and href
    return {
        'source': news_tag.find('div').text,
        'headline': link.text,
        # urljoin resolves site-relative hrefs against base_url and leaves
        # already-absolute hrefs untouched; plain concatenation corrupted
        # absolute links and hrefs without a leading slash.
        'url': urljoin(base_url, link['href']),
    }

In [27]:
def scrape_yahoo_news(ticker):
    """Scrape the Yahoo Finance quote page of ``ticker`` into a DataFrame.

    The page is fetched with ``get_page``, its news tags are collected with
    ``get_news_tags`` and each tag is converted by ``parse_news``; the
    resulting dictionaries become the rows of the returned DataFrame.
    """

    quote_page = get_page('https://ca.finance.yahoo.com/quote/' + ticker)

    records = []
    for tag in get_news_tags(quote_page):
        records.append(parse_news(tag))

    return pd.DataFrame(records)

### Individual Article Page Scraping Functions

In [18]:
def parse_paragraphs(doc):
    """Return every ``div`` with class ``caas-body`` found in ``doc``."""
    body_divs = doc.find_all('div', {'class': "caas-body"})
    return body_divs

In [29]:
def parse_news_article(news_tag):
    """Join the paragraph text of an article body into one string.

    Args:
        news_tag: Sequence of body tags (the list ``parse_paragraphs``
            returns); only the first element is read.

    Returns:
        The text of every ``p`` inside the first tag, each preceded by a
        single space, with double-quote characters removed. An empty string
        when ``news_tag`` is empty, so a page without a recognisable body no
        longer aborts the whole scrape with an IndexError.
    """
    print('-----------------')  # separator line printed once per call
    if not news_tag:
        return ''

    paragraphs = news_tag[0].find_all('p')
    body_string = ''.join(' ' + paragraph.text for paragraph in paragraphs)

    return body_string.replace("\"", '')

In [20]:
def get_article_date(url):
    """Fetch an article page and return its publication timestamp string.

    Args:
        url: Address of the article page (downloaded with ``get_page``).

    Returns:
        The ``datetime`` attribute of the first ``time`` tag with class
        ``caas-attr-meta-time``, or ``None`` when the page has no such tag
        (previously an IndexError).
    """
    doc = get_page(url)
    # Named ``stamps``/no local ``date`` so the imported ``date`` class is not shadowed.
    stamps = doc.find_all('time', {'class': 'caas-attr-meta-time'})
    if not stamps:
        return None

    return stamps[0].get('datetime')

In [21]:
def get_article_body(url):
    """Download the article at ``url`` and return its body text.

    Chains ``get_page`` -> ``parse_paragraphs`` -> ``parse_news_article``.
    """
    body_tags = parse_paragraphs(get_page(url))
    return parse_news_article(body_tags)

In [22]:
def dataframe_prep(news_df, max_articles=2):
    """Add article body, publication date and blank sentiment columns.

    Args:
        news_df: DataFrame with a ``'url'`` column. It is modified in place.
        max_articles: Number of leading rows whose article is downloaded;
            ``None`` downloads every row. Defaults to 2, the limit that used
            to be hard-coded. Rows beyond the limit keep missing values in
            ``body`` and ``date``.

    Returns:
        The same ``news_df`` object with the added columns ``body``, ``date``
        (``datetime.date`` values), ``prob_posi``/``prob_nega``/``prob_neut``
        (all 0.0) and ``sentiment`` (all 0).
    """
    # .iloc makes the "first N rows" selection explicitly positional.
    urls = news_df['url'].iloc[:max_articles]
    news_df['body'] = urls.apply(get_article_body)
    news_df['date'] = urls.apply(get_article_date)
    news_df['date'] = pd.to_datetime(news_df['date'], format='%Y-%m-%dT%H:%M:%S.%fZ').dt.date
    # Placeholders to be overwritten later by the sentiment step.
    news_df[['prob_posi', 'prob_nega', 'prob_neut']] = 0.0
    news_df['sentiment'] = 0

    return news_df

### Scrape Yahoo Finance, Individual Pages, Create Sentiment Columns

In [40]:
def stock_price(ticker, start, end):
    """Download prices for ``ticker`` and label each row up (1) or not (0).

    ``target`` is 1 where ``Close`` is above ``Open`` and 0 where it is below
    or equal. The index of the downloaded frame is then moved into a column
    and the ``Date`` column is converted to plain ``datetime.date`` values.
    """
    prices = yf.download(ticker, start=start, end=end, progress=False)

    opened, closed = prices['Open'], prices['Close']
    # Falling and flat rows share the label 0, so a single mask covers both.
    prices.loc[opened >= closed, 'target'] = 0
    prices.loc[opened < closed, 'target'] = 1
    prices['target'] = prices['target'].astype('int')

    prices = prices.reset_index()
    prices['Date'] = pd.to_datetime(prices['Date'], format='%Y-%m-%d').dt.date

    return prices

In [41]:
def dataframe_price_sentiment(ticker, start, end):
    """Join the price rows of ``ticker`` with its scraped news on the date.

    Prices come from ``stock_price`` (column ``Date``), news from
    ``scrape_yahoo_news`` passed through ``dataframe_prep`` (column ``date``);
    the two frames are merged with pandas' default join on those columns.
    """
    prices = stock_price(ticker, start, end)
    news = dataframe_prep(scrape_yahoo_news(ticker))

    return prices.merge(news, left_on='Date', right_on='date')

In [42]:
# Run the whole pipeline for one ticker: prices between `start` and `end`
# merged with the scraped Yahoo news on the calendar date.
# Note: dataframe_prep only downloads the first two articles, so at most two
# news rows carry a date that can match a price row.
ticker = 'FM.TO'
start = '2021-01-01'
end = '2022-11-25'

df = dataframe_price_sentiment(
    ticker,
    start,
    end
)

# Last expression of the cell: displays the merged frame.
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target,source,headline,url,body,date,prob_posi,prob_nega,prob_neut,sentiment
0,2022-11-11,33.0,34.849998,32.529999,33.560001,33.560001,4618400,1,Bloomberg,China Has Links to Dozens of Canadian Miners T...,https://ca.finance.yahoo.com/news/china-links-...,(Bloomberg) -- China has built up stakes in m...,2022-11-11,0.0,0.0,0.0,0
1,2022-11-22,30.790001,31.440001,30.059999,31.17,31.17,1707400,1,CNW Group,VOX ACQUIRES AUSTRALIAN GOLD ROYALTY OVER CARD...,https://ca.finance.yahoo.com/news/vox-acquires...,"TORONTO, Nov. 22, 2022 /CNW/ - Vox Royalty Co...",2022-11-22,0.0,0.0,0.0,0


## NLP Work

### Sentiment Analysis

In [43]:
import finbert as fb

In [44]:
def sentiment_applier(df):
    """Run the ``finbert`` model on ``df`` and return ``(mean, sentiment)``.

    NOTE(review): despite the parameter name, ``sentiment_poster`` calls this
    once per value of the ``body`` column, i.e. with a single article text.
    """
    probabilities = fb.sentiment_analysis(df, bert_model='finbert')
    # The third value returned by get_mean_from_proba is not used here.
    mean, sentiment, _ = fb.get_mean_from_proba(probabilities)

    return mean, sentiment

In [45]:
def sentiment_poster(df):
    """Fill the sentiment columns of ``df`` from each row's ``body`` text.

    Args:
        df: DataFrame with a ``body`` column. It is modified in place.

    Returns:
        The same ``df``. For every row, elements 0, 1 and 2 of the ``mean``
        returned by ``sentiment_applier`` are stored in ``prob_posi``,
        ``prob_nega`` and ``prob_neut``, and the sentiment value in
        ``sentiment``.
    """
    scores = df['body'].apply(sentiment_applier)

    # Walk the real index labels. The previous range(len(...)) counter was
    # used as a label, which raised KeyError or silently appended rows when
    # the index was not exactly 0..n-1.
    for label, (mean, sentiment) in scores.items():
        df.at[label, 'prob_posi'] = float(mean[0])
        df.at[label, 'prob_nega'] = float(mean[1])
        df.at[label, 'prob_neut'] = float(mean[2])
        df.at[label, 'sentiment'] = int(sentiment)

    return df

In [46]:
# df_new = sentiment_poster(df)
# df_new

## Test - Successful!

In [1]:
import ws_yahoo as wsy
import finbert as fb

In [2]:
# Same pipeline as above, this time called through the imported `ws_yahoo`
# module (presumably the functions of this notebook saved as a module —
# TODO confirm).
ticker = 'HBM' #.TO
start = '2021-01-01'
end = '2022-11-25'

df = wsy.dataframe_price_sentiment(
    ticker,
    start,
    end
)

# Last expression of the cell: displays the merged frame.
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target,source,headline,url,body,date,prob_posi,prob_nega,prob_neut,sentiment
0,2022-11-02,5.72,5.79,5.42,5.46,5.46,1919000,0,GlobeNewswire,Hudbay Provides Exploration Update and Announc...,https://ca.finance.yahoo.com/news/hudbay-provi...,Figure 1: General Location of the Llaguen Pro...,2022-11-02,0.0,0.0,0.0,0
1,2022-11-02,5.72,5.79,5.42,5.46,5.46,1919000,0,GlobeNewswire,Hudbay Announces Third Quarter 2022 Results,https://ca.finance.yahoo.com/news/hudbay-annou...,"TORONTO, Nov. 02, 2022 (GLOBE NEWSWIRE) -- Hu...",2022-11-02,0.0,0.0,0.0,0
2,2022-11-03,5.24,5.93,5.24,5.87,5.87,2530200,1,Zacks,"HudBay Minerals (HBM) Reports Q3 Loss, Tops Re...",https://ca.finance.yahoo.com/news/hudbay-miner...,HudBay Minerals (HBM) came out with a quarter...,2022-11-03,0.0,0.0,0.0,0
3,2022-11-11,7.3,7.43,7.16,7.27,7.27,1277800,0,Simply Wall St.,Hudbay Minerals Inc.'s (TSE:HBM) Financials Ar...,https://ca.finance.yahoo.com/news/hudbay-miner...,Most readers would already be aware that Hudb...,2022-11-11,0.0,0.0,0.0,0


In [3]:
# Score every article body; uses the sentiment_poster exposed by the imported
# `finbert` module rather than the function defined in this notebook.
df_final = fb.sentiment_poster(df)

Token indices sequence length is longer than the specified maximum sequence length for this model (10848 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (11902 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (882 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1123 > 512). Running this sequence through the model will result in indexing errors


In [4]:
df_final

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target,source,headline,url,body,date,prob_posi,prob_nega,prob_neut,sentiment
0,2022-11-02,5.72,5.79,5.42,5.46,5.46,1919000,0,GlobeNewswire,Hudbay Provides Exploration Update and Announc...,https://ca.finance.yahoo.com/news/hudbay-provi...,Figure 1: General Location of the Llaguen Pro...,2022-11-02,0.096997,0.043384,0.859619,2
1,2022-11-02,5.72,5.79,5.42,5.46,5.46,1919000,0,GlobeNewswire,Hudbay Announces Third Quarter 2022 Results,https://ca.finance.yahoo.com/news/hudbay-annou...,"TORONTO, Nov. 02, 2022 (GLOBE NEWSWIRE) -- Hu...",2022-11-02,0.186034,0.290473,0.523494,2
2,2022-11-03,5.24,5.93,5.24,5.87,5.87,2530200,1,Zacks,"HudBay Minerals (HBM) Reports Q3 Loss, Tops Re...",https://ca.finance.yahoo.com/news/hudbay-miner...,HudBay Minerals (HBM) came out with a quarter...,2022-11-03,0.101192,0.266295,0.632513,2
3,2022-11-11,7.3,7.43,7.16,7.27,7.27,1277800,0,Simply Wall St.,Hudbay Minerals Inc.'s (TSE:HBM) Financials Ar...,https://ca.finance.yahoo.com/news/hudbay-miner...,Most readers would already be aware that Hudb...,2022-11-11,0.053361,0.558811,0.387828,1


In [10]:
def fv_get_page(ticker, timeout=30):
    """Download the FinViz quote page of ``ticker`` as a BeautifulSoup doc.

    Args:
        ticker: Symbol sent as the ``t`` query parameter of the quote URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The response content parsed with the ``html.parser`` backend. A
        non-OK response is still parsed; only its status code is printed,
        mirroring ``get_page``.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # Browser-like headers sent with the request (presumably FinViz rejects
    # the default client user agent — TODO confirm).
    headers = {
        'User-Agent': 'Mozilla/5.0',
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }

    # Passing the query through `params` lets requests URL-encode the ticker.
    response = requests.get(
        'https://finviz.com/quote.ashx',
        params={'t': ticker, 'p': 'd'},
        headers=headers,
        timeout=timeout,
    )
    if not response.ok:
        print('Status code:', response.status_code)

    return bs(response.content, "html.parser")

In [11]:
def fv_get_news_tags(doc):
    """Return the first ``table`` in ``doc`` whose id is ``news-table``.

    The value is whatever ``doc.find`` yields for that lookup.
    """
    return doc.find('table', {'id': 'news-table'})

In [12]:
def fv_parse_news(news_tag):
    """Extract headline and link from one FinViz news tag.

    Args:
        news_tag: Tag-like object; its first ``a`` supplies both values.

    Returns:
        dict with the keys ``'headline'`` (anchor text) and ``'url'`` (the
        ``href`` exactly as found; no base URL is prepended).
    """
    # The source lookup via find('div') was never part of the result and
    # raised AttributeError on tags without a div, so it is gone.
    link = news_tag.find('a')
    return {
        'headline': link.text,
        'url': link['href'],
    }

In [36]:
def fv_get_article_body(url):
    """Download the article at ``url`` and return its body text.

    Args:
        url: Full address of the article page.

    Returns:
        The string produced by ``parse_news_article`` for the page's
        ``caas-body`` tags.
    """
    # get_page fetches an arbitrary URL. fv_get_page treats its argument as
    # a ticker symbol and would request a FinViz quote page instead of the
    # article.
    doc = get_page(url)
    article = parse_paragraphs(doc)

    return parse_news_article(article)

In [13]:
def scrape_finviz2(ticker):
    """Scrape the FinViz news table of ``ticker`` into a DataFrame.

    Args:
        ticker: Symbol whose FinViz quote page is downloaded.

    Returns:
        DataFrame with the columns ``headline`` and ``url``, one row per
        linked anchor inside the ``news-table`` table; empty when the page
        has no such table.
    """
    # Use the FinViz helpers. The earlier version appended the ticker to a
    # URL that already contained it and parsed the page with the Yahoo
    # helpers, which cannot match FinViz markup.
    doc = fv_get_page(ticker)
    news_table = fv_get_news_tags(doc)

    rows = []
    if news_table is not None:
        for anchor in news_table.find_all('a', href=True):
            rows.append({'headline': anchor.text, 'url': anchor['href']})

    return pd.DataFrame(rows, columns=['headline', 'url'])

In [48]:
def fv_dataframe(news_list):
    """Collect the Yahoo Finance links of a FinViz news table.

    Args:
        news_list: Tag-like object whose ``a`` descendants are scanned (the
            table returned by ``fv_get_news_tags``), or ``None``.

    Returns:
        DataFrame with the columns ``url``, ``headline`` and ``source``, one
        row per anchor whose ``href`` contains ``'finance.yahoo'``.
        ``source`` is not extracted and stays missing. The index is a
        consecutive 0..n-1 range, where the earlier row-by-row construction
        left gaps for skipped anchors. An empty frame is returned when
        ``news_list`` is ``None``.
    """
    columns = ['url', 'headline', 'source']
    if news_list is None:
        return pd.DataFrame(columns=columns)

    records = []
    for anchor in news_list.find_all('a'):
        url = anchor.get('href')  # None for anchors without an href
        if url and 'finance.yahoo' in url:
            records.append({'url': url, 'headline': anchor.text})

    # Build the frame once instead of enlarging it one cell at a time.
    return pd.DataFrame(records, columns=columns)


In [None]:
def get_article_date(url):
    """Fetch an article page and return its publication timestamp string.

    NOTE(review): this notebook defines ``get_article_date`` twice; on a
    top-to-bottom run this later definition is the one in effect.

    Args:
        url: Address of the article page (downloaded with ``get_page``).

    Returns:
        The ``datetime`` attribute of the first ``time`` tag with class
        ``caas-attr-meta-time``, or ``None`` when the page has no such tag
        (previously an IndexError).
    """
    doc = get_page(url)
    # No local named ``date``, so the imported ``date`` class is not shadowed.
    stamps = doc.find_all('time', {'class': 'caas-attr-meta-time'})
    if not stamps:
        return None

    return stamps[0].get('datetime')

In [None]:
def get_article_body(url):
    """Return the body text of the article found at ``url``.

    The page is downloaded with ``get_page``, its ``caas-body`` tags are
    collected by ``parse_paragraphs`` and flattened to a string by
    ``parse_news_article``.
    """
    return parse_news_article(parse_paragraphs(get_page(url)))

In [None]:
def dataframe_prep(news_df, max_articles=4):
    """Add article body, publication date and blank sentiment columns.

    NOTE(review): this notebook defines ``dataframe_prep`` twice; on a
    top-to-bottom run this later definition is the one in effect.

    Args:
        news_df: DataFrame with a ``'url'`` column. It is modified in place.
        max_articles: Number of leading rows whose article is downloaded;
            ``None`` downloads every row. Defaults to 4, the limit that used
            to be hard-coded here. Rows beyond the limit keep missing values
            in ``body`` and ``date``.

    Returns:
        The same ``news_df`` object with the added columns ``body``, ``date``
        (``datetime.date`` values), ``prob_posi``/``prob_nega``/``prob_neut``
        (all 0.0) and ``sentiment`` (all 0).
    """
    # .iloc makes the "first N rows" selection explicitly positional.
    urls = news_df['url'].iloc[:max_articles]
    news_df['body'] = urls.apply(get_article_body)
    news_df['date'] = urls.apply(get_article_date)
    news_df['date'] = pd.to_datetime(news_df['date'], format='%Y-%m-%dT%H:%M:%S.%fZ').dt.date
    # Placeholders to be overwritten later by the sentiment step.
    news_df[['prob_posi', 'prob_nega', 'prob_neut']] = 0.0
    news_df['sentiment'] = 0

    return news_df

#### Working with FinViz scraping

In [66]:
# FinViz route: download the quote page, take its news table, keep the links
# that point to finance.yahoo, then download the article bodies and dates
# (dataframe_prep only processes the leading rows of the frame).
doc = fv_get_page('HBM')
news_list = fv_get_news_tags(doc)
df_fv = fv_dataframe(news_list)
df_final = dataframe_prep(df_fv)

  news_df['body'] = news_df['url'][:4].apply(lambda x: get_article_body(x))


-----------------
-----------------
-----------------
-----------------


  news_df['date'] = news_df['url'][:4].apply(lambda x: get_article_date(x))


In [70]:
# Score only the first four rows, matching the four articles downloaded by
# dataframe_prep above.
# NOTE(review): a slice of df_final is passed to a function that appears to
# write into its argument — consider passing an explicit copy; confirm.
df_fv_final = fb.sentiment_poster(df_final[:4])

Token indices sequence length is longer than the specified maximum sequence length for this model (1123 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2323 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (882 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (10848 > 512). Running this sequence through the model will result in indexing errors


In [72]:
df_fv_final

Unnamed: 0,url,headline,body,date,prob_posi,prob_nega,prob_neut,sentiment
0,https://finance.yahoo.com/news/hudbay-minerals...,Hudbay Minerals Inc.'s (TSE:HBM) Financials Ar...,Most readers would already be aware that Hudb...,2022-11-11,0.053361,0.558811,0.387828,1
1,https://finance.yahoo.com/news/10-best-copper-...,10 Best Copper Stocks to Buy Now,"In this article, we discuss the 10 best coppe...",2022-11-08,0.396365,0.232585,0.37105,0
2,https://finance.yahoo.com/news/hudbay-minerals...,"HudBay Minerals (HBM) Reports Q3 Loss, Tops Re...",HudBay Minerals (HBM) came out with a quarter...,2022-11-03,0.101192,0.266295,0.632513,2
3,https://finance.yahoo.com/news/hudbay-provides...,Hudbay Provides Exploration Update and Announc...,Figure 1: General Location of the Llaguen Pro...,2022-11-02,0.096997,0.043384,0.859619,2
