In [1]:
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse
from urllib.request import Request, urlopen
import urllib
import re
import requests
import csv
import pandas as pd
from datetime import datetime, date
import yfinance as yf

## Reference: website with the directions followed in this notebook
<https://blog.jovian.ai/web-scraping-yahoo-finance-using-python-7c4612fab70c>

### Main Yahoo Finance Webscraping

In [2]:
def get_page(url, timeout=10):
    """Download a webpage and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 10 so that a stalled connection cannot hang the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with ``'html.parser'``. The body is parsed
        even when the server answers with an error status; in that case the
        status code is printed first.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures or when ``timeout`` expires.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Best-effort: report the failure but still return whatever HTML the
        # server sent instead of aborting the whole scrape.
        print('Status code:', response.status_code)
    return bs(response.text, 'html.parser')

In [3]:
def get_news_tags(doc):
    """Collect the ``<div>`` elements that wrap the news items of a page.

    ``doc`` is searched with ``find_all`` for ``div`` tags carrying the class
    string below, and the result of that search is returned unchanged.
    """
    # Class string of the per-story div tag.
    story_div_attrs = {'class': "Ov(h) Pend(44px) Pstart(25px)"}
    return doc.find_all('div', story_div_attrs)

In [4]:
def parse_news(news_tag):
    """Extract source, headline and absolute URL from one news tag.

    Parameters
    ----------
    news_tag : tag-like
        One element returned by ``get_news_tags``; it must expose ``find``.
        The text of its first ``<div>`` is used as the source, and its first
        ``<a>`` supplies both the headline text and the ``href``.

    Returns
    -------
    dict
        Keys ``'source'``, ``'headline'`` and ``'url'``.
    """
    # Local import: urljoin is not part of the notebook's import cell.
    from urllib.parse import urljoin

    base_url = 'https://ca.finance.yahoo.com'
    source_tag = news_tag.find('div')
    link_tag = news_tag.find('a')  # looked up once; carries headline and href
    return {
        'source': source_tag.text,
        'headline': link_tag.text,
        # urljoin resolves site-relative hrefs against base_url and leaves
        # already-absolute hrefs untouched. Plain concatenation would turn an
        # absolute href into "https://ca.finance.yahoo.comhttps://...".
        'url': urljoin(base_url, link_tag['href']),
    }

In [5]:
def scrape_yahoo_news(ticker):
    """Scrape the Yahoo Finance quote page of ``ticker`` into a DataFrame.

    The quote page is downloaded with ``get_page``, its news tags are located
    with ``get_news_tags`` and each tag is turned into a record by
    ``parse_news``. The returned DataFrame has one row per news tag; nothing
    is written to disk.
    """
    quote_page = get_page('https://ca.finance.yahoo.com/quote/' + ticker)

    records = []
    for tag in get_news_tags(quote_page):
        records.append(parse_news(tag))

    return pd.DataFrame(records)

### Individual Article Page Scraping Functions

In [6]:
def parse_paragraphs(doc):
    """Return the ``<div class="caas-body">`` elements found in ``doc``.

    The result of ``doc.find_all`` is passed back unchanged.
    """
    body_div_attrs = {'class': "caas-body"}  # class name of the div tag
    return doc.find_all('div', body_div_attrs)

In [7]:
def parse_news_article(news_tag):
    """Join the paragraph text of an article body into one string.

    Parameters
    ----------
    news_tag : sequence
        Result of ``parse_paragraphs``; only its first element is read.

    Returns
    -------
    str
        The text of every ``<p>`` in the first element, each prefixed with a
        single space (so a non-empty result starts with a space), with all
        double-quote characters removed. An empty string is returned when
        ``news_tag`` is empty.
    """
    if not news_tag:
        # No body container was found on the page: report "no text" instead
        # of raising IndexError on news_tag[0].
        return ''

    paragraphs = news_tag[0].find_all('p')
    body_string = ''.join(' ' + paragraph.text for paragraph in paragraphs)
    return body_string.replace('"', '')

In [8]:
def get_article_date(url):
    """Fetch an article and return its timestamp string.

    Parameters
    ----------
    url : str
        Address of the article page, downloaded with ``get_page``.

    Returns
    -------
    str or None
        The ``datetime`` attribute of the first
        ``<time class="caas-attr-meta-time">`` element, or ``None`` when the
        page contains no such element.
    """
    date_class = 'caas-attr-meta-time'  # class name of the time stamp
    doc = get_page(url)

    time_tag = doc.find('time', {'class': date_class})
    if time_tag is None:
        # Missing timestamp: return None rather than raising IndexError, so
        # one odd page does not abort a whole DataFrame.apply.
        return None

    # Returned directly; the former local named `date` shadowed the
    # `date` class imported from datetime.
    return time_tag.get('datetime')

In [9]:
def get_article_body(url):
    """Download the article at ``url`` and return its body text.

    Chains ``get_page`` -> ``parse_paragraphs`` -> ``parse_news_article``.
    """
    return parse_news_article(parse_paragraphs(get_page(url)))

In [10]:
def dataframe_prep(news_df, max_articles=2):
    """Add article body, date and empty sentiment columns to ``news_df``.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Must contain a ``'url'`` column. It is modified in place and also
        returned.
    max_articles : int or None, optional
        How many leading rows have their article downloaded. Defaults to 2,
        the limit that used to be hard-coded; ``None`` downloads every row.
        Rows beyond the limit get missing ``body`` and ``date`` values.

    Returns
    -------
    pandas.DataFrame
        ``news_df`` with the columns ``body``, ``date`` (``datetime.date``
        values), ``prob_posi``, ``prob_nega``, ``prob_neut`` (all 0.0) and
        ``sentiment`` (all 0).
    """
    urls = news_df['url'].iloc[:max_articles]

    # Each helper downloads the article page itself, so every selected URL is
    # requested twice (once for the body, once for the date).
    news_df['body'] = urls.apply(get_article_body)
    news_df['date'] = urls.apply(get_article_date)
    news_df['date'] = pd.to_datetime(news_df['date'], format='%Y-%m-%dT%H:%M:%S.%fZ').dt.date

    # Placeholders that sentiment_poster overwrites later.
    news_df[['prob_posi', 'prob_nega', 'prob_neut']] = 0.0
    news_df['sentiment'] = 0

    return news_df

### Scrape Yahoo Finance, Individual Pages, Create Sentiment Columns

In [11]:
def stock_price(ticker, start, end):
    """Download the price history of ``ticker`` and label each row.

    The frame returned by ``yf.download`` gains an integer ``target`` column
    (1 when ``Close`` is above ``Open``, 0 when it is below or equal). Its
    index is then moved into a regular column and ``Date`` is converted to
    ``datetime.date`` values.
    """
    prices = yf.download(ticker, start=start, end=end, progress=False)

    opened, closed = prices['Open'], prices['Close']
    # Down days and flat days both map to 0; only up days map to 1.
    prices.loc[opened >= closed, 'target'] = 0
    prices.loc[opened < closed, 'target'] = 1
    prices['target'] = prices['target'].astype('int')

    prices = prices.reset_index()
    prices['Date'] = pd.to_datetime(prices['Date'], format='%Y-%m-%d').dt.date

    return prices

In [12]:
def dataframe_price_sentiment(ticker, start, end):
    """Join the price history of ``ticker`` with its scraped news.

    Prices come from ``stock_price``; news rows come from
    ``scrape_yahoo_news`` after ``dataframe_prep``. The two frames are merged
    where the price ``Date`` equals the news ``date`` (default merge type).
    """
    prices = stock_price(ticker, start, end)
    news = dataframe_prep(scrape_yahoo_news(ticker))
    return pd.merge(prices, news, left_on='Date', right_on='date')

In [13]:
# Ticker symbol and date window passed to dataframe_price_sentiment below.
ticker = 'FM.TO'
start = '2021-01-01'
end = '2022-11-25'

# Price rows merged with the scraped news rows that share the same date.
df = dataframe_price_sentiment(
    ticker,
    start,
    end
)

# Display the merged frame.
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target,source,headline,url,body,date,prob_posi,prob_nega,prob_neut,sentiment
0,2022-11-11,33.0,34.849998,32.529999,33.560001,33.560001,4618400,1,Bloomberg,China Has Links to Dozens of Canadian Miners T...,https://ca.finance.yahoo.com/news/china-links-...,(Bloomberg) -- China has built up stakes in m...,2022-11-11,0.0,0.0,0.0,0
1,2022-11-22,30.790001,31.440001,30.059999,31.17,31.17,1707400,1,CNW Group,VOX ACQUIRES AUSTRALIAN GOLD ROYALTY OVER CARD...,https://ca.finance.yahoo.com/news/vox-acquires...,"TORONTO, Nov. 22, 2022 /CNW/ - Vox Royalty Co...",2022-11-22,0.0,0.0,0.0,0


## NLP Work

### Sentiment Analysis

In [14]:
import finbert as fb

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
def sentiment_applier(df):
    """Run the ``finbert`` sentiment model on ``df``.

    ``df`` is handed unchanged to ``fb.sentiment_analysis``; the resulting
    probabilities are reduced by ``fb.get_mean_from_proba``. Only the first
    two of the three values it returns are passed on, as the tuple
    ``(mean, sentiment)``.
    """
    probabilities = fb.sentiment_analysis(df, bert_model='finbert')
    mean, sentiment, _ = fb.get_mean_from_proba(probabilities)
    return mean, sentiment

In [16]:
def sentiment_poster(df):
    """Score every article body and write the results into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'body'`` column. It is modified in place and also
        returned.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``prob_posi``, ``prob_nega`` and ``prob_neut`` set to the
        first, second and third entry of the mean returned by
        ``sentiment_applier``, and ``sentiment`` set to its second return
        value.
    """
    results = df['body'].apply(sentiment_applier)

    # Iterate by the results' own index labels so each score is written to
    # the row it was computed from. Counting 0..n-1 only matched the labels
    # when df carried a default index.
    for row_label, (mean, sentiment) in results.items():
        df.at[row_label, 'prob_posi'] = float(mean[0])
        df.at[row_label, 'prob_nega'] = float(mean[1])
        df.at[row_label, 'prob_neut'] = float(mean[2])
        df.at[row_label, 'sentiment'] = int(sentiment)

    return df

In [17]:
# Score the article bodies of `df`; the result is displayed in the next cell.
df_new = sentiment_poster(df)

Token indices sequence length is longer than the specified maximum sequence length for this model (946 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3873 > 512). Running this sequence through the model will result in indexing errors


In [18]:
# Display df_new, the result of `sentiment_poster(df)` from the previous cell.
df_new

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target,source,headline,url,body,date,prob_posi,prob_nega,prob_neut,sentiment
0,2022-11-11,33.0,34.849998,32.529999,33.560001,33.560001,4618400,1,Bloomberg,China Has Links to Dozens of Canadian Miners T...,https://ca.finance.yahoo.com/news/china-links-...,(Bloomberg) -- China has built up stakes in m...,2022-11-11,0.062349,0.440163,0.497488,2
1,2022-11-22,30.790001,31.440001,30.059999,31.17,31.17,1707400,1,CNW Group,VOX ACQUIRES AUSTRALIAN GOLD ROYALTY OVER CARD...,https://ca.finance.yahoo.com/news/vox-acquires...,"TORONTO, Nov. 22, 2022 /CNW/ - Vox Royalty Co...",2022-11-22,0.069084,0.034001,0.896915,2


## Test - Successful!

In [19]:
import ws_yahoo as wsy
import finbert as fb

In [20]:
# Same pipeline as above, but called through the `ws_yahoo` module (imported
# as `wsy`) and for a different ticker.
ticker = 'HBM.TO'
start = '2021-01-01'
end = '2022-11-25'

df = wsy.dataframe_price_sentiment(
    ticker,
    start,
    end
)

# Display the merged frame.
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target,source,headline,url,body,date,prob_posi,prob_nega,prob_neut,sentiment
0,2022-11-03,5.24,5.93,5.24,5.87,5.87,2530200,1,Zacks,"HudBay Minerals (HBM) Reports Q3 Loss, Tops Re...",https://ca.finance.yahoo.com/news/hudbay-miner...,HudBay Minerals (HBM) came out with a quarter...,2022-11-03,0.0,0.0,0.0,0
1,2022-11-11,7.3,7.43,7.16,7.27,7.27,1277800,0,Simply Wall St.,Hudbay Minerals Inc.'s (TSE:HBM) Financials Ar...,https://ca.finance.yahoo.com/news/hudbay-miner...,Most readers would already be aware that Hudb...,2022-11-11,0.0,0.0,0.0,0


In [21]:
# Module version of sentiment_poster — presumably the same logic as the
# function defined earlier in this notebook, moved into finbert.py (confirm).
df_final = fb.sentiment_poster(df)

Token indices sequence length is longer than the specified maximum sequence length for this model (882 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1123 > 512). Running this sequence through the model will result in indexing errors


In [22]:
# Display the frame returned by fb.sentiment_poster.
df_final

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,target,source,headline,url,body,date,prob_posi,prob_nega,prob_neut,sentiment
0,2022-11-03,5.24,5.93,5.24,5.87,5.87,2530200,1,Zacks,"HudBay Minerals (HBM) Reports Q3 Loss, Tops Re...",https://ca.finance.yahoo.com/news/hudbay-miner...,HudBay Minerals (HBM) came out with a quarter...,2022-11-03,0.101192,0.266295,0.632513,2
1,2022-11-11,7.3,7.43,7.16,7.27,7.27,1277800,0,Simply Wall St.,Hudbay Minerals Inc.'s (TSE:HBM) Financials Ar...,https://ca.finance.yahoo.com/news/hudbay-miner...,Most readers would already be aware that Hudb...,2022-11-11,0.053361,0.558811,0.387828,1


In [23]:
def fv_get_page(ticker, timeout=10):
    """Download the Finviz quote page of ``ticker`` and return it parsed.

    Parameters
    ----------
    ticker : str
        Symbol inserted into the ``t`` query parameter of the quote URL.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 10 so that a stalled connection cannot hang the notebook.

    Returns
    -------
    BeautifulSoup
        The response content parsed with ``"html.parser"``. As in
        ``get_page``, an error status is printed but the body is still
        parsed and returned.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures or when ``timeout`` expires.
    """
    # A browser-like User-Agent is sent with the request — presumably because
    # Finviz rejects the default one used by requests (confirm).
    headers = {'User-Agent': 'Mozilla/5.0'}

    base_url = f'https://finviz.com/quote.ashx?t={ticker}&p=d'
    response = requests.get(base_url, headers=headers, timeout=timeout)
    if not response.ok:
        print('Status code:', response.status_code)

    return bs(response.content, "html.parser")

In [24]:
def fv_get_news_tags(doc):
    """Return the ``<table id="news-table">`` element of ``doc``.

    This is a single ``find`` lookup, so the result is one element (or
    whatever ``find`` yields when nothing matches), not a list of tags.
    """
    table_attrs = {'id': 'news-table'}
    return doc.find('table', table_attrs)

In [25]:
def fv_parse_news(news_tag):
    """Extract headline and link from one Finviz news tag.

    Parameters
    ----------
    news_tag : tag-like
        Element exposing ``find``; its first ``<a>`` supplies the headline
        text and the ``href``.

    Returns
    -------
    dict
        Keys ``'headline'`` and ``'url'``. The URL is the ``href`` exactly as
        found; no base URL is prepended.
    """
    # The source lookup (first <div> text) was removed: its value was never
    # returned, yet it raised AttributeError on tags without a <div>.
    link_tag = news_tag.find('a')  # looked up once; carries headline and href
    return {
        'headline': link_tag.text,
        'url': link_tag['href'],
    }

In [26]:
def scrape_finviz2(ticker):
    """Scrape the Finviz news table of ``ticker`` into a DataFrame.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``fv_get_page``.

    Returns
    -------
    pandas.DataFrame
        Columns ``headline`` and ``url``, one row per ``<a>`` element inside
        the news table. The frame is empty when the page has no news table.
    """
    # fv_get_page builds the Finviz URL itself. The earlier version appended
    # the ticker to a URL that already contained it, then used the Yahoo
    # helpers, whose class lookup does not apply to Finviz markup.
    doc = fv_get_page(ticker)
    news_table = fv_get_news_tags(doc)
    if news_table is None:
        return pd.DataFrame(columns=['headline', 'url'])

    news_data = [
        {'headline': link.text, 'url': link['href']}
        for link in news_table.find_all('a')
    ]
    return pd.DataFrame(news_data, columns=['headline', 'url'])

In [27]:
# Fetch the Finviz page for HBM and pull out its news table.
doc = fv_get_page('HBM')
news_list = fv_get_news_tags(doc)  # a single table element, despite the name
# news_data = [fv_parse_news(news_tag) for news_tag in news_list]

In [28]:
# Every <a> element inside the news table.
url_list = news_list.find_all('a')
# NOTE(review): `url` is reassigned inside the loop of the next cell, so this
# alias looks unused — confirm before removing.
url = url_list

In [34]:
# Collect the Finviz links that point at finance.yahoo into a DataFrame.
df_finviz = pd.DataFrame()

# Create the two (still empty) columns up front.
df_finviz[['url', 'headline']] = ""

for n in range(len(url_list)):
        url = url_list[n]['href']
        headl = url_list[n].text
        # Only links whose href contains 'finance.yahoo' are kept. Rows are
        # labelled with the link's position n in url_list, so skipped links
        # leave gaps in the index.
        if 'finance.yahoo' in url:
            df_finviz.at[n, 'url'] = url
            df_finviz.at[n, 'headline'] = headl
        else:
            pass

# Display the collected links.
df_finviz


Unnamed: 0,url,headline
0,https://finance.yahoo.com/news/hudbay-minerals...,Hudbay Minerals Inc.'s (TSE:HBM) Financials Ar...
1,https://finance.yahoo.com/news/10-best-copper-...,10 Best Copper Stocks to Buy Now
2,https://finance.yahoo.com/news/hudbay-minerals...,"HudBay Minerals (HBM) Reports Q3 Loss, Tops Re..."
3,https://finance.yahoo.com/news/hudbay-provides...,Hudbay Provides Exploration Update and Announc...
4,https://finance.yahoo.com/news/hudbay-announce...,Hudbay Announces Third Quarter 2022 Results
...,...,...
93,https://finance.yahoo.com/news/hudbay-host-con...,Hudbay to Host Conference Call for First Quart...
94,https://finance.yahoo.com/news/hudbay-commence...,Hudbay Commences Pampacancha Pit Development
97,https://finance.yahoo.com/news/hudbay-announce...,Hudbay Announces Positive Preliminary Economic...
98,https://finance.yahoo.com/news/hudbay-announce...,Hudbay Announces Updated Constancia and Snow L...


In [None]:
# NOTE(review): `dictt` is not assigned anywhere in the visible notebook —
# presumably left over from a deleted cell; confirm or remove.
dictt

{'url': 'https://finance.yahoo.com/news/hudbay-announces-significant-discovery-copper-212000046.html',
 'headline': 'Hudbay Announces Significant New Discovery at its Copper World Properties Adjacent to Rosemont'}

In [None]:
# Text of the first <div> inside the news table.
news_list.find('div').text

"Hudbay Minerals Inc.'s (TSE:HBM) Financials Are Too Obscure To Link With Current Share Price Momentum: What's In Store For the Stock? Simply Wall St."

In [None]:
# NOTE(review): `headline` is not assigned anywhere in the visible notebook —
# presumably left over from a deleted cell; confirm or remove.
headline

"Hudbay Minerals Inc.'s (TSE:HBM) Financials Are Too Obscure To Link With Current Share Price Momentum: What's In Store For the Stock?"

In [None]:
# href of the first <a> inside the news table.
news_list.find('a')['href']

'https://finance.yahoo.com/news/10-best-copper-stocks-buy-161324595.html'

In [None]:
# Scratch cell: repeats by hand the steps of fv_get_page, fv_get_news_tags
# and fv_parse_news for the HBM page.
import pandas as pd

# HTML parsing and HTTP client (also imported in the first cell).
from bs4 import BeautifulSoup as bs
import requests

headers = {
    'User-Agent': 'Mozilla/5.0'}


base_url = 'https://finviz.com/quote.ashx?t=HBM&p=d'
html = requests.get(base_url, headers=headers)
soup = bs(html.content, "html.parser")
main_div = soup.find('table', attrs={'id': 'news-table'})
# NOTE(review): the recorded output of `news_list.find('div').text` earlier in
# the notebook shows headline and source together, so this is presumably not
# the source alone — confirm.
news_source = main_div.find('div').text #source
news_headline = main_div.find('a').text #heading
news_url = main_div.find('a')['href'] #link
# table = main_div.find('table')
# sub = table.findAll('tr')
# rows = sub[5].findAll('td')

In [None]:
# Display the news-table element found in the previous cell.
main_div

In [None]:
# Scratch cell: collect the link texts of one row of the Finviz news table.
import pandas as pd

# HTML parsing and HTTP client (also imported in the first cell).
from bs4 import BeautifulSoup as bs
import requests

headers = {
    'User-Agent': 'Mozilla/5.0'}


base_url = 'https://finviz.com/quote.ashx?t=HBM&p=d'
html = requests.get(base_url, headers=headers)
soup = bs(html.content, "html.parser")
# NOTE(review): other cells in this notebook locate id 'news-table' on a
# <table> element (see fv_get_news_tags). Looking it up as a <div> here may
# return None and make the next line fail — confirm.
main_div = soup.find('div', attrs={'id': 'news-table'})
table = main_div.find('table')
sub = table.findAll('tr')
# Only the sixth row (index 5) is inspected — TODO confirm why this row.
rows = sub[5].findAll('td')

data = []

# Keep the text of the <a> element of every cell that has one.
for row in rows:
    link = row.a
    if link is not None:

        data.append(link.get_text())

print(data)