# 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers





In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [3]:
# Checkpoint identifier handed to from_pretrained for both tokenizer and model.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Tokenizer and model are loaded from the same checkpoint name so they match.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [4]:
# Single-article smoke test: download one page and collect all of its <p> tags.
url = "https://finance.yahoo.com/news/voyager-digital-announces-participation-upcoming-120000731.html"
# The timeout keeps this cell from hanging forever if the server never responds.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [5]:
paragraphs[0].text

'Thank you for your patience.'

In [6]:
# Join the text of every paragraph into one string, then keep only the first
# 400 space-separated words; ARTICLE is what gets tokenized further down.
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [7]:
ARTICLE

'Thank you for your patience. Our engineers are working quickly to resolve the issue.'

In [8]:
# Encode the article as PyTorch tensors. truncation=True clips the input to the
# tokenizer's maximum length, so a long article (400 words can tokenize to more
# tokens than the model accepts) cannot overflow the model's input limit.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
# Beam search with 5 beams; the generated summary is capped at max_length=55.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [9]:
summary

'We are aware of the issue and are working to resolve it.'

# 3. Building a News and Sentiment Pipeline

In [10]:
# Ticker symbols the news/sentiment pipeline in the following cells is run for.
monitored_tickers = ['VYGVF']

## 3.1. Search for Stock News using Google and Yahoo Finance

In [11]:
def search_for_stock_news_urls(ticker):
    """Return every href found on the Google News results page for a ticker.

    Args:
        ticker: Ticker symbol; it is inserted into a "yahoo finance <ticker>"
            news search query.

    Returns:
        list[str]: The raw href value of each anchor tag on the results page,
        unfiltered (navigation and settings links are included).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    # Timeout so an unresponsive server cannot hang the notebook indefinitely.
    r = requests.get(search_url, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those with
    # link['href'] would raise KeyError.
    atags = soup.find_all('a', href=True)
    hrefs = [link['href'] for link in atags]
    return hrefs

In [12]:
# Map each monitored ticker to the raw hrefs scraped from its search results page.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'VYGVF': ['/?sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQPAgE',
  '/search?q=yahoo+finance+VYGVF&tbm=nws&ie=UTF-8&gbv=1&sei=hn6GYZqgK8OEwbkPxJyU2A0',
  '/search?q=yahoo+finance+VYGVF&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUIBygA',
  '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUICSgC',
  '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUICigD',
  '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUICygE',
  'https://maps.google.com/maps?q=yahoo+finance+VYGVF&um=1&ie=UTF-8&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUIDCgF',
  '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUIDSgG',
  '/advanced_search',
  '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=nws&sour

In [13]:
raw_urls['VYGVF']

['/?sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQOwgC',
 '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQPAgE',
 '/search?q=yahoo+finance+VYGVF&tbm=nws&ie=UTF-8&gbv=1&sei=hn6GYZqgK8OEwbkPxJyU2A0',
 '/search?q=yahoo+finance+VYGVF&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUIBygA',
 '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUICSgC',
 '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUICigD',
 '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUICygE',
 'https://maps.google.com/maps?q=yahoo+finance+VYGVF&um=1&ie=UTF-8&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUIDCgF',
 '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwia-4vB54P0AhVDQjABHUQOBdsQ_AUIDSgG',
 '/advanced_search',
 '/search?q=yahoo+finance+VYGVF&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=

## 3.2. Strip out unwanted URLs

In [14]:
import re

In [15]:
# Substrings that mark an href as unwanted; any URL containing one is dropped.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [16]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract the unique absolute URLs embedded in raw search-result hrefs.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings; an href containing any of them is discarded.

    Returns:
        list[str]: De-duplicated URLs in first-seen order, each cut at the
        first '&' so trailing query parameters are dropped.
    """
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            # First http(s) URL inside the href (hrefs may wrap the target URL).
            match = re.search(r'https?://\S+', url)
            # A bare 'https://' with nothing after it yields no match; skip it
            # instead of failing.
            if match:
                val.append(match.group(0).split('&')[0])
    # dict.fromkeys removes duplicates while keeping insertion order, so the
    # result is the same on every run (a set would reorder it arbitrarily).
    return list(dict.fromkeys(val))

In [17]:
# Filter and de-duplicate the raw hrefs of every monitored ticker.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'VYGVF': ['https://finance.yahoo.com/news/voyager-digital-market-rebellion-announce-120000107.html',
  'https://finance.yahoo.com/news/voyager-digital-acquires-leading-global-130000594.html',
  'https://finance.yahoo.com/news/voyager-digital-announces-participation-september-120000518.html',
  'https://finance.yahoo.com/news/voyager-digital-becomes-official-cryptocurrency-154300849.html',
  'https://finance.yahoo.com/news/voyager-digital-announces-conditional-approval-120000242.html',
  'https://finance.yahoo.com/news/voyager-digital-provides-business-april-120000956.html',
  'https://finance.yahoo.com/news/voyager-digital-secures-75-million-114500730.html',
  'https://finance.yahoo.com/news/voyager-digital-announces-record-quarterly-200500164.html',
  'https://finance.yahoo.com/news/voyager-digital-business-quarter-ended-120000046.html',
  'https://finance.yahoo.com/news/voyager-digital-reports-revenue-us-121700287.html']}

## 3.3. Search and Scrape Cleaned URLs

In [18]:
def scrape_and_process(URLs, max_words=250, timeout=10):
    """Download each URL and return the leading words of its paragraph text.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated words kept per page
            (defaults to 250, the limit that used to be hard-coded).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list[str]: One truncated text per URL, in the same order as URLs.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    ARTICLES = []
    for url in URLs:
        # Timeout so a single unresponsive page cannot stall the whole loop.
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        # Split on single spaces and keep only the first max_words pieces.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [19]:
# Scrape the cleaned URLs of every monitored ticker into truncated article texts.
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'VYGVF': ['Thank you for your patience. Our engineers are working quickly to resolve the issue.',
  'Thank you for your patience. Our engineers are working quickly to resolve the issue.',
  'Thank you for your patience. Our engineers are working quickly to resolve the issue.',
  ' Voyager is the first international partner of the Dallas Mavericks, joining forces to make crypto more accessible for all TSX: VOYGOTCQX: VYGVFBorse Frankfurt: UCD2 NEW YORK, Oct. 27, 2021 /CNW/ - Voyager Digital Ltd. ("Voyager" or the "Company") (TSX: VOYG) (OTCQX: VYGVF) (FRA: UCD2), one of the fastest-growing, publicly traded cryptocurrency platforms in the United States, has entered into a five-year exclusive, integrated partnership with the Dallas Mavericks, becoming the team\'s first cryptocurrency brokerage and international partner. A press conference will be hosted in Dallas today at 4:00 p.m. Central Time to discuss the partnership. To watch, please visit: https://www.mavs.com/voyager/. Voyager and

In [20]:
articles['VYGVF'][2]

'Thank you for your patience. Our engineers are working quickly to resolve the issue.'

## 3.4. Summarize All Articles

In [21]:
def summarize(articles):
    """Summarize each article with the notebook's global tokenizer and model.

    Args:
        articles: Iterable of article strings.

    Returns:
        list[str]: One decoded summary per article, in input order.
    """
    summaries = []
    for article in articles:
        # truncation=True clips over-long articles to the tokenizer's maximum
        # length instead of passing more tokens than the model can accept.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # Beam search with 5 beams; output capped at max_length=55.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [22]:
# Summarize the scraped articles of every monitored ticker.
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'VYGVF': ['We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'One of the fastest-growing, publicly traded cryptocurrency platforms in the United States. partnership includes naming rights to the Mavs Gaming Hub',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Alameda to invest $75 million in Voyager. Partners to focus on execution, asset management, and broader crypto initiatives',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Fiscal 2021 was a breakout year for Voyager as crypto and related blockchain technologies increasingly embraced.']}

In [23]:
summaries['VYGVF']

['We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'One of the fastest-growing, publicly traded cryptocurrency platforms in the United States. partnership includes naming rights to the Mavs Gaming Hub',
 'We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'Alameda to invest $75 million in Voyager. Partners to focus on execution, asset management, and broader crypto initiatives',
 'We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'Fiscal 2021 was a breakout year for Voyager as crypto and related blockchain technologies increasingly embraced.']

# 4. Adding Sentiment Analysis

In [24]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the library's implicit
# default for 'sentiment-analysis', so results do not shift if that default
# changes between transformers releases.
sentiment = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [25]:
sentiment(summaries['VYGVF'])

[{'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.9991520047187805},
 {'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.9869519472122192},
 {'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.996300220489502}]

In [26]:
# Run the sentiment pipeline over each ticker's summaries; as the output below
# shows, every result is a dict with 'label' and 'score' keys.
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'VYGVF': [{'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9991520047187805},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9869519472122192},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.996300220489502}]}

In [27]:
print(summaries['VYGVF'][3], scores['VYGVF'][3]['label'], scores['VYGVF'][3]['score'])

One of the fastest-growing, publicly traded cryptocurrency platforms in the United States. partnership includes naming rights to the Mavs Gaming Hub POSITIVE 0.9991520047187805


In [28]:
scores['VYGVF'][0]['score']

0.9979087710380554

# 5. Exporting Results to CSV

In [29]:
summaries

{'VYGVF': ['We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'One of the fastest-growing, publicly traded cryptocurrency platforms in the United States. partnership includes naming rights to the Mavs Gaming Hub',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Alameda to invest $75 million in Voyager. Partners to focus on execution, asset management, and broader crypto initiatives',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Fiscal 2021 was a breakout year for Voyager as crypto and related blockchain technologies increasingly embraced.']}

In [30]:
scores

{'VYGVF': [{'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9991520047187805},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9869519472122192},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.996300220489502}]}

In [31]:
cleaned_urls

{'VYGVF': ['https://finance.yahoo.com/news/voyager-digital-market-rebellion-announce-120000107.html',
  'https://finance.yahoo.com/news/voyager-digital-acquires-leading-global-130000594.html',
  'https://finance.yahoo.com/news/voyager-digital-announces-participation-september-120000518.html',
  'https://finance.yahoo.com/news/voyager-digital-becomes-official-cryptocurrency-154300849.html',
  'https://finance.yahoo.com/news/voyager-digital-announces-conditional-approval-120000242.html',
  'https://finance.yahoo.com/news/voyager-digital-provides-business-april-120000956.html',
  'https://finance.yahoo.com/news/voyager-digital-secures-75-million-114500730.html',
  'https://finance.yahoo.com/news/voyager-digital-announces-record-quarterly-200500164.html',
  'https://finance.yahoo.com/news/voyager-digital-business-quarter-ended-120000046.html',
  'https://finance.yahoo.com/news/voyager-digital-reports-revenue-us-121700287.html']}

In [32]:
range(len(summaries['VYGVF']))

range(0, 10)

In [33]:
summaries['VYGVF'][3]

'One of the fastest-growing, publicly traded cryptocurrency platforms in the United States. partnership includes naming rights to the Mavs Gaming Hub'

In [34]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment results and URLs into rows.

    Tickers are taken from the keys of ``summaries`` rather than from a
    notebook-level global, so the function depends only on its arguments.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with 'label' and 'score'
            keys, aligned by position with the summaries.
        urls: dict mapping ticker -> list of URLs, aligned by position with
            the summaries.

    Returns:
        list[list]: One ``[ticker, summary, label, score, url]`` row per summary.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from scores or urls.
        IndexError: If scores or urls hold fewer entries than summaries.
    """
    output = []
    for ticker, ticker_summaries in summaries.items():
        for counter, summary in enumerate(ticker_summaries):
            score = scores[ticker][counter]
            output.append([
                ticker,
                summary,
                score['label'],
                score['score'],
                urls[ticker][counter],
            ])
    return output

In [35]:
# Build one [ticker, summary, label, score, url] row per summary.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['VYGVF',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979087710380554,
  'https://finance.yahoo.com/news/voyager-digital-market-rebellion-announce-120000107.html'],
 ['VYGVF',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979087710380554,
  'https://finance.yahoo.com/news/voyager-digital-acquires-leading-global-130000594.html'],
 ['VYGVF',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979087710380554,
  'https://finance.yahoo.com/news/voyager-digital-announces-participation-september-120000518.html'],
 ['VYGVF',
  'One of the fastest-growing, publicly traded cryptocurrency platforms in the United States. partnership includes naming rights to the Mavs Gaming Hub',
  'POSITIVE',
  0.9991520047187805,
  'https://finance.yahoo.com/news/voyager-digital-becomes-official-cryptocurrency-154300849.html'],
 ['VYGVF',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE

In [36]:
# Prepend the CSV header row only if it is not already first, so re-running
# this cell does not stack duplicate header rows onto final_output.
header_row = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != header_row:
    final_output.insert(0, header_row)

In [37]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['VYGVF',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979087710380554,
  'https://finance.yahoo.com/news/voyager-digital-market-rebellion-announce-120000107.html'],
 ['VYGVF',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979087710380554,
  'https://finance.yahoo.com/news/voyager-digital-acquires-leading-global-130000594.html'],
 ['VYGVF',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979087710380554,
  'https://finance.yahoo.com/news/voyager-digital-announces-participation-september-120000518.html'],
 ['VYGVF',
  'One of the fastest-growing, publicly traded cryptocurrency platforms in the United States. partnership includes naming rights to the Mavs Gaming Hub',
  'POSITIVE',
  0.9991520047187805,
  'https://finance.yahoo.com/news/voyager-digital-becomes-official-cryptocurrency-154300849.html'],
 ['VYGVF',
  'We are aware of 

In [38]:
import csv
# encoding='utf-8' makes the export independent of the platform's default text
# encoding, so non-ASCII characters in a summary cannot raise UnicodeEncodeError.
# newline='' lets the csv module control line endings itself.
# NOTE(review): the captured PermissionError below suggests the target file was
# not writable when this ran (e.g. open in another program) — confirm.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

PermissionError: [Errno 13] Permission denied: 'assetsummaries.csv'