# 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers





In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Set Up Summarization Model

In [3]:
# Checkpoint identifier handed to from_pretrained for both the tokenizer and the model.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Load the tokenizer and the generation model from the same checkpoint; both are
# reused by every summarization step later in the notebook.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# 3. Summarize a Single Article

In [4]:
# Download one Yahoo Finance article and collect every <p> element from its HTML.
# NOTE(review): the response status is never checked. The outputs recorded in this
# notebook show a "Thank you for your patience." placeholder being scraped instead
# of article text, so whatever the server returns is used as-is.
url = "https://finance.yahoo.com/news/bitcoin-crashes-below-50000-163721506.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [5]:
paragraphs[0].text

'Thank you for your patience.'

In [6]:
# Pull the text out of every paragraph element.
text = [paragraph.text for paragraph in paragraphs]
# Join the paragraphs, split on single spaces and keep the first 400 pieces
# (presumably to bound the length of the model input).
words = ' '.join(text).split(' ')[:400]
# Re-assemble the kept words into the string that is summarized below.
ARTICLE = ' '.join(words)

In [7]:
ARTICLE

'Thank you for your patience. Our engineers are working quickly to resolve the issue.'

In [8]:
# Encode the article as PyTorch tensors. truncation=True clips inputs longer than
# the tokenizer's maximum length instead of handing an over-long sequence to the model.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
# Beam search with 5 beams, capped at 55 generated tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text without special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [9]:
summary

'We are aware of the issue and are working to resolve it.'

# 4. Building a News and Sentiment Pipeline

In [10]:
# Ticker symbols to track; each one drives a news search and keys the per-ticker dicts built below.
monitored_tickers = ['ADA', 'ETH', 'BTC']

## 4.1. Search for Stock News using Google and Yahoo Finance

In [11]:
def search_for_stock_news_urls(ticker):
    """Return the raw link targets found on a Google News search for a ticker.

    Args:
        ticker: Symbol to search for; it is combined with "yahoo finance" to
            form the query.

    Returns:
        list of str: The ``href`` value of every anchor on the results page,
        unfiltered (relative Google links are included).
    """
    # Let requests build the query string so a ticker containing reserved
    # characters (e.g. '&') is encoded instead of corrupting the URL.
    r = requests.get(
        "https://www.google.com/search",
        params={"q": "yahoo finance {}".format(ticker), "tbm": "nws"},
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those with
    # link['href'] would raise KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [12]:
# Run the news search once per monitored ticker: maps ticker -> list of raw hrefs.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'ADA': ['/?sa=X&ved=0ahUKEwik5fnniu7yAhU3BGMBHR77CtkQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwik5fnniu7yAhU3BGMBHR77CtkQPAgE',
  '/search?q=yahoo+finance+ADA&tbm=nws&ie=UTF-8&gbv=1&sei=4v43YaTsKbeIjLsPnvaryA0',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwik5fnniu7yAhU3BGMBHR77CtkQ_AUIBygA',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwik5fnniu7yAhU3BGMBHR77CtkQ_AUICSgC',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwik5fnniu7yAhU3BGMBHR77CtkQ_AUICigD',
  'https://maps.google.com/maps?q=yahoo+finance+ADA&um=1&ie=UTF-8&sa=X&ved=0ahUKEwik5fnniu7yAhU3BGMBHR77CtkQ_AUICygE',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwik5fnniu7yAhU3BGMBHR77CtkQ_AUIDCgF',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwik5fnniu7yAhU3BGMBHR77CtkQ_AUIDSgG',
  '/advanced_search',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&s

In [13]:
raw_urls['BTC']

['/?sa=X&ved=0ahUKEwi11sDoiu7yAhUgAWMBHUpPBeMQOwgC',
 '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwi11sDoiu7yAhUgAWMBHUpPBeMQPAgE',
 '/search?q=yahoo+finance+BTC&tbm=nws&ie=UTF-8&gbv=1&sei=4_43YfXYM6CCjLsPyp6VmA4',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwi11sDoiu7yAhUgAWMBHUpPBeMQ_AUIBygA',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwi11sDoiu7yAhUgAWMBHUpPBeMQ_AUICSgC',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwi11sDoiu7yAhUgAWMBHUpPBeMQ_AUICigD',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwi11sDoiu7yAhUgAWMBHUpPBeMQ_AUICygE',
 'https://maps.google.com/maps?q=yahoo+finance+BTC&um=1&ie=UTF-8&sa=X&ved=0ahUKEwi11sDoiu7yAhUgAWMBHUpPBeMQ_AUIDCgF',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwi11sDoiu7yAhUgAWMBHUpPBeMQ_AUIDSgG',
 '/advanced_search',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwi11

## 4.2. Strip out unwanted URLs

In [14]:
import re

In [15]:
# Substrings that mark a link as unwanted; strip_unwanted_urls drops any URL containing one of them.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [16]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract the distinct absolute URLs embedded in a list of raw link targets.

    A link is kept only if it contains ``https://`` and none of the substrings
    in ``exclude_list``. From each kept link the embedded absolute URL is
    extracted and everything from the first ``&`` onward is dropped.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings; a link containing any of them is discarded.

    Returns:
        list of str: Unique cleaned URLs, in order of first appearance.
    """
    cleaned = []
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'(https?://\S+)', url)
        if match is None:
            # e.g. a bare 'https://' with nothing after it: nothing to extract.
            continue
        cleaned.append(match.group(1).split('&')[0])
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is stable between runs (the order of list(set(...)) varies with string hashing).
    return list(dict.fromkeys(cleaned))

In [17]:
# Filter each ticker's raw hrefs down to cleaned URLs: maps ticker -> list of URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'ADA': ['https://finance.yahoo.com/news/what-augusts-record-breaking-month-for-crypto-flows-means-for-bitcoin-163906276.html',
  'https://finance.yahoo.com/news/institutions-load-24m-eth-ada-142945976.html',
  'https://finance.yahoo.com/news/cardano-ada-expect-pullback-another-192023603.html',
  'https://finance.yahoo.com/news/ada-sol-lead-way-crypto-110019476.html',
  'https://finance.yahoo.com/video/explaining-crypto-price-plunge-212051886.html',
  'https://finance.yahoo.com/news/cardano-hits-3-mark-first-155147241.html',
  'https://finance.yahoo.com/news/cardano-launch-erc-20-ethereum-110017667.html',
  'https://finance.yahoo.com/news/cardaswap-finance-determined-turn-uniswap-180900658.html',
  'https://finance.yahoo.com/video/ethereum-hits-highest-levels-since-201441481.html',
  'https://finance.yahoo.com/news/cardano-ada-reaches-time-high-221801619.html'],
 'ETH': ['https://finance.yahoo.com/news/bitcoin-ether-war-standard-chartered-115729714.html',
  'https://finance.yahoo.com/n

## 4.3. Search and Scrape Cleaned URLs

In [18]:
def scrape_and_process(URLs, max_words=350, timeout=None):
    """Download each URL and return its paragraph text, cut to a word limit.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated words kept per page.
            Defaults to 350, the limit that used to be hard-coded here.
        timeout: Value forwarded to ``requests.get``. The default ``None``
            applies no timeout, matching the previous behaviour.

    Returns:
        list of str: One text per input URL, in input order. Entries are never
        skipped, so the result stays index-aligned with ``URLs``.
    """
    ARTICLES = []
    for url in URLs:
        # The response status is not inspected; whatever body comes back is parsed.
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        # Join the text of every <p> element, then cap the length by splitting
        # on single spaces and keeping the first max_words pieces.
        text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))
        ARTICLES.append(' '.join(text.split(' ')[:max_words]))
    return ARTICLES

In [19]:
# Scrape every cleaned URL: maps ticker -> list of article texts, in the same order as cleaned_urls[ticker].
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'ADA': [" Over the past month, the crypto market has looked like a rising tide for all coins from Bitcoin to Cardano — but data suggest growth across the asset class hasn’t been equal. Last week, Bitcoin (BTC-USD) breached $50,000 for the second time in two weeks, extending a rally that put a grim sell-off that started in May further in the rear-view mirror. While notable for its volatility, gains in the largest cryptocurrency may have gotten lost in the swell of rising prices across the entire asset class. With a majority of decentralized finance and non-fungible token (NFT) trading happening on the Ethereum (ETH-USD) blockchain, the second largest cryptocurrency by market capitalization rose by a third from $2,700 to $3,900, a growth rate 17 percent higher than BTC. And other blockchain-based currencies such as the third highest valued cryptocurrency, Cardano (ADA-USD) has more than doubled while a newer one, Solana (SOL-USD), has more than tripled in value over the past month. ADA 

In [20]:
articles['BTC'][2]

'Thank you for your patience. Our engineers are working quickly to resolve the issue.'

## 4.4. Summarize All Articles

In [21]:
def summarize(articles, max_length=55, num_beams=5):
    """Generate one summary per article with the notebook's tokenizer and model.

    Args:
        articles: Iterable of article texts.
        max_length: ``max_length`` passed to ``model.generate``. Defaults to
            55, the value that used to be hard-coded here.
        num_beams: ``num_beams`` passed to ``model.generate``. Defaults to 5,
            the value that used to be hard-coded here.

    Returns:
        list of str: Summaries in the same order as ``articles``.
    """
    summaries = []
    for article in articles:
        # truncation=True clips inputs longer than the tokenizer's maximum
        # length instead of handing an over-long sequence to the model.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )
        # Decode the first returned sequence back to text without special tokens.
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [22]:
# Summarize every scraped article: maps ticker -> list of summaries, in the same order as articles[ticker].
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'ADA': ['Cardano, Solana, Dogecoin among fastest-growing cryptos. Bitcoin IRA sees ‘record-breaking inflows’ of new accounts',
  'Ethereum, Cardano and Solana also see inflows. Bitcoin continues to see outflows for eighth consecutive week',
  'Is the recent low of $2.47 a 4th wave correction already over? Wave-b is now underway, targeting as high as $2.97-3.17',
  'Dogecoin, polkadot and ADA all fall more than 8% on debut in Japan. Bitcoin, Ether, Polkadot and SOL all trade lower',
  'We are aware of the issue and are working to resolve it.',
  'ADA has surpassed BNB to become third-largest cryptocurrency by market cap. Network prepares for mainnet launch later this month',
  'Ethereum-based tokens to be migrated to Cardano. Hoskinson says more than 100 companies have already announced plans to use the tool',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Cardano’s native token has been on a tear in recent

In [23]:
summaries['BTC']

['Every Salvadoran to be offered a government-run digital wallet. Law comes amid protest and uncertainty about access to the cryptocurrency',
 'Cardano, Solana, Dogecoin among fastest-growing cryptos. Bitcoin IRA sees ‘record-breaking inflows’ of new accounts',
 'We are aware of the issue and are working to resolve it.',
 'Locals can now use the cryptocurrency in physical and online shops. El Salvador is the first country in the world to adopt Bitcoin',
 'The cryptocurrency market is growing due to an increase in desire for transparency in financial payment systems.',
 'We are aware of the issue and are working to resolve it.',
 'China, Europe trade data keep global recovery on track. Bitcoin briefly rises above $52,000 on wave of positive sentiment',
 'Wood: ‘He has been a gold bug, for most of his career',
 'Bill Miller’s Miller Opportunity Trust has recently invested in the coin. Bitcoin is closing in on its all-time high price of $55k per coin',
 'Chartered bank Vast Bank becomes f

# 5. Adding Sentiment Analysis

In [24]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the task default, so the
# labels and scores do not shift if the library's default model changes.
sentiment = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [25]:
sentiment(summaries['ETH'])

[{'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.9976463317871094},
 {'label': 'NEGATIVE', 'score': 0.9816083908081055},
 {'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'POSITIVE', 'score': 0.9979087710380554},
 {'label': 'NEGATIVE', 'score': 0.9996261596679688},
 {'label': 'NEGATIVE', 'score': 0.9699082970619202},
 {'label': 'POSITIVE', 'score': 0.9861598610877991},
 {'label': 'NEGATIVE', 'score': 0.9868157505989075},
 {'label': 'NEGATIVE', 'score': 0.9837801456451416}]

In [26]:
# Score every summary: maps ticker -> list of {'label', 'score'} dicts, in the same order as summaries[ticker].
# NOTE(review): placeholder summaries ("We are aware of the issue...") are scored as well; see the outputs below.
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'ADA': [{'label': 'POSITIVE', 'score': 0.9976463317871094},
  {'label': 'POSITIVE', 'score': 0.9340047836303711},
  {'label': 'NEGATIVE', 'score': 0.9983871579170227},
  {'label': 'NEGATIVE', 'score': 0.9990785717964172},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.8167056441307068},
  {'label': 'NEGATIVE', 'score': 0.9975221157073975},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'NEGATIVE', 'score': 0.962354302406311}],
 'ETH': [{'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9976463317871094},
  {'label': 'NEGATIVE', 'score': 0.9816083908081055},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'NEGATIVE', 'score': 0.9996261596679688},
  {'label': 'NEGATIVE', 'score': 0.9699082970619202},
  {'label': 'POSITIVE', 'score': 0.9861598610877991},
  {'label': 'N

In [27]:
print(summaries['ETH'][3], scores['ETH'][3]['label'], scores['ETH'][3]['score'])

We are aware of the issue and are working to resolve it. POSITIVE 0.9979087710380554


In [28]:
scores['ETH'][0]['score']

0.9979087710380554

# 6. Exporting Results to CSV

In [29]:
summaries

{'ADA': ['Cardano, Solana, Dogecoin among fastest-growing cryptos. Bitcoin IRA sees ‘record-breaking inflows’ of new accounts',
  'Ethereum, Cardano and Solana also see inflows. Bitcoin continues to see outflows for eighth consecutive week',
  'Is the recent low of $2.47 a 4th wave correction already over? Wave-b is now underway, targeting as high as $2.97-3.17',
  'Dogecoin, polkadot and ADA all fall more than 8% on debut in Japan. Bitcoin, Ether, Polkadot and SOL all trade lower',
  'We are aware of the issue and are working to resolve it.',
  'ADA has surpassed BNB to become third-largest cryptocurrency by market cap. Network prepares for mainnet launch later this month',
  'Ethereum-based tokens to be migrated to Cardano. Hoskinson says more than 100 companies have already announced plans to use the tool',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Cardano’s native token has been on a tear in recent

In [30]:
scores

{'ADA': [{'label': 'POSITIVE', 'score': 0.9976463317871094},
  {'label': 'POSITIVE', 'score': 0.9340047836303711},
  {'label': 'NEGATIVE', 'score': 0.9983871579170227},
  {'label': 'NEGATIVE', 'score': 0.9990785717964172},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.8167056441307068},
  {'label': 'NEGATIVE', 'score': 0.9975221157073975},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'NEGATIVE', 'score': 0.962354302406311}],
 'ETH': [{'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9976463317871094},
  {'label': 'NEGATIVE', 'score': 0.9816083908081055},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'NEGATIVE', 'score': 0.9996261596679688},
  {'label': 'NEGATIVE', 'score': 0.9699082970619202},
  {'label': 'POSITIVE', 'score': 0.9861598610877991},
  {'label': 'N

In [31]:
cleaned_urls

{'ADA': ['https://finance.yahoo.com/news/what-augusts-record-breaking-month-for-crypto-flows-means-for-bitcoin-163906276.html',
  'https://finance.yahoo.com/news/institutions-load-24m-eth-ada-142945976.html',
  'https://finance.yahoo.com/news/cardano-ada-expect-pullback-another-192023603.html',
  'https://finance.yahoo.com/news/ada-sol-lead-way-crypto-110019476.html',
  'https://finance.yahoo.com/video/explaining-crypto-price-plunge-212051886.html',
  'https://finance.yahoo.com/news/cardano-hits-3-mark-first-155147241.html',
  'https://finance.yahoo.com/news/cardano-launch-erc-20-ethereum-110017667.html',
  'https://finance.yahoo.com/news/cardaswap-finance-determined-turn-uniswap-180900658.html',
  'https://finance.yahoo.com/video/ethereum-hits-highest-levels-since-201441481.html',
  'https://finance.yahoo.com/news/cardano-ada-reaches-time-high-221801619.html'],
 'ETH': ['https://finance.yahoo.com/news/bitcoin-ether-war-standard-chartered-115729714.html',
  'https://finance.yahoo.com/n

In [32]:
range(len(summaries['ETH']))

range(0, 10)

In [33]:
summaries['ETH'][3]

'We are aware of the issue and are working to resolve it.'

In [34]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment results and URLs into table rows.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys, index-aligned with ``summaries[ticker]``.
        urls: dict mapping ticker -> list of URLs, index-aligned with
            ``summaries[ticker]``.

    Returns:
        list of list: Rows of ``[ticker, summary, label, score, url]``, grouped
        by ticker in the iteration order of ``summaries``.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or ``urls``.
        IndexError: If ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    # Iterate the tickers present in the argument rather than the notebook-level
    # monitored_tickers list, so the function depends only on its inputs.
    for ticker, ticker_summaries in summaries.items():
        for counter, summary in enumerate(ticker_summaries):
            result = scores[ticker][counter]
            output.append([
                ticker,
                summary,
                result['label'],
                result['score'],
                urls[ticker][counter],
            ])
    return output

In [35]:
# Build one [ticker, summary, label, score, url] row per summary for the CSV export.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['ADA',
  'Cardano, Solana, Dogecoin among fastest-growing cryptos. Bitcoin IRA sees ‘record-breaking inflows’ of new accounts',
  'POSITIVE',
  0.9976463317871094,
  'https://finance.yahoo.com/news/what-augusts-record-breaking-month-for-crypto-flows-means-for-bitcoin-163906276.html'],
 ['ADA',
  'Ethereum, Cardano and Solana also see inflows. Bitcoin continues to see outflows for eighth consecutive week',
  'POSITIVE',
  0.9340047836303711,
  'https://finance.yahoo.com/news/institutions-load-24m-eth-ada-142945976.html'],
 ['ADA',
  'Is the recent low of $2.47 a 4th wave correction already over? Wave-b is now underway, targeting as high as $2.97-3.17',
  'NEGATIVE',
  0.9983871579170227,
  'https://finance.yahoo.com/news/cardano-ada-expect-pullback-another-192023603.html'],
 ['ADA',
  'Dogecoin, polkadot and ADA all fall more than 8% on debut in Japan. Bitcoin, Ether, Polkadot and SOL all trade lower',
  'NEGATIVE',
  0.9990785717964172,
  'https://finance.yahoo.com/news/ada-sol-lead-

In [36]:
# Prepend the CSV header row once. The guard keeps a re-run of this cell from
# stacking duplicate header rows at the top of final_output.
header_row = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != header_row:
    final_output.insert(0, header_row)

In [37]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['ADA',
  'Cardano, Solana, Dogecoin among fastest-growing cryptos. Bitcoin IRA sees ‘record-breaking inflows’ of new accounts',
  'POSITIVE',
  0.9976463317871094,
  'https://finance.yahoo.com/news/what-augusts-record-breaking-month-for-crypto-flows-means-for-bitcoin-163906276.html'],
 ['ADA',
  'Ethereum, Cardano and Solana also see inflows. Bitcoin continues to see outflows for eighth consecutive week',
  'POSITIVE',
  0.9340047836303711,
  'https://finance.yahoo.com/news/institutions-load-24m-eth-ada-142945976.html'],
 ['ADA',
  'Is the recent low of $2.47 a 4th wave correction already over? Wave-b is now underway, targeting as high as $2.97-3.17',
  'NEGATIVE',
  0.9983871579170227,
  'https://finance.yahoo.com/news/cardano-ada-expect-pullback-another-192023603.html'],
 ['ADA',
  'Dogecoin, polkadot and ADA all fall more than 8% on debut in Japan. Bitcoin, Ether, Polkadot and SOL all trade lower',
  'NEGATIVE',
  0.99907857179

In [38]:
import csv
# Write explicitly as UTF-8: the summaries contain non-ASCII punctuation (e.g. curly
# quotes) and the platform default encoding is not guaranteed to handle it.
# newline='' lets the csv module control line endings itself.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)