# 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers





In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [3]:
# Identifier of the Pegasus checkpoint used for every summary in this notebook.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Load the matching tokenizer and conditional-generation model. Both are kept
# as notebook-level globals and reused by the summarization cells below.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# 3. Summarize a Single Article

In [4]:
# Download one article and collect every <p> element from its HTML.
url = "https://finance.yahoo.com/news/bitcoin-crashes-below-50000-163721506.html"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [5]:
paragraphs[1].text

"Bitcoin's (BTC-USD) price crashed on Tuesday afternoon, falling below the $50,000 (£36,152) mark, on the same day that El Salvador began accepting the crypto as legal tender."

In [6]:
# Pull the text out of every paragraph scraped above.
text = [paragraph.text for paragraph in paragraphs]
# Join the paragraphs and keep only the first 400 space-separated words
# (presumably to stay within the model's input limit -- TODO confirm).
words = ' '.join(text).split(' ')[:400]
# Re-join the kept words into the single string that is summarized below.
ARTICLE = ' '.join(words)

In [7]:
ARTICLE



In [8]:
# Encode the article as a PyTorch tensor of token ids. truncation=True clips the
# input to the tokenizer's maximum length, so an unusually long article cannot
# exceed what the model accepts.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
# Generate with 5-beam search; the generated sequence is capped at max_length=55.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text without special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [9]:
summary

'El Salvador becomes the first country in the world to accept the cryptocurrency.'

# 4. Building a News and Sentiment Pipeline

In [10]:
# Symbols to monitor; the per-ticker dictionaries built below are keyed by these.
monitored_tickers = ['ADA', 'ETH', 'BTC']

## 4.1. Search for Stock News using Google and Yahoo Finance

In [11]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Search Google News for Yahoo Finance stories about ``ticker``.

    Args:
        ticker: Symbol to search for, e.g. ``'BTC'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with the ``href`` value of every link on the results page that
        has one, unfiltered and in page order.
    """
    # Passing the query through ``params`` lets requests URL-encode it, so a
    # ticker containing characters such as '&' or a space cannot corrupt the
    # query string. Plain tickers yield the same URL as string formatting did.
    r = requests.get(
        "https://www.google.com/search",
        params={"q": "yahoo finance {}".format(ticker), "tbm": "nws"},
        timeout=timeout,
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those with
    # link['href'] would raise KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [12]:
# Run the news search once per monitored ticker: maps ticker -> list of raw hrefs.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'ADA': ['/?sa=X&ved=0ahUKEwim8Zzw3P_yAhVPNOwKHWzhDOQQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwim8Zzw3P_yAhVPNOwKHWzhDOQQPAgE',
  '/search?q=yahoo+finance+ADA&tbm=nws&ie=UTF-8&gbv=1&sei=oz5BYeaTEs_osAfswrOgDg',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwim8Zzw3P_yAhVPNOwKHWzhDOQQ_AUIBygA',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwim8Zzw3P_yAhVPNOwKHWzhDOQQ_AUICSgC',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwim8Zzw3P_yAhVPNOwKHWzhDOQQ_AUICigD',
  'https://maps.google.com/maps?q=yahoo+finance+ADA&um=1&ie=UTF-8&sa=X&ved=0ahUKEwim8Zzw3P_yAhVPNOwKHWzhDOQQ_AUICygE',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwim8Zzw3P_yAhVPNOwKHWzhDOQQ_AUIDCgF',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwim8Zzw3P_yAhVPNOwKHWzhDOQQ_AUIDSgG',
  '/advanced_search',
  '/search?q=yahoo+finance+ADA&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa

In [13]:
raw_urls['BTC']

['/?sa=X&ved=0ahUKEwjv0Obw3P_yAhWQDewKHb4UCfcQOwgC',
 '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwjv0Obw3P_yAhWQDewKHb4UCfcQPAgE',
 '/search?q=yahoo+finance+BTC&tbm=nws&ie=UTF-8&gbv=1&sei=pD5BYe_uHpCbsAe-qaS4Dw',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjv0Obw3P_yAhWQDewKHb4UCfcQ_AUIBygA',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjv0Obw3P_yAhWQDewKHb4UCfcQ_AUICSgC',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjv0Obw3P_yAhWQDewKHb4UCfcQ_AUICigD',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjv0Obw3P_yAhWQDewKHb4UCfcQ_AUICygE',
 'https://maps.google.com/maps?q=yahoo+finance+BTC&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjv0Obw3P_yAhWQDewKHb4UCfcQ_AUIDCgF',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjv0Obw3P_yAhWQDewKHb4UCfcQ_AUIDSgG',
 '/advanced_search',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwjv0O

## 4.2. Strip out unwanted URLs

In [14]:
import re

In [15]:
# Substrings that mark a link as unwanted; strip_unwanted_urls drops any URL containing one.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [16]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, de-duplicated article URLs from raw search-result hrefs.

    Args:
        urls: Iterable of href strings.
        exclude_list: Substrings; an href containing any of them is skipped.

    Returns:
        A list of unique URLs in the order they were first seen. The order is
        deterministic, so per-article results derived from this list stay
        aligned and reproducible between runs.
    """
    val = []
    for url in urls:
        # Only hrefs that embed an absolute https link are of interest.
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        # Take the text from the first http(s):// onwards and cut it at the
        # first '&', which drops the query parameters appended after the link.
        res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
        val.append(res)
    # dict.fromkeys removes duplicates while preserving insertion order
    # (set() would return the URLs in an arbitrary order).
    return list(dict.fromkeys(val))

In [17]:
# Filter each ticker's raw hrefs down to article URLs: maps ticker -> list of cleaned URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'ADA': ['https://finance.yahoo.com/news/cardano-ada-expected-pullback-came-170945648.html',
  'https://finance.yahoo.com/news/cardano-smart-contracts-hit-public-103919542.html',
  'https://finance.yahoo.com/news/bitcoin-price-cryptocurrencies-sweden-trading-stamps-ethereum-100929220.html',
  'https://finance.yahoo.com/news/cryptocurriencies-are-a-durable-asset-class-with-real-upside-black-rock-fixed-income-cio-160259991.html',
  'https://finance.yahoo.com/news/greed-one-first-launch-cardano-180100414.html',
  'https://finance.yahoo.com/news/undaunted-by-bitcoin-flash-crash-true-believers-say-its-the-greatest-time-to-get-into-crypto-214309569.html',
  'https://finance.yahoo.com/news/cardano-ada-expect-pullback-another-192023603.html',
  'https://finance.yahoo.com/news/what-augusts-record-breaking-month-for-crypto-flows-means-for-bitcoin-163906276.html',
  'https://finance.yahoo.com/news/cardano-hits-3-mark-first-155147241.html',
  'https://finance.yahoo.com/news/ada-down-7-today-despit

## 4.3. Search and Scrape Cleaned URLs

In [18]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return the leading words of its paragraph text.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated words kept per page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list with exactly one string per input URL, in the same order. A page
        without any <p> elements contributes an empty string, so positions in
        the result always line up with positions in ``URLs``.
    """
    ARTICLES = []
    for url in URLs:
        # A timeout stops one unresponsive site from stalling the whole run.
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        # Join all paragraphs, then keep only the first max_words words.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [19]:
# Scrape every cleaned URL: maps ticker -> list of article texts, in the same order as cleaned_urls[ticker].
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'ADA': ['A week and a half ago, see here, I preferred for Cardano (ADA) “… wave-b is now underway, which can target as high as $2.97-3.17 for what would then become an “irregular flat,” before (green) wave-c of (red) wave-iv brings the price back down to the ideal target zone of $2.50-2.20 once again.” Bingo. In the meantime, I had told my premium crypto trading members that a break below $2.70 would be bearish. Thus, using the Elliott Wave Principle (EWP) combined with Technical Analyses (TA), my preferred view was the correct one, and now I can look at the new pieces of the never-ending puzzle. Figure 1. ETH daily chart with EWP count and technical indicators. My preferred view of a larger, more complex, i.e., irregular flat 4th wave, remains, but with the drop to $2.04, I must acknowledge the possibility of another option. See Figure 1 above. I’ll get to that in a minute. As long as this week’s low holds, ADA can build on its (red) 5th wave to ideally $3.50 +/- 0.25. See Figure 1A.

In [20]:
articles['BTC'][2]

"Michael Saylor's MicroStrategy has purchased an additional 5,050 bitcoins for about $243 million. The business enterprise software maker's total bitcoin holdings were valued at $5.1 billion as of Monday's session. Bitcoin traded around $44,500 on Monday, a pullback after reclaiming the $50,000 level in August. See more stories on Insider's business page. MicroStrategy has made another big purchase of bitcoin, bringing the value of its holdings of the most traded cryptocurrency to about $5.1 billion. The company bought 5,050 for about $242.9 million in cash, representing an average price of $48,099 per coin, said CEO Michael Saylor in a message Monday on Twitter. Saylor and MicroStrategy, in a statement filed with the Securities and Exchange Commission, said the company now holds 114,042 bitcoins purchased for $3.16 billion at an average price of about $27,713 per bitcoin. Bitcoin traded around $44,500 during Monday's session, valuing the company's bitcoin holdings at $5.1 billion. Say

## 4.4. Summarize All Articles

In [21]:
def summarize(articles):
    """Summarize each article with the notebook's global tokenizer and model.

    Args:
        articles: Iterable of article strings.

    Returns:
        A list with one summary string per article, in input order.
    """
    summaries = []
    for article in articles:
        # truncation=True clips the token sequence to the tokenizer's maximum
        # length, so an over-long article cannot exceed what the model accepts.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # 5-beam search; the generated sequence is capped at max_length=55.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        # Decode the first returned sequence without special tokens.
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [22]:
# Summarize every scraped article: maps ticker -> list of summaries, in the same order as articles[ticker].
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'ADA': ['I preferred view of larger, more complex, 4th wave: EWP count and technical indicators.',
  'We are aware of the issue and are working to resolve it.',
  'Sweden’s central bank governor says ‘private money usually collapses’. Cardano is set to beat its all-time high by 2021',
  'Rieder says he owns ‘small pieces’ of the asset. Paulson recently told Bloomberg crypto will ‘eventually prove to be worthless’',
  'Greed Music is a crypto music label with Grammy Award winning producers, Cool & Dre.',
  '‘It’s the greatest time to enter the crypto market,’ says Bitwise’s Hougan. El Salvador’s new Bitcoin Law overshadowed events this week',
  'Is the recent low of $2.47 a 4th wave correction already over? Wave-b is now underway, targeting as high as $2.97-3.17',
  'Largest cryptocurrency, Bitcoin, breached $50,000 for the second time in two weeks. Solana, Dogecoin and Cardano all posted four-digit gains',
  'ADA has surpassed BNB to become third-largest cryptocurrency by market cap. 

In [23]:
summaries['BTC']

['Is Ravencoin a potential Alt Coin for the future?',
 'NFTs have entered into the market for the first time in 2021. Total Network Services, Corp founded in 2019 by blockchain innovator Thomas Carter',
 'MicroStrategy CEO Michael Saylor has been a vocal backer of the cryptocurrency. Bitcoin traded around $44,500 on Monday, a pullback',
 'Rieder says he owns ‘small pieces’ of the asset. Paulson recently told Bloomberg crypto will ‘eventually prove to be worthless’',
 'A fake press release said the U.S. retail giant would allow customers to pay in Litecoin.',
 'Bitcoin falls well short of first major resistance level at $46,745. Chainlink, ADA, and crypto total market cap all fall in early hours',
 'We are aware of the issue and are working to resolve it.',
 'Regulators are starting to take a hard look at crypto lending. Coinbase had been preparing to offer 4% on stablecoin USDC',
 'Fidelity executives met with SEC officials. Several firms are seeking approval for Bitcoin ETFs',
 'Cathi

# 5. Adding Sentiment Analysis

In [24]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the library's implicit
# default for 'sentiment-analysis', so the labels and scores do not change if
# that default changes in a later release.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
)

In [25]:
sentiment(summaries['ETH'])

[{'label': 'NEGATIVE', 'score': 0.9317285418510437},
 {'label': 'NEGATIVE', 'score': 0.9988366961479187},
 {'label': 'POSITIVE', 'score': 0.9076218605041504},
 {'label': 'NEGATIVE', 'score': 0.9995661973953247},
 {'label': 'NEGATIVE', 'score': 0.9450134038925171},
 {'label': 'NEGATIVE', 'score': 0.9426094889640808},
 {'label': 'NEGATIVE', 'score': 0.9992786645889282},
 {'label': 'POSITIVE', 'score': 0.9941855669021606},
 {'label': 'NEGATIVE', 'score': 0.6406136155128479},
 {'label': 'POSITIVE', 'score': 0.9979087710380554}]

In [26]:
# Score every summary: maps ticker -> list of {'label', 'score'} dicts, in the same order as summaries[ticker].
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'ADA': [{'label': 'NEGATIVE', 'score': 0.9294495582580566},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'NEGATIVE', 'score': 0.9995661973953247},
  {'label': 'NEGATIVE', 'score': 0.9993500113487244},
  {'label': 'NEGATIVE', 'score': 0.9760673642158508},
  {'label': 'POSITIVE', 'score': 0.9992268681526184},
  {'label': 'NEGATIVE', 'score': 0.9983871579170227},
  {'label': 'NEGATIVE', 'score': 0.6406136751174927},
  {'label': 'POSITIVE', 'score': 0.8167056441307068},
  {'label': 'NEGATIVE', 'score': 0.9996849298477173}],
 'ETH': [{'label': 'NEGATIVE', 'score': 0.9317285418510437},
  {'label': 'NEGATIVE', 'score': 0.9988366961479187},
  {'label': 'POSITIVE', 'score': 0.9076218605041504},
  {'label': 'NEGATIVE', 'score': 0.9995661973953247},
  {'label': 'NEGATIVE', 'score': 0.9450134038925171},
  {'label': 'NEGATIVE', 'score': 0.9426094889640808},
  {'label': 'NEGATIVE', 'score': 0.9992786645889282},
  {'label': 'POSITIVE', 'score': 0.9941855669021606},
  {'label': '

In [27]:
print(summaries['ETH'][3], scores['ETH'][3]['label'], scores['ETH'][3]['score'])

Sweden’s central bank governor says ‘private money usually collapses’. Cardano is set to beat its all-time high by 2021 NEGATIVE 0.9995661973953247


In [28]:
scores['ETH'][0]['score']

0.9317285418510437

# 6. Exporting Results to CSV

In [29]:
summaries

{'ADA': ['I preferred view of larger, more complex, 4th wave: EWP count and technical indicators.',
  'We are aware of the issue and are working to resolve it.',
  'Sweden’s central bank governor says ‘private money usually collapses’. Cardano is set to beat its all-time high by 2021',
  'Rieder says he owns ‘small pieces’ of the asset. Paulson recently told Bloomberg crypto will ‘eventually prove to be worthless’',
  'Greed Music is a crypto music label with Grammy Award winning producers, Cool & Dre.',
  '‘It’s the greatest time to enter the crypto market,’ says Bitwise’s Hougan. El Salvador’s new Bitcoin Law overshadowed events this week',
  'Is the recent low of $2.47 a 4th wave correction already over? Wave-b is now underway, targeting as high as $2.97-3.17',
  'Largest cryptocurrency, Bitcoin, breached $50,000 for the second time in two weeks. Solana, Dogecoin and Cardano all posted four-digit gains',
  'ADA has surpassed BNB to become third-largest cryptocurrency by market cap. 

In [30]:
scores

{'ADA': [{'label': 'NEGATIVE', 'score': 0.9294495582580566},
  {'label': 'POSITIVE', 'score': 0.9979087710380554},
  {'label': 'NEGATIVE', 'score': 0.9995661973953247},
  {'label': 'NEGATIVE', 'score': 0.9993500113487244},
  {'label': 'NEGATIVE', 'score': 0.9760673642158508},
  {'label': 'POSITIVE', 'score': 0.9992268681526184},
  {'label': 'NEGATIVE', 'score': 0.9983871579170227},
  {'label': 'NEGATIVE', 'score': 0.6406136751174927},
  {'label': 'POSITIVE', 'score': 0.8167056441307068},
  {'label': 'NEGATIVE', 'score': 0.9996849298477173}],
 'ETH': [{'label': 'NEGATIVE', 'score': 0.9317285418510437},
  {'label': 'NEGATIVE', 'score': 0.9988366961479187},
  {'label': 'POSITIVE', 'score': 0.9076218605041504},
  {'label': 'NEGATIVE', 'score': 0.9995661973953247},
  {'label': 'NEGATIVE', 'score': 0.9450134038925171},
  {'label': 'NEGATIVE', 'score': 0.9426094889640808},
  {'label': 'NEGATIVE', 'score': 0.9992786645889282},
  {'label': 'POSITIVE', 'score': 0.9941855669021606},
  {'label': '

In [31]:
cleaned_urls

{'ADA': ['https://finance.yahoo.com/news/cardano-ada-expected-pullback-came-170945648.html',
  'https://finance.yahoo.com/news/cardano-smart-contracts-hit-public-103919542.html',
  'https://finance.yahoo.com/news/bitcoin-price-cryptocurrencies-sweden-trading-stamps-ethereum-100929220.html',
  'https://finance.yahoo.com/news/cryptocurriencies-are-a-durable-asset-class-with-real-upside-black-rock-fixed-income-cio-160259991.html',
  'https://finance.yahoo.com/news/greed-one-first-launch-cardano-180100414.html',
  'https://finance.yahoo.com/news/undaunted-by-bitcoin-flash-crash-true-believers-say-its-the-greatest-time-to-get-into-crypto-214309569.html',
  'https://finance.yahoo.com/news/cardano-ada-expect-pullback-another-192023603.html',
  'https://finance.yahoo.com/news/what-augusts-record-breaking-month-for-crypto-flows-means-for-bitcoin-163906276.html',
  'https://finance.yahoo.com/news/cardano-hits-3-mark-first-155147241.html',
  'https://finance.yahoo.com/news/ada-down-7-today-despit

In [32]:
range(len(summaries['ETH']))

range(0, 10)

In [33]:
summaries['ETH'][3]

'Sweden’s central bank governor says ‘private money usually collapses’. Cardano is set to beat its all-time high by 2021'

In [34]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment scores and URLs into rows.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with 'label' and 'score'
            keys, positionally aligned with ``summaries[ticker]``.
        urls: Dict mapping ticker -> list of URLs, positionally aligned with
            ``summaries[ticker]``.

    Returns:
        A list of ``[ticker, summary, label, score, url]`` rows, grouped by
        ticker in the key order of ``summaries``.
    """
    output = []
    # Iterate over the tickers actually present in the arguments rather than a
    # notebook-level global, so the function depends only on what it is given.
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        for counter, summary in enumerate(ticker_summaries):
            score = ticker_scores[counter]
            output.append([
                ticker,
                summary,
                score['label'],
                score['score'],
                ticker_urls[counter],
            ])
    return output

In [35]:
# Combine summaries, sentiment scores and source URLs into one list of rows.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['ADA',
  'I preferred view of larger, more complex, 4th wave: EWP count and technical indicators.',
  'NEGATIVE',
  0.9294495582580566,
  'https://finance.yahoo.com/news/cardano-ada-expected-pullback-came-170945648.html'],
 ['ADA',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979087710380554,
  'https://finance.yahoo.com/news/cardano-smart-contracts-hit-public-103919542.html'],
 ['ADA',
  'Sweden’s central bank governor says ‘private money usually collapses’. Cardano is set to beat its all-time high by 2021',
  'NEGATIVE',
  0.9995661973953247,
  'https://finance.yahoo.com/news/bitcoin-price-cryptocurrencies-sweden-trading-stamps-ethereum-100929220.html'],
 ['ADA',
  'Rieder says he owns ‘small pieces’ of the asset. Paulson recently told Bloomberg crypto will ‘eventually prove to be worthless’',
  'NEGATIVE',
  0.9993500113487244,
  'https://finance.yahoo.com/news/cryptocurriencies-are-a-durable-asset-class-with-real-upside-black-rock-fixed-income-

In [36]:
# Prepend the column header row. The guard makes the cell safe to re-run:
# without it, each extra execution would insert another copy of the header.
header_row = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != header_row:
    final_output.insert(0, header_row)

In [37]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['ADA',
  'I preferred view of larger, more complex, 4th wave: EWP count and technical indicators.',
  'NEGATIVE',
  0.9294495582580566,
  'https://finance.yahoo.com/news/cardano-ada-expected-pullback-came-170945648.html'],
 ['ADA',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979087710380554,
  'https://finance.yahoo.com/news/cardano-smart-contracts-hit-public-103919542.html'],
 ['ADA',
  'Sweden’s central bank governor says ‘private money usually collapses’. Cardano is set to beat its all-time high by 2021',
  'NEGATIVE',
  0.9995661973953247,
  'https://finance.yahoo.com/news/bitcoin-price-cryptocurrencies-sweden-trading-stamps-ethereum-100929220.html'],
 ['ADA',
  'Rieder says he owns ‘small pieces’ of the asset. Paulson recently told Bloomberg crypto will ‘eventually prove to be worthless’',
  'NEGATIVE',
  0.9993500113487244,
  'https://finance.yahoo.com/news/cryptocurriencies-are-a-durable

In [38]:
import csv
# Write every row of final_output to assetsummaries.csv. The encoding is set
# explicitly because the summaries contain non-ASCII characters (curly quotes,
# currency symbols); the platform default encoding may not be able to write
# them. newline='' lets the csv module control line endings itself.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)