# 1. Install and Import Baseline Dependencies

In [22]:
!pip install transformers



In [23]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [24]:
# Checkpoint identifier shared by the tokenizer and the model below.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# `tokenizer` and `model` are module-level globals used by the later
# summarization cells and by summarize().
# NOTE(review): from_pretrained presumably downloads the checkpoint on first
# use, so this cell needs network access unless the files are cached - confirm.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# 3. Summarize a Single Article

In [25]:
# Download a single article and collect every <p> element on the page.
url = "https://au.finance.yahoo.com/news/china-restricting-tesla-use-uncovers-a-significant-challenge-for-elon-musk-expert-161921664.html"
# requests has no default timeout; without one a stalled server would hang
# this cell indefinitely.
r = requests.get(url, timeout=30)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [26]:
paragraphs[0].text

"Stay informed every day with Yahoo Finance's free Fully Briefed newsletter."

In [27]:
# Flatten the scraped <p> elements into one string capped at 400 words.
text = [paragraph.text for paragraph in paragraphs]
# split(' ') breaks on single spaces only, so newlines or tabs inside a
# paragraph stay attached to their neighbouring word.
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [28]:
ARTICLE

'Stay informed every day with Yahoo Finance\'s free Fully Briefed newsletter. Renewed political tensions between the U.S. and China — which came to light this week as the Biden administration sat down with their Chinese counterparts for the first time to discuss a range of issues — could ensnarl vehicle maker Tesla (TSLA), which has pushed successfully into China in recent years. In fact, the heightened rhetoric between the two super economic superpowers may have already had blowback on Elon Musk\'s electric car company. The Chinese government is restricting the use of Tesla vehicles by military staff and employees of vital state-owned companies, The Wall Street Journal reported Friday. Chinese officials reportedly have concerns that Tesla\'s cars — outfitted with various data collecting capabilities — could serve as leakers of national security secrets. "I have been saying for months now that Tesla\'s level of integration of their business, of their research, of their sort of geo-trac

In [29]:
# Encode the article, generate a summary with beam search, and decode it.
# truncation=True clips the token sequence to the tokenizer's configured
# maximum length; 400 words can tokenize to more tokens than the model accepts.
# NOTE(review): confirm the checkpoint's tokenizer config sets model_max_length.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
# Beam search over 5 beams; the generated summary is capped at 55 tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [30]:
summary

'China restricting use of electric cars by military. Tesla has been successful in China in recent years'

# 4. Building a News and Sentiment Pipeline

In [31]:
# Symbols to monitor. Each one is used verbatim as the search query and as the
# key of every per-ticker dict built below (raw_urls, cleaned_urls, articles,
# summaries, scores).
monitored_tickers = ['DOGE', 'ETH', 'BTC']

## 4.1. Search for Stock News using Google and Yahoo Finance

In [32]:
def search_for_stock_news_urls(ticker, timeout=30):
    """Return the href of every link on the Google News results page for `ticker`.

    Args:
        ticker: Search term, e.g. 'BTC'. It is passed through `params`, so
            spaces, '&' and other reserved characters are URL-encoded.
        timeout: Seconds to wait for the HTTP response before requests raises.

    Returns:
        A list of raw href strings in page order. This includes Google's own
        navigation links; filter them with strip_unwanted_urls().
    """
    r = requests.get(
        "https://www.google.com/search",
        params={"q": ticker, "tbm": "nws"},
        timeout=timeout,
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those with
    # link['href'] would raise KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [33]:
# One search request per monitored ticker; maps ticker -> list of raw hrefs.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'DOGE': ['/?sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8QOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8QPAgE',
  '/search?q=DOGE&tbm=nws&ie=UTF-8&gbv=1&sei=JPuOYLrmBv655OUP8uS5-AM',
  '/search?q=DOGE&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8Q_AUIBygA',
  '/search?q=DOGE&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8Q_AUICSgC',
  '/search?q=DOGE&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8Q_AUICigD',
  'https://maps.google.com/maps?q=DOGE&um=1&ie=UTF-8&sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8Q_AUICygE',
  '/search?q=DOGE&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8Q_AUIDCgF',
  '/search?q=DOGE&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8Q_AUIDSgG',
  '/advanced_search',
  '/search?q=DOGE&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwi6iO-826vwAhX-HLkGHXJyDj8QpwUIDw',
  '/search?q=DOGE&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr

In [34]:
raw_urls['BTC']

['/?sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8QOwgC',
 '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8QPAgE',
 '/search?q=BTC&tbm=nws&ie=UTF-8&gbv=1&sei=JfuOYMLyKdGz5OUPze6m-AU',
 '/search?q=BTC&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8Q_AUIBygA',
 '/search?q=BTC&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8Q_AUICSgC',
 '/search?q=BTC&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8Q_AUICigD',
 '/search?q=BTC&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8Q_AUICygE',
 'https://maps.google.com/maps?q=BTC&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8Q_AUIDCgF',
 '/search?q=BTC&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8Q_AUIDSgG',
 '/advanced_search',
 '/search?q=BTC&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwiCmc-926vwAhXRGbkGHU23CV8QpwUIDw',
 '/search?q=BTC&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:d&sa=X&ved=0ahUKEwiCmc-926vw

## 4.2. Strip out unwanted URLs

In [35]:
import re

In [36]:
# Substrings that disqualify a URL in strip_unwanted_urls(). Matching is a
# plain `in` test against the whole URL, not just the host name.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [37]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract the unique article URLs from a list of raw hrefs.

    An href is kept only if it contains 'https://' and none of the substrings
    in `exclude_list`. The first http(s) URL embedded in the href is taken and
    everything from the first '&' onwards is dropped.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings that disqualify an href.

    Returns:
        A list of unique URLs in first-seen order. The order is deterministic
        because later cells pair these URLs with summaries by position.
    """
    val = []
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
        val.append(res)
    # dict.fromkeys de-duplicates while keeping insertion order; list(set(...))
    # would return a different order from one kernel run to the next.
    return list(dict.fromkeys(val))

In [38]:
# Maps ticker -> filtered article URLs. create_output_array() later indexes
# these lists by position, so their order has to stay aligned with the
# articles and summaries derived from them.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'DOGE': ['https://www.deseret.com/2021/5/1/22411726/dogecoin-mark-cuban-investment-prediction',
  'https://www.fool.com/investing/2021/04/29/5-reasons-to-avoid-dogecoin-like-the-plague/',
  'https://www.deseret.com/u-s-world/2021/4/29/22409512/dogecoin-all-time-high-record',
  'https://finance.yahoo.com/news/dogecoin-dead-elon-musk-mark-194114001.html',
  'https://www.forbes.com/sites/billybambrough/2021/05/01/ethereum-is-suddenly-rocketing-but-dogecoin-is-still-the-crypto-price-king/',
  'https://www.coindesk.com/dogecoin-federal-reserve-chairman-jerome-powell-capital-markets',
  'https://www.benzinga.com/markets/cryptocurrency/21/05/20904133/how-is-doge-looking-a-week-away-from-elon-musks-may-8-saturday-night-live-appearance',
  'https://markets.businessinsider.com/currencies/news/mike-novogratz-xrp-ripple-dogecoin-bitcoin-insurance-crypto-investments-2021-4-1030373300',
  'https://www.deseret.com/u-s-world/2021/4/30/22409494/dogecoin-stable-investment-price',
  'https://www.benzing

## 4.3. Search and Scrape Cleaned URLs

In [39]:
def scrape_and_process(URLs, max_words=350, timeout=30):
    """Download each URL and return the leading words of its paragraph text.

    Args:
        URLs: Iterable of article URLs.
        max_words: Maximum number of space-separated words kept per article.
        timeout: Seconds to wait for each HTTP response before requests raises.

    Returns:
        A list with one text per URL, in the same order as `URLs`.
    """
    ARTICLES = []
    for url in URLs:
        # Without a timeout one stalled site would hang the whole pipeline.
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        # split(' ') breaks on single spaces only, as the single-article
        # cell does.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [40]:
# Maps ticker -> article texts, one per entry of cleaned_urls[ticker] and in
# the same order. This issues one HTTP request per URL.
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'DOGE': ['Filed under: Dallas Mavericks owner Mark Cuban promoted Dogecoin and other cryptocurrencies on ‘Ellen’ this week  Dallas Mavericks owner Mark Cuban made an appearance on “Ellen” — the daytime talk show with Ellen DeGeneres — and chatted about Dogecoin. Cuban — when asked by DeGeneres about cryptocurrencies — decided to explain the importance of Dogecoin. He said his 11-year-old son, Jake, is involved in Dogecoin, buying about $30 worth. \n    Related\n   Cuban said the Dallas Mavericks accept Dogecoin — which started as an internet meme — in their team shop, too, according to CoinDesk. Cuban’s recent comments might be a good thing for Dogecoin owners, as his previous comments led to a spike in value, as I explained for the Deseret News. Cuban previously celebrated Dogecoin on Twitter. He said the cryptocurrency’s biggest issue is not being able to withdraw any of your Dogecoin holdings from the Robinhood app. Robinhood CEO Vlad Tenev said he will try to change the Robinhood 

In [41]:
articles['BTC'][2]

'CryptoQuant chief Ki Young Ju is unveiling how Bitcoin whales are playing the market as BTC consolidates between $50,000 and $60,000. In a new tweet, the head of the on-chain analytics platform says that he’s witnessing large Bitcoin transfers that are likely happening over the counter (OTC). “Whales have been accumulating BTC. Massive Bitcoins have transferred, but these transactions are not from exchanges. Possibly OTC deals. NVT golden cross remains very low, meaning transaction volume is big compared to the market cap.” According to CryptoQuant, the NVT (Network to Value Transactions) golden cross is a signal that indicates the formation of a local top or bottom. A value below -1.6 suggests that Bitcoin may be poised for a strong leg up. At the end of April, the NVT golden cross metric had a value of -2.95. “NVT Golden Cross indicator hit a year low. BTC has upside potential from the perspective of the NVT valuation model.” In addition, Ki highlights that whales are transferring B

## 4.4. Summarize All Articles

In [44]:
def summarize(articles):
    """Summarize each article with the module-level `tokenizer` and `model`.

    Args:
        articles: Iterable of article texts.

    Returns:
        A list of summary strings, one per article and in the same order.
    """
    summaries = []
    for article in articles:
        # truncation=True clips the token sequence to the tokenizer's
        # configured maximum length, so a long article cannot exceed what the
        # model accepts.
        # NOTE(review): confirm the checkpoint sets model_max_length.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # Beam search over 5 beams; each summary is capped at 55 tokens.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [46]:
# Maps ticker -> summaries, one per entry of articles[ticker] and in the same
# order. The model runs once per article.
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'DOGE': ['Mark Cuban promoted cryptocurrency on ‘Ellen’. Cuban’s previous comments led to a spike in value',
  'Digital currency has more than doubled in value in a matter of months.',
  'Analysts say investing too much into cryptocurrency could be a problem. Musk, Cuban have been tweeting about it',
  'Coin’s value has more than doubled since the start of the year. Tech innovations have made Dogecoin a serious money-maker',
  'Dogecoin is adding almost 50% to its price this week. Ethereum is up more than three-fold from the start of the year',
  '"You are seeing things that are a bit frothy, that’s a fact," said Powell. Fed chief upbeat on U.S. economic recovery, well-capitalized banks',
  'Largest cryptocurrency has gained over 7,500% on a year-to-date basis.',
  'Founder of Galaxy Digital says crypto will make up 2% of global wealth.',
  '‘Some of the asset prices are high,’ Fed chair says. ‘Overall financial stability picture is mixed,’ he says',
  'Shares of Tesla could see a rea

In [47]:
summaries['BTC']

['Check the anti-virus settings on your device.',
 'The 90-year-old ‘Oracle of Omaha’ called the cryptocurrency ‘disgusting’. Buffett says bringing up the currency is like ‘waving the red flag at the bull',
 'Analytics platform CryptoQuant sees ‘massive’ Bitcoin transfers. Golden cross indicator hit a year low at the end of April',
 '"I don\'t welcome a currency that\'s so useful to kidnappers and extortionists,\'" he says.',
 'A look at some of the biggest winners and losers in the stock market this year.',
 'Ethereum has been gaining ground in recent weeks.',
 'SEC delayed rendering decision on VanEck fund this week',
 '‘Of course I hate the bitcoin success,’ Vice Chairman says. Buffett’s company has no exposure to cryptocurrency',
 'This is part 3 of a 12-part series. All quotes inspired by Satoshi Nakamoto',
 'FundStrat sets a $10K price target on the cryptocurrency. Ethereum is up about 40% for the month to date versus 5% decline for bitcoin']

# 5. Adding Sentiment Analysis

In [48]:
from transformers import pipeline
# Pin the checkpoint explicitly. With no `model` argument the pipeline uses
# whatever default the installed transformers version ships, so labels and
# scores could change between environments.
# NOTE(review): this is a general-purpose SST-2 classifier, not a
# finance-specific one - treat the labels as a rough signal.
sentiment = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [49]:
sentiment(summaries['ETH'])

[{'label': 'NEGATIVE', 'score': 0.8904660940170288},
 {'label': 'NEGATIVE', 'score': 0.9969916343688965},
 {'label': 'POSITIVE', 'score': 0.9822368621826172},
 {'label': 'NEGATIVE', 'score': 0.9882952570915222},
 {'label': 'POSITIVE', 'score': 0.9987367391586304},
 {'label': 'NEGATIVE', 'score': 0.9977512955665588},
 {'label': 'NEGATIVE', 'score': 0.9923267960548401},
 {'label': 'NEGATIVE', 'score': 0.9682729244232178},
 {'label': 'NEGATIVE', 'score': 0.9958792924880981},
 {'label': 'NEGATIVE', 'score': 0.9975712895393372}]

In [50]:
# Maps ticker -> list of {'label', 'score'} dicts, one per summary and in the
# same order as summaries[ticker].
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'DOGE': [{'label': 'POSITIVE', 'score': 0.9981436729431152},
  {'label': 'NEGATIVE', 'score': 0.9414214491844177},
  {'label': 'NEGATIVE', 'score': 0.9991558194160461},
  {'label': 'POSITIVE', 'score': 0.9993341565132141},
  {'label': 'POSITIVE', 'score': 0.9822368621826172},
  {'label': 'POSITIVE', 'score': 0.9920030832290649},
  {'label': 'POSITIVE', 'score': 0.9810976386070251},
  {'label': 'POSITIVE', 'score': 0.9948969483375549},
  {'label': 'NEGATIVE', 'score': 0.9983652234077454},
  {'label': 'POSITIVE', 'score': 0.9751201272010803}],
 'ETH': [{'label': 'NEGATIVE', 'score': 0.8904660940170288},
  {'label': 'NEGATIVE', 'score': 0.9969916343688965},
  {'label': 'POSITIVE', 'score': 0.9822368621826172},
  {'label': 'NEGATIVE', 'score': 0.9882952570915222},
  {'label': 'POSITIVE', 'score': 0.9987367391586304},
  {'label': 'NEGATIVE', 'score': 0.9977512955665588},
  {'label': 'NEGATIVE', 'score': 0.9923267960548401},
  {'label': 'NEGATIVE', 'score': 0.9682729244232178},
  {'label': 

In [51]:
print(summaries['ETH'][3], scores['ETH'][3]['label'], scores['ETH'][3]['score'])

BNB comes close to hitting an all-time high today, is up 20% from last week’s crash. BSC-based exchange transacted more than four DEXs on Ethereum NEGATIVE 0.9882952570915222


In [52]:
scores['ETH'][0]['score']

0.8904660940170288

# 6. Exporting Results to CSV

In [53]:
summaries

{'DOGE': ['Mark Cuban promoted cryptocurrency on ‘Ellen’. Cuban’s previous comments led to a spike in value',
  'Digital currency has more than doubled in value in a matter of months.',
  'Analysts say investing too much into cryptocurrency could be a problem. Musk, Cuban have been tweeting about it',
  'Coin’s value has more than doubled since the start of the year. Tech innovations have made Dogecoin a serious money-maker',
  'Dogecoin is adding almost 50% to its price this week. Ethereum is up more than three-fold from the start of the year',
  '"You are seeing things that are a bit frothy, that’s a fact," said Powell. Fed chief upbeat on U.S. economic recovery, well-capitalized banks',
  'Largest cryptocurrency has gained over 7,500% on a year-to-date basis.',
  'Founder of Galaxy Digital says crypto will make up 2% of global wealth.',
  '‘Some of the asset prices are high,’ Fed chair says. ‘Overall financial stability picture is mixed,’ he says',
  'Shares of Tesla could see a rea

In [54]:
scores

{'DOGE': [{'label': 'POSITIVE', 'score': 0.9981436729431152},
  {'label': 'NEGATIVE', 'score': 0.9414214491844177},
  {'label': 'NEGATIVE', 'score': 0.9991558194160461},
  {'label': 'POSITIVE', 'score': 0.9993341565132141},
  {'label': 'POSITIVE', 'score': 0.9822368621826172},
  {'label': 'POSITIVE', 'score': 0.9920030832290649},
  {'label': 'POSITIVE', 'score': 0.9810976386070251},
  {'label': 'POSITIVE', 'score': 0.9948969483375549},
  {'label': 'NEGATIVE', 'score': 0.9983652234077454},
  {'label': 'POSITIVE', 'score': 0.9751201272010803}],
 'ETH': [{'label': 'NEGATIVE', 'score': 0.8904660940170288},
  {'label': 'NEGATIVE', 'score': 0.9969916343688965},
  {'label': 'POSITIVE', 'score': 0.9822368621826172},
  {'label': 'NEGATIVE', 'score': 0.9882952570915222},
  {'label': 'POSITIVE', 'score': 0.9987367391586304},
  {'label': 'NEGATIVE', 'score': 0.9977512955665588},
  {'label': 'NEGATIVE', 'score': 0.9923267960548401},
  {'label': 'NEGATIVE', 'score': 0.9682729244232178},
  {'label': 

In [55]:
cleaned_urls

{'DOGE': ['https://www.deseret.com/2021/5/1/22411726/dogecoin-mark-cuban-investment-prediction',
  'https://www.fool.com/investing/2021/04/29/5-reasons-to-avoid-dogecoin-like-the-plague/',
  'https://www.deseret.com/u-s-world/2021/4/29/22409512/dogecoin-all-time-high-record',
  'https://finance.yahoo.com/news/dogecoin-dead-elon-musk-mark-194114001.html',
  'https://www.forbes.com/sites/billybambrough/2021/05/01/ethereum-is-suddenly-rocketing-but-dogecoin-is-still-the-crypto-price-king/',
  'https://www.coindesk.com/dogecoin-federal-reserve-chairman-jerome-powell-capital-markets',
  'https://www.benzinga.com/markets/cryptocurrency/21/05/20904133/how-is-doge-looking-a-week-away-from-elon-musks-may-8-saturday-night-live-appearance',
  'https://markets.businessinsider.com/currencies/news/mike-novogratz-xrp-ripple-dogecoin-bitcoin-insurance-crypto-investments-2021-4-1030373300',
  'https://www.deseret.com/u-s-world/2021/4/30/22409494/dogecoin-stable-investment-price',
  'https://www.benzing

In [56]:
range(len(summaries['ETH']))

range(0, 10)

In [57]:
summaries['ETH'][3]

'BNB comes close to hitting an all-time high today, is up 20% from last week’s crash. BSC-based exchange transacted more than four DEXs on Ethereum'

In [58]:
def create_output_array(summaries, scores, urls):
    """Flatten the per-ticker results into rows ready for CSV export.

    Args:
        summaries: Maps ticker -> list of summary strings.
        scores: Maps ticker -> list of dicts with 'label' and 'score' keys,
            aligned by position with `summaries[ticker]`.
        urls: Maps ticker -> list of source URLs, aligned by position with
            `summaries[ticker]`.

    Returns:
        A list of [ticker, summary, label, score, url] rows, grouped by ticker
        in the iteration order of `summaries`.

    Raises:
        KeyError: If a ticker in `summaries` is missing from `scores` or `urls`.
        IndexError: If `scores[ticker]` or `urls[ticker]` is shorter than
            `summaries[ticker]`.
    """
    output = []
    # Iterate the keys of the `summaries` argument rather than the global
    # `monitored_tickers`, so the function depends only on what it is given.
    for ticker in summaries:
        for counter, summary_text in enumerate(summaries[ticker]):
            sentiment_result = scores[ticker][counter]
            output.append([
                ticker,
                summary_text,
                sentiment_result['label'],
                sentiment_result['score'],
                urls[ticker][counter],
            ])
    return output

In [59]:
# One row per summary: [ticker, summary, label, score, url].
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['DOGE',
  'Mark Cuban promoted cryptocurrency on ‘Ellen’. Cuban’s previous comments led to a spike in value',
  'POSITIVE',
  0.9981436729431152,
  'https://www.deseret.com/2021/5/1/22411726/dogecoin-mark-cuban-investment-prediction'],
 ['DOGE',
  'Digital currency has more than doubled in value in a matter of months.',
  'NEGATIVE',
  0.9414214491844177,
  'https://www.fool.com/investing/2021/04/29/5-reasons-to-avoid-dogecoin-like-the-plague/'],
 ['DOGE',
  'Analysts say investing too much into cryptocurrency could be a problem. Musk, Cuban have been tweeting about it',
  'NEGATIVE',
  0.9991558194160461,
  'https://www.deseret.com/u-s-world/2021/4/29/22409512/dogecoin-all-time-high-record'],
 ['DOGE',
  'Coin’s value has more than doubled since the start of the year. Tech innovations have made Dogecoin a serious money-maker',
  'POSITIVE',
  0.9993341565132141,
  'https://finance.yahoo.com/news/dogecoin-dead-elon-musk-mark-194114001.html'],
 ['DOGE',
  'Dogecoin is adding almost 50

In [60]:
# Prepend the CSV header row. The guard makes the cell idempotent, so
# re-running it does not stack duplicate header rows.
csv_header = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if final_output[:1] != [csv_header]:
    final_output.insert(0, csv_header)

In [61]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['DOGE',
  'Mark Cuban promoted cryptocurrency on ‘Ellen’. Cuban’s previous comments led to a spike in value',
  'POSITIVE',
  0.9981436729431152,
  'https://www.deseret.com/2021/5/1/22411726/dogecoin-mark-cuban-investment-prediction'],
 ['DOGE',
  'Digital currency has more than doubled in value in a matter of months.',
  'NEGATIVE',
  0.9414214491844177,
  'https://www.fool.com/investing/2021/04/29/5-reasons-to-avoid-dogecoin-like-the-plague/'],
 ['DOGE',
  'Analysts say investing too much into cryptocurrency could be a problem. Musk, Cuban have been tweeting about it',
  'NEGATIVE',
  0.9991558194160461,
  'https://www.deseret.com/u-s-world/2021/4/29/22409512/dogecoin-all-time-high-record'],
 ['DOGE',
  'Coin’s value has more than doubled since the start of the year. Tech innovations have made Dogecoin a serious money-maker',
  'POSITIVE',
  0.9993341565132141,
  'https://finance.yahoo.com/news/dogecoin-dead-elon-musk-mark-19411

In [62]:
import csv
# Write all rows (header first) to assetsummaries.csv.
# encoding='utf-8' is explicit because the summaries contain non-ASCII
# punctuation such as curly quotes, and the platform default encoding may not
# be able to represent every character. newline='' lets the csv module
# control line endings.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)