### 1. Install and import Baseline Dependencies

In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

### 2. Setup Summarization Model

In [2]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

### 3. Summarize a Single Article

In [3]:
url = "https://finance.yahoo.com/news/apple-kicks-off-four-part-120513895.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [4]:
paragraphs[0]

<p>(Bloomberg) -- Apple Inc. tapped the US high-grade bond market Monday with a $5.5 billion sale in four parts.</p>

In [5]:
paragraphs[0].text

'(Bloomberg) -- Apple Inc. tapped the US high-grade bond market Monday with a $5.5 billion sale in four parts.'

In [6]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400] # We are grabbing the first 400 words
ARTICLE = ' '.join(words)

In [7]:
ARTICLE

'(Bloomberg) -- Apple Inc. tapped the US high-grade bond market Monday with a $5.5 billion sale in four parts. Most Read from Bloomberg You Won’t Like What Comes After Inflation Pelosi Is Expected to Visit Taiwan, Ramping Up US-China Tensions Manchin Spending Deal Includes Billions in Oil Import Taxes Biden Team Tries to Blunt China Rage as Pelosi Heads for Taiwan The longest portion of the offering, a 40-year security, yields 118 basis points over US Treasuries, down from initial price discussions in the 150 basis points range, according to people familiar with the deal. The order book for the sale peaked at more than $23 billion, a person with knowledge of the demand said. Proceeds from the bond sale are earmarked for general corporate purposes, including the financing of share buybacks and dividends, said the people, who asked not to be identified as the details are private. Read more: IG ANALYSIS: Apple, UBS Headline Calendar; Edison Struggles The sale comes after the primary marke

In [8]:
input_ids = tokenizer.encode(ARTICLE , return_tensors = 'pt') # pt - pytorch tensors
output = model.generate(input_ids , max_length = 55, num_beams = 5, early_stopping = True) # Number of beams for beam search
# The beam search algorithm selects multiple alternatives for an input sequence at each time-step based on conditional probability.
summmary = tokenizer.decode(output[0] , skip_special_tokens = True)

In [9]:
summmary

'The iPhone maker sold $4 billion of 10-year notes, $1 billion of 30-year bonds.'

### 4. Building a News and Sentiment Pipeline

In [10]:
monitored_tickers = ['GME', 'TSLA', 'BTC']

### 4.1 Search for Stock News using Google and Yahoo Finance

In [11]:
def search_for_stocks_news_url(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text , "html.parser")
    atags = soup.find_all('a')
    # a tags represent a link
    hrefs = [link['href'] for link in atags]
    return hrefs

In [12]:
raw_urls = {ticker: search_for_stocks_news_url(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwiRlsmQ4Kn5AhWMilwKHY5BCWsQOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=aO7pYpHCH4yV8gKOg6XYBg',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiRlsmQ4Kn5AhWMilwKHY5BCWsQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiRlsmQ4Kn5AhWMilwKHY5BCWsQ_AUIBygC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiRlsmQ4Kn5AhWMilwKHY5BCWsQ_AUICCgD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiRlsmQ4Kn5AhWMilwKHY5BCWsQ_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiRlsmQ4Kn5AhWMilwKHY5BCWsQ_AUICigF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiRlsmQ4Kn5AhWMilwKHY5BCWsQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwiRlsmQ4Kn5AhWMilwKHY5BCWsQpwUIDQ',
  '/search?q=yahoo+finance+GME&ie=UT

### 4.2 Strip out unwanted URLs

In [13]:
import re

In [14]:
exclude_list = ['maps' , 'polices', 'preferences', 'accounts', 'support']

In [15]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [16]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://policies.google.com/terms?hl=en-IN',
  'https://finance.yahoo.com/news/dow-futures-little-changed-p-190803222.html',
  'https://finance.yahoo.com/news/gamestop-seagen-bed-bath-beyond-080608920.html',
  'https://finance.yahoo.com/news/stock-market-news-live-updates-july-8-2022-114209262.html',
  'https://policies.google.com/privacy?hl=en-IN',
  'https://finance.yahoo.com/news/gamestop-launches-nft-marketplace-202000168.html',
  'https://finance.yahoo.com/news/dow-futures-down-20-pts-065453693.html',
  'https://finance.yahoo.com/news/twitter-gamestop-upstart-fall-premarket-075217111.html',
  'https://finance.yahoo.com/news/10-stocks-making-headlines-tuesday-213626352.html',
  'https://finance.yahoo.com/news/p-500-climbs-energy-rebound-155302286.html',
  'https://finance.yahoo.com/news/midday-movers-upstart-spirit-gamestop-134648988.html',
  'https://finance.yahoo.com/news/stock-market-today-p-surges-201337916.html'],
 'TSLA': ['https://policies.google.com/terms?hl=en-IN

### 4.3 Search and Scrape Cleaned URLs

In [17]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [18]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ["Effective 5 January 2022 | Archived versions | Download PDF Country version: India These Terms of Service reflect the way that Google’s business works, the laws that apply to our company, and certain things that we’ve always believed to be true. As a result, these Terms of Service help define Google’s relationship with you as you interact with our services. For example, these terms include the following topic headings: Understanding these terms is important because, by using our services, you’re agreeing to these terms. Besides these terms, we also publish a Privacy Policy. Although it’s not part of these terms, we encourage you to read it to better understand how you can update, manage, export and delete your information. Google services are provided by, and you’re contracting with: Google LLCorganised under the laws of the State of Delaware, USA, and operating under the laws of the USA1600 Amphitheatre ParkwayMountain View, California 94043USA If you’re under the age requir

### 4.4 Summarise all Articles

In [19]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [20]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Terms of Service apply to all of our products and services.',
  'Fed officials said another 50 or 75 basis point move would ‘likely be appropriate’.',
  'Merck said to be in advanced talks to buy Seagen. Shell, Bed Bath & Beyond, Virgin Galactic among companies in focus',
  'U.S. economy added 372,000 jobs last month. Unemployment rate held steady at 3.6% in June',
  'We collect information to provide better services to all our users.',
  'Non-fungible token marketplace allows gamers, creators, collectors and other community members to buy, sell and trade NFTs.',
  'Weekly take on events in the world economy and their fallout.',
  'Tesla, Spirit, Upstart, XPO Logistics among day’s gainers. Twitter, GameStop, Levi Strauss among day’s gainers',
  'ServiceNow, Microsoft and Delta Air Lines make big moves today.',
  'Weekly crude inventories unexpectedly rose. Jobless claims rise to highest since January',
  'We are aware of the issue and are working to resolve it.',
  'Weekly jo

### 5. Adding Sentiment Analysis

In [23]:
import flair 
sentiment_model = flair.models.TextClassifier.load('en-sentiment')

2022-08-03 09:29:37,836 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to C:\Users\LENOVO\AppData\Local\Temp\tmpvnpw02z5


100%|██████████| 265512723/265512723 [00:23<00:00, 11197332.54B/s]

2022-08-03 09:30:02,281 copying C:\Users\LENOVO\AppData\Local\Temp\tmpvnpw02z5 to cache at C:\Users\LENOVO\.flair\models\sentiment-en-mix-distillbert_4.pt





2022-08-03 09:30:08,898 removing temp file C:\Users\LENOVO\AppData\Local\Temp\tmpvnpw02z5
2022-08-03 09:30:09,468 loading file C:\Users\LENOVO\.flair\models\sentiment-en-mix-distillbert_4.pt


Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 9.25kB/s]
Downloading: 100%|██████████| 483/483 [00:00<00:00, 242kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 260kB/s]  
Downloading: 100%|██████████| 455k/455k [00:01<00:00, 422kB/s]  


In [31]:
text = "Tesla stock has fallen by 10.5% over this weekend"

In [32]:
sentence = flair.data.Sentence(text)
sentiment_model.predict(sentence)

In [33]:
sentence

Sentence: "Tesla stock has fallen by 10.5 % over this weekend" → NEGATIVE (0.9999)

In [36]:
me = flair.data.Sentence(summaries['BTC'])

In [90]:
import pandas as pd

In [105]:
def news(summaries):
    probs = []
    sentiments = []
    for summmary in summaries:
        sentence = flair.data.Sentence(summmary)
        sentiment_model.predict(sentence)
        
        # print(sentence)
        # Extract sentiment prediction
        probs.append(sentence.labels[0].score)
        sentiments.append(sentence.labels[0].value)
        
    data = {'Probs': probs,
            'Sentiments': sentiments}
    probs_sentiment = pd.DataFrame(data)
    return probs_sentiment

In [106]:
Probs_Sentiments = {ticker:news(summaries[ticker]) for ticker in monitored_tickers}
Probs_Sentiments

{'GME':        Probs Sentiments
 0   0.998128   POSITIVE
 1   0.968114   NEGATIVE
 2   0.855187   NEGATIVE
 3   0.836615   POSITIVE
 4   0.972137   NEGATIVE
 5   0.618895   POSITIVE
 6   0.993245   POSITIVE
 7   0.949120   POSITIVE
 8   0.848616   POSITIVE
 9   0.972733   NEGATIVE
 10  0.998053   POSITIVE
 11  0.770321   POSITIVE,
 'TSLA':        Probs Sentiments
 0   0.998128   POSITIVE
 1   0.997564   NEGATIVE
 2   0.971101   POSITIVE
 3   0.787095   NEGATIVE
 4   0.972137   NEGATIVE
 5   0.991269   NEGATIVE
 6   0.995243   NEGATIVE
 7   0.968061   NEGATIVE
 8   0.999076   NEGATIVE
 9   0.795832   NEGATIVE
 10  0.999328   NEGATIVE
 11  0.995559   POSITIVE,
 'BTC':        Probs Sentiments
 0   0.999875   NEGATIVE
 1   0.991306   POSITIVE
 2   0.998128   POSITIVE
 3   0.999800   NEGATIVE
 4   0.972137   NEGATIVE
 5   0.936211   NEGATIVE
 6   0.999677   NEGATIVE
 7   0.722814   NEGATIVE
 8   0.998836   NEGATIVE
 9   0.999875   NEGATIVE
 10  0.948087   NEGATIVE
 11  0.999963   NEGATIVE}

In [143]:
def news(articles, summaries, cleaned_urls):
    probs = []
    sentiments = []
    for article in articles:
        sentence = flair.data.Sentence(article)
        sentiment_model.predict(sentence)
        
        # print(sentence)
        # Extract sentiment prediction
        probs.append(sentence.labels[0].score)
        sentiments.append(sentence.labels[0].value)
        
    data = {'Probs': probs,
            'Sentiments': sentiments,
            'Articles': articles,
            'Summaries': summaries,
            'URLs': cleaned_urls}
    return data

In [150]:
data = {ticker:news(articles[ticker], summaries[ticker], cleaned_urls[ticker]) for ticker in monitored_tickers}
Probs_Sentiments = pd.DataFrame(data)

In [151]:
Probs_Sentiments

Unnamed: 0,GME,TSLA,BTC
Probs,"[0.924636960029602, 0.9998990297317505, 0.9997...","[0.924636960029602, 0.9938585162162781, 0.9860...","[0.5979259610176086, 0.9932439923286438, 0.924..."
Sentiments,"[POSITIVE, NEGATIVE, NEGATIVE, NEGATIVE, POSIT...","[POSITIVE, NEGATIVE, NEGATIVE, NEGATIVE, POSIT...","[NEGATIVE, NEGATIVE, POSITIVE, NEGATIVE, POSIT..."
Articles,[Effective 5 January 2022 | Archived versions ...,[Effective 5 January 2022 | Archived versions ...,[Ether has flipped bitcoin for the first time ...
Summaries,[Terms of Service apply to all of our products...,[Terms of Service apply to all of our products...,[Deribit saw $5.7bn of ether options trades on...
URLs,"[https://policies.google.com/terms?hl=en-IN, h...","[https://policies.google.com/terms?hl=en-IN, h...",[https://finance.yahoo.com/news/ethereum-bitco...
