### Installing and importing Depepndencies

In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration  #tokenizer and model
from bs4 import BeautifulSoup #for scraping
import requests  #for requesting web

### Loading Summarization Model

In [3]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name) 

Downloading spiece.model: 100%|██████████| 1.91M/1.91M [00:00<00:00, 2.43MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.34k/1.34k [00:00<00:00, 191kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 1.44k/1.44k [00:00<00:00, 151kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.27k/1.27k [00:00<00:00, 203kB/s]
Downloading model.safetensors: 100%|██████████| 2.28G/2.28G [05:23<00:00, 7.03MB/s]
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Summarize a Single article

In [6]:
url = "https://finance.yahoo.com/news/millennial-plumber-got-raise-45-110000225.html"
req = requests.get(url) #making request to get info from the url
soup = BeautifulSoup(req.text, "html.parser") #to get html elements from the webpage
news_paragraphs = soup.find_all('p') #paragraphs- getting <p> </p> tags

In [8]:
req
#req.txt #contains everything from the webpage

<Response [200]>

In [10]:
news_paragraphs[0].text

'Robert Jenkins’ first job as a certified plumber was a dream. After a four-year apprenticeship during which he rotated among four companies, he was hired by a local plumber with a reputation for good pay and a great culture.'

In [18]:
text = [p.text for p in news_paragraphs]
##getting 400 words from the news_paragraphs
words = ' '.join(text).split(' ')[:400]  #getting 400 words from word list
article = ' '.join(words) #converting list to string
print(len(words),article)

400 Robert Jenkins’ first job as a certified plumber was a dream. After a four-year apprenticeship during which he rotated among four companies, he was hired by a local plumber with a reputation for good pay and a great culture. “It was great,” he recalls, “pay was good, and they supplied trucks and tools, at first.” Jenkins made about $40 hourly servicing small businesses and homes around Eugene, Ore., with trucks and tools provided. A few months after he started, Jenkins, 36, saw the company raise everyone’s pay across the board, to $45 an hour. That’s when the trouble started. Jenkins says the company stopped supplying tools for its workers: “I was told we get paid enough to buy our own tools,” he said. It’s legal in Oregon for companies to make workers buy their own tools, as long as they make over minimum wage—but this was a change to Jenkins’ working conditions, and he felt cheated. He recalled one job that would have required him to buy a press gun to join pipes together—not a c

In [19]:
input_ids = tokenizer.encode(article, return_tensors='pt') #encoding to input ids
output = model.generate(input_ids, max_length = 55, num_beams = 5, early_stopping= True) #summary length can be adjusted using max_len, num_beams for beam search num_beams= number of most likely possibilities to be considered for generating sentence instead of greedy search, ealry stpping when model reaches certain level of accuracy
summary = tokenizer.decode(output[0], skip_special = True)

In [20]:
print(summary)

<pad>Workers are more likely to quit if they’re promoted, research shows.</s>


### Building a Sentiment Analysis Pipeline

In [22]:
tickers = ['AAL', 'RBLX', 'F']

In [23]:
def search_news_url(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    a_tags = soup.find_all('a')  #a tags represent links - list of a tags
    hrefs = [link['href'] for link in a_tags]
    return hrefs

In [27]:
raw_links = {t: search_news_url(t) for t in tickers}

In [30]:
raw_links.keys()

dict_keys(['AAL', 'RBLX', 'F'])

In [32]:
raw_links['RBLX']

['/?sa=X&ved=0ahUKEwiYn_ei2qeBAxXWqJUCHRl4Bk0QOwgC',
 '/search?q=yahoo+finance+RBLX&tbm=nws&sca_esv=565014946&ie=UTF-8&gbv=1&sei=yboBZdiWAdbR1sQPmfCZ6AQ',
 '/search?q=yahoo+finance+RBLX&sca_esv=565014946&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiYn_ei2qeBAxXWqJUCHRl4Bk0Q_AUIBSgA',
 '/search?q=yahoo+finance+RBLX&sca_esv=565014946&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiYn_ei2qeBAxXWqJUCHRl4Bk0Q_AUIBygC',
 '/search?q=yahoo+finance+RBLX&sca_esv=565014946&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiYn_ei2qeBAxXWqJUCHRl4Bk0Q_AUICCgD',
 'https://maps.google.com/maps?q=yahoo+finance+RBLX&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiYn_ei2qeBAxXWqJUCHRl4Bk0Q_AUICSgE',
 '/search?q=yahoo+finance+RBLX&sca_esv=565014946&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiYn_ei2qeBAxXWqJUCHRl4Bk0Q_AUICigF',
 '/search?q=yahoo+finance+RBLX&sca_esv=565014946&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiYn_ei2qeBAxXWqJUCHRl4Bk0Q_AUICygG',
 '/advanced_search',
 '/search?q=yahoo+finance+RBLX&sca_esv=565014946&ie=UTF-

### Strping unwanted urls

In [31]:
import re

In [33]:
#exclude links with certain words
exclude_keywords = ['maps', 'policies', 'prefrences', 'accounts', 'support']

In [40]:
def remove_unwanted_url(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [41]:
cleaned_urls = {ticker:remove_unwanted_url(raw_links[ticker], exclude_keywords) for ticker in tickers}
cleaned_urls

{'AAL': ['https://finance.yahoo.com/news/most-airline-stocks-hurt-during-130800070.html',
  'https://finance.yahoo.com/news/5-airline-stocks-buy-buoyant-121500312.html',
  'https://finance.yahoo.com/news/busy-labor-day-weekend-awaits-121000635.html',
  'https://finance.yahoo.com/news/3-airline-stocks-watch-amid-131800982.html',
  'https://uk.finance.yahoo.com/news/ftse-100-glencore-anglo-american-china-data-152834735.html',
  'https://finance.yahoo.com/news/gbp-jpy-forecast-british-pound-140242190.html',
  'https://finance.yahoo.com/news/american-airlines-aal-gains-market-215020513.html',
  'https://finance.yahoo.com/news/insurance-industry-faces-average-annual-081200418.html',
  'https://www.google.com/preferences?hl=en-IN',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BAAL%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/10-travel-stocks-billionaires-loading-192322831.html',
  'https://finance.yahoo.com/news/airline-stock-roundup-aals-bearish-154000916.html'],
 'RB