# 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers
!pip install bs4
!pip install sentencepiece
!pip install torch
!pip install ipywidgets



In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [3]:
model_name = "human-centered-summarization/financial-summarization-pegasus" # name of model
tokenizer = PegasusTokenizer.from_pretrained(model_name) # passing to tokenizer which allows us to encode our texts into an
                                                            #input format (translating to a format our model works with)
model = PegasusForConditionalGeneration.from_pretrained(model_name) # loads model

# 3. Unit Testing
   **We will perform a unit test by Summarize a Single Article. What we will do in this section is:**
   - Pass through a URL
   - Scrape the website
   - Summarize the website
   - Output Summary

In [4]:
# a. Pass through URL
url = "https://finance.yahoo.com/news/game-stop-stock-is-enduring-an-awful-september-180120497.html" # string holding url
r = requests.get(url) # make requests to url 
# b. Scrape Website
soup = BeautifulSoup(r.text, 'html.parser') # allows eaier processing of output
paragraphs = soup.find_all('p') # find_all paragraphs within url

In [6]:
# c. Summarize Website
text = [paragraph.text for paragraph in paragraphs] # for loop which extractes data from paragraph.text
words = ' '.join(text).split(' ')[:400] # summarize to 400 individual words by splitting
ARTICLE = ' '.join(words)

In [8]:
# d. Output Summary
input_ids = tokenizer.encode(ARTICLE, return_tensors = 'pt') # converts ARTICLE into encoded sequence
output = model.generate(input_ids, max_length = 55, num_beams = 5, early_stopping = True) # passing input_ids into model
summary = tokenizer.decode(output[0], skip_special_tokens = True) # uses tokenizer to decode into readable format

# 4. Building a News and Sentiment Pipeline

In [10]:
monitored_tickers = ['AAPL', 'TSLA', 'AMZN', 'NVDA']

# 4.1 Search for Stock News using Google and Yahoo Finance

In [11]:
def search_stock_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker) # base url
    r = requests.get(search_url) # requests url
    soup = BeautifulSoup(r.text, 'html.parser') # parse through url
    a_links = soup.find_all('a') # find links on page (a tag)
    hrefs = [link['href'] for link in a_links] # extracts hrefs links only
    return hrefs

In [12]:
raw_urls = {ticker:search_stock_urls(ticker) for ticker in monitored_tickers} # for loop storing urls in dictionary
raw_urls

{'AAPL': ['/?sa=X&ved=0ahUKEwiR16DErJnzAhWNMlkFHZa6AKoQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwiR16DErJnzAhWNMlkFHZa6AKoQPAgE',
  '/search?q=yahoo+finance+AAPL&tbm=nws&ie=UTF-8&gbv=1&sei=m61OYdGdF43l5NoPlvWC0Ao',
  '/search?q=yahoo+finance+AAPL&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiR16DErJnzAhWNMlkFHZa6AKoQ_AUIBygA',
  '/search?q=yahoo+finance+AAPL&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiR16DErJnzAhWNMlkFHZa6AKoQ_AUICSgC',
  '/search?q=yahoo+finance+AAPL&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiR16DErJnzAhWNMlkFHZa6AKoQ_AUICigD',
  '/search?q=yahoo+finance+AAPL&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiR16DErJnzAhWNMlkFHZa6AKoQ_AUICygE',
  'https://maps.google.com/maps?q=yahoo+finance+AAPL&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiR16DErJnzAhWNMlkFHZa6AKoQ_AUIDCgF',
  '/search?q=yahoo+finance+AAPL&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiR16DErJnzAhWNMlkFHZa6AKoQ_AUIDSgG',
  '/advanced_search',
  '/search?q=yahoo+finance+AAPL&ie=UTF-8&tbm=nws&source=lnt&tb

# 4.2 Strip Out Unwanted URLs

1. make sure there is an https in link
2. get rid of any unwanted google links like https://policies.google.com

In [13]:
import re

In [14]:
excluded_list = ['maps', 'policies', 'preferences', 'accounts', 'support'] # list that contains words that we want to exclude

In [15]:
def strip_unwanted_urls(urls, excluded_list): 
    val = [] # store temporary results
    for url in urls:
        if 'https://' in url and not any(excluded_word in url for excluded_word in excluded_list): # conditional for loop to filter loop
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0] # grabs url gets rid of unwanted portion of url
            val.append(res) # append url to temporary list
    return list(set(val))

In [16]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], excluded_list) for ticker in monitored_tickers} #storing cleaned urls in a dictionary
cleaned_urls

{'AAPL': ['https://finance.yahoo.com/news/apple-iphone-13-iphone-13-pro-review-132522532.html',
  'https://ca.finance.yahoo.com/news/epic-v-apple-ruling-a-black-eye-but-containable-risk-204252604.html',
  'https://ca.finance.yahoo.com/news/best-fall-foods-target-pumpkin-201220186.html',
  'https://ca.finance.yahoo.com/news/pumpkin-vs-apple-new-data-shows-which-fall-flavor-is-gaining-popularity-214122342.html',
  'https://uk.finance.yahoo.com/news/queues-return-apple-stores-iphone-081812150.html',
  'https://finance.yahoo.com/news/apple-inc-aapl-good-stock-213144227.html',
  'https://ca.finance.yahoo.com/news/apple-i-phone-13-goes-on-sale-230149888.html',
  'https://ca.finance.yahoo.com/news/nfl-apple-sunday-ticket-directv-193022880.html',
  'https://ca.finance.yahoo.com/news/3-warren-buffett-stocks-d-141525490.html',
  'https://ca.finance.yahoo.com/news/apple-stock-is-making-an-impressive-move-into-the-new-i-phone-reveal-173701881.html'],
 'TSLA': ['https://ca.finance.yahoo.com/news/he

# 4.3 Search and Scrape Cleaned URLs

In [17]:
def scrape_and_process(URLS):
    ARTICLES = []
    for url in URLS:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE) # append single summarized article into ARTICLES array
    return ARTICLES

In [18]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'AAPL': [' Apple’s (AAPL) iPhone 13 line hits the market Friday, and it’s bringing impressive changes to the smartphones’ cameras, batteries, and performance. The lineup includes the pint-size iPhone 13 mini, the iPhone 13, the iPhone 13 Pro, and the hand-stretching iPhone 13 Pro Max. As with the iPhone 12 lineup, pricing for the iPhone 13 family runs from $699 for the entry-level mini to $1,099 for the entry-level iPhone 13 Pro Max. But this time around you’ll get more storage out of the mini and base iPhone 13. That’s because instead of just 64GB, each iPhone now comes with a minimum of 128GB of space. After using the latest iPhones for a few days, I think it’s clear they’re worthwhile upgrades for anyone who’s rolling with anything other than an iPhone 12. If you do have a 12, then upgrading to the iPhone 13 isn’t a must — but you’ll still find welcome changes that may be worth your hard-earned cash. These changes include: Slick new camera features Faster processors Dramatically lo

# 4.4 Summarize All Articles (56:24)

In [19]:
def summarize(articles):
    summaries = []
    for article in articles: # loop through all articles
        input_ids = tokenizer.encode(article, return_tensors = 'pt') # create input ids
        output = model.generate(input_ids, max_length = 55, num_beams = 5, early_stopping = True) # using model to generate summary and storing in output
        summary = tokenizer.decode(output[0], skip_special_tokens = True) # decodes input ids to readable format
        summaries.append(summary)
    return summaries

In [20]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'AAPL': ['I’ve been using the new iPhones for a few days, and they’re worth upgrading. Apple’s ProMotion display technology is a welcome upgrade',
  'Judge says Apple must give developers more options. Apple says it will continue to operate a ‘fair and efficient’ marketplace',
  'Target’s fall foods section is pretty close to Trader Joe’s. Here’s a look at some of the best fall foods from the retailer',
  "Coffee chain's Pumpkin Spice Latte is most-talked about seasonal item.",
  'Four new handsets have improved battery life, cameras. Apple says iPhone 13 is ‘the best it has ever created’',
  'Is Apple a good stock to buy? Prominent investors were bullish at the end of the second quarter.',
  'Apple’s latest phones have smaller notch, bigger screens. Four new models will retail for £679 in the UK and $699 in the US',
  'We are aware of the issue and are working to resolve it.',
  'Suncor, Apple and Bank of Canada would be good stocks to own.',
  'The tech giant will hold a Sept. 14 ev

# 5. Adding Sentiment Analysis

In [21]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [22]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'AAPL': [{'label': 'POSITIVE', 'score': 0.9997009038925171},
  {'label': 'POSITIVE', 'score': 0.986458957195282},
  {'label': 'POSITIVE', 'score': 0.9994326233863831},
  {'label': 'POSITIVE', 'score': 0.9829902648925781},
  {'label': 'POSITIVE', 'score': 0.9995183944702148},
  {'label': 'NEGATIVE', 'score': 0.99893718957901},
  {'label': 'POSITIVE', 'score': 0.902543842792511},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.8983643054962158},
  {'label': 'POSITIVE', 'score': 0.9924111366271973}],
 'TSLA': [{'label': 'POSITIVE', 'score': 0.9972322583198547},
  {'label': 'NEGATIVE', 'score': 0.9862992167472839},
  {'label': 'NEGATIVE', 'score': 0.9903510212898254},
  {'label': 'NEGATIVE', 'score': 0.989510715007782},
  {'label': 'NEGATIVE', 'score': 0.9874534010887146},
  {'label': 'NEGATIVE', 'score': 0.995489776134491},
  {'label': 'NEGATIVE', 'score': 0.9289106130599976},
  {'label': 'NEGATIVE', 'score': 0.997568666934967},
  {'label': 'NEGAT

# 6. Exporting Results to CSV

In [23]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers: # loop through monitored_tickers
        for counter in range(len(summaries[ticker])): # inner loop which loops through the range of monitored_tickers
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ] # create an array that bundles all information together
            output.append(output_this)
    return output

In [24]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['AAPL',
  'I’ve been using the new iPhones for a few days, and they’re worth upgrading. Apple’s ProMotion display technology is a welcome upgrade',
  'POSITIVE',
  0.9997009038925171,
  'https://finance.yahoo.com/news/apple-iphone-13-iphone-13-pro-review-132522532.html'],
 ['AAPL',
  'Judge says Apple must give developers more options. Apple says it will continue to operate a ‘fair and efficient’ marketplace',
  'POSITIVE',
  0.986458957195282,
  'https://ca.finance.yahoo.com/news/epic-v-apple-ruling-a-black-eye-but-containable-risk-204252604.html'],
 ['AAPL',
  'Target’s fall foods section is pretty close to Trader Joe’s. Here’s a look at some of the best fall foods from the retailer',
  'POSITIVE',
  0.9994326233863831,
  'https://ca.finance.yahoo.com/news/best-fall-foods-target-pumpkin-201220186.html'],
 ['AAPL',
  "Coffee chain's Pumpkin Spice Latte is most-talked about seasonal item.",
  'POSITIVE',
  0.9829902648925781,
  'https://ca.finance.yahoo.com/news/pumpkin-vs-apple-new-

In [25]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Score', 'Confidence', 'URL'])
final_output

[['Ticker', 'Summary', 'Label', 'Score', 'Confidence', 'URL'],
 ['AAPL',
  'I’ve been using the new iPhones for a few days, and they’re worth upgrading. Apple’s ProMotion display technology is a welcome upgrade',
  'POSITIVE',
  0.9997009038925171,
  'https://finance.yahoo.com/news/apple-iphone-13-iphone-13-pro-review-132522532.html'],
 ['AAPL',
  'Judge says Apple must give developers more options. Apple says it will continue to operate a ‘fair and efficient’ marketplace',
  'POSITIVE',
  0.986458957195282,
  'https://ca.finance.yahoo.com/news/epic-v-apple-ruling-a-black-eye-but-containable-risk-204252604.html'],
 ['AAPL',
  'Target’s fall foods section is pretty close to Trader Joe’s. Here’s a look at some of the best fall foods from the retailer',
  'POSITIVE',
  0.9994326233863831,
  'https://ca.finance.yahoo.com/news/best-fall-foods-target-pumpkin-201220186.html'],
 ['AAPL',
  "Coffee chain's Pumpkin Spice Latte is most-talked about seasonal item.",
  'POSITIVE',
  0.9829902648925

In [26]:
import csv # import csv
with open('assetsummaries.csv', mode = 'w', newline = '') as f: # write file as f and name file assetsummaries
    csv_writer = csv.writer(f, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_MINIMAL) # create a new csv_write and pass through specified parameters
    csv_writer.writerows(final_output)