# 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
                                              0.0/7.2 MB ? eta -:--:--
                                              0.0/7.2 MB ? eta -:--:--
                                              0.0/7.2 MB 640.0 kB/s eta 0:00:12
                                              0.0/7.2 MB 640.0 kB/s eta 0:00:12
                                              0.0/7.2 MB 640.0 kB/s eta 0:00:12
                                              0.0/7.2 MB 640.0 kB/s eta 0:00:12
                                              0.0/7.2 MB 640.0 kB/s eta 0:00:12
                                              0.0/7.2 MB 122.9 kB/s eta 0:00:58
                                              0.1/7.2 MB 229.0 kB/s eta 0:00:31
                                              0.1/7.2 MB 249.8 kB/s eta 0:00:29
                                              0.1/7.2 MB 288.1 kB/s eta 0:00:25
                                              0.2/7.2 

In [2]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from bs4 import BeautifulSoup
import requests
import re
from transformers import pipeline
import csv

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


# 2. Setup Summarization Model

In [3]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading spiece.model: 100%|██████████| 1.91M/1.91M [00:16<00:00, 113kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.34k/1.34k [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 100%|██████████| 1.44k/1.44k [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.27k/1.27k [00:00<?, ?B/s]


ImportError: 
PegasusForConditionalGeneration requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


# 3. Building a News and Sentiment Pipeline

In [None]:
monitored_tickers = ['MBG']

## 3.1 Search for Stock News using Google and Finanzen.net

In [None]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=finanzen.net+{}&tbm=nws".format(ticker)
    #search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36' }
    r = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    #atags = soup.find_all('a')
    atags = soup.find_all('a', href=True)
    hrefs = [link['href'] for link in atags]
    return hrefs

raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}

raw_urls


{'MBG': ['/search?q=finanzen.net+MBG&tbm=nws&gbv=1&sei=iAZiZMfwGdeLwbkP8LOQ6AE',
  'https://support.google.com/websearch/answer/181196?hl=en',
  'https://www.google.com/webhp?hl=en&sa=X&ved=0ahUKEwjH1LyBjPf-AhXXRTABHfAZBB0QPAgI',
  '#',
  '/search?q=finanzen.net+MBG&source=lnms&sa=X&ved=2ahUKEwjH1LyBjPf-AhXXRTABHfAZBB0Q0pQJegQIBhAC',
  '/search?q=finanzen.net+MBG&tbm=isch&source=lnms&sa=X&ved=2ahUKEwjH1LyBjPf-AhXXRTABHfAZBB0Q0pQJegQIBhAG',
  '/search?q=finanzen.net+MBG&tbm=vid&source=lnms&sa=X&ved=2ahUKEwjH1LyBjPf-AhXXRTABHfAZBB0Q0pQJegQIBhAI',
  '/search?q=finanzen.net+MBG&tbm=shop&source=lnms&sa=X&ved=2ahUKEwjH1LyBjPf-AhXXRTABHfAZBB0Q0pQJegQIBhAK',
  'https://www.finanzen.net/nachricht/aktien/vorlaeufige-zahlen-mercedes-benz-aktie-gesucht-mercedes-benz-faehrt-in-kerngeschaeften-mehr-ergebnis-ein-als-gedacht-12376554',
  'https://www.finanzen.net/nachricht/aktien/ukraine-krieg-im-fokus-mercedes-benz-aktie-knapp-in-gruen-mercedes-benz-trennt-sich-von-anteilen-an-russischen-tochtergesel

## 3.2 Strip out unwanted URLs

In [None]:
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [None]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}

cleaned_urls


{'MBG': ['https://www.finanzen.net/nachricht/cfd/dax-schwache-ubs-quartalszahlen-trueben-die-stimmung-in-europa-12388562',
  'https://www.finanzen.net/nachricht/cfd/dax-profitiert-von-der-positiven-sitzung-in-asien-12305504',
  'https://www.finanzen.net/nachricht/aktien/vorlaeufige-zahlen-mercedes-benz-aktie-gesucht-mercedes-benz-faehrt-in-kerngeschaeften-mehr-ergebnis-ein-als-gedacht-12376554',
  'https://www.finanzen.net/nachricht/cfd/aktienrueckkauf-treibt-mercedes-benz-aktienkurs-12177450',
  'https://www.finanzen.net/nachricht/aktien/ukraine-krieg-im-fokus-mercedes-benz-aktie-knapp-in-gruen-mercedes-benz-trennt-sich-von-anteilen-an-russischen-tochtergesellschaften-12389478',
  'https://www.finanzen.net/nachricht/aktien/insider-kauft-daimler-truck-aktien-12263689',
  'https://www.finanzen.net/analyse/mercedes-benz_group_ex_daimler_kaufen-dz_bank_872458',
  'https://www.google.com/webhp?hl=en',
  'https://newsinitiative.withgoogle.com/hownewsworks/?fg=1',
  'https://www.finanzen.net

## 3.3 Search and Scrape Cleaned URLs

In [None]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}

articles

{'MBG': ['Neu auf finanzen.net? \nKostenfrei registrieren und Vorteile nutzen\n Der heutige Handelstag an den europÃ¤ischen BÃ¶rsen bringt eine deutliche Verschlechterung der Anlegerstimmung als Reaktion auf den schwachen Handel in Asien und den gestrigen Schlusskurs an der Wall Street. Der deutsche DAX notiert derzeit auf neuen WochentiefststÃ¤nden. Die Aufmerksamkeit der Anleger richtet sich heute auf die Quartalsergebnisse der europÃ¤ischen Banken mit den Berichten von UBS (UBSG.CH) und Santander (SAN1.ES).  Die Stimmung in Europa ist wÃ¤hrend des Handels am Dienstag eindeutig schlecht. Quelle: xStation5 von XTB Die Aufmerksamkeit der Anleger in Europa richtet sich heute vor allem auf die Quartalsberichte der europÃ¤ischen GroÃ\x9fbanken wie Santander (SAN1.ES) und UBS (UBSG.CH). Die Ergebnisse der spanischen Santander Ã¼bertrafen zwar die Erwartungen der Analysten, aber die Aktien der Bank verlieren dennoch aufgrund starker KundenabgÃ¤nge und eines etwas enttÃ¤uschenden Nettozinser

## 3.4 Summarise all Articles

In [None]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article,truncation=True, padding="longest", return_tensors='pt')
        #input_ids = tokenizer.encode_plus(article, add_special_tokens=True, max_length=512, truncation=True, padding="max_length")
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}

summaries

{'MBG': ['Die Aufmerksamkeit der Anlegerstimmung richtet sich bei Santander, UBS-Bericht.',
  'Die Aktien der einzelnen legen dynamisch erwarteten.',
  'Mercedes-Benz waren erschieen, sodass wegen Inflation, steigenden Zinsen.',
  'Mercedes-Benz Programm selbst im Mrz 2023 beginnen.',
  'Die Mercedes-Aktieinnt gewinnt am XETRA-Handel zeitweise 0,16 Prozent.',
  'Daimler Truck-Aktie gab am Tag der offiziellen BaFin-News im FSE-Handel.',
  'DZ Bank sagt sich am Dienstag in seinen Zielen bertroffen.',
  'All images are copyrighted.',
  'Google News helps you stay up-to-date with the stories that matter most.',
  'BMN-Stand der Mercedes-Aktie erwarteten Anstieg von 22,05 EUR.',
  'Mercedes-Benz wegen geblich illegaler Abgastechnik auf Schadenersatz.',
  'Die Anleger richtet bereits erste Ergebnisbericht der SAP SE.']}

# 4. Adding Sentiment Analysis

In [None]:
sentiment = pipeline('sentiment-analysis')
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}

scores

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


'\nJust for checking & trouble shooting\nscores\n'

# 5. Exporting Results to CSV

In [None]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

final_output = create_output_array(summaries, scores, cleaned_urls)
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

"""
Just for checking & trouble shooting
final_output
"""
with open('output.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)