<a href="https://colab.research.google.com/github/anvichip/automate-stocks-sentiments/blob/main/Automate_Stocks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One checkpoint identifier shared by the tokenizer and the seq2seq model,
# so the two can never drift apart.
MODEL_NAME = "human-centered-summarization/financial-summarization-pegasus"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
# Smoke test of the summarizer on a single text.
# No earlier cell defines ARTICLE, so on a fresh "Restart & Run All" this cell
# failed with NameError. Fall back to a short sample text when ARTICLE is not
# already defined; an ARTICLE set by the user beforehand is left untouched.
if 'ARTICLE' not in globals():
    ARTICLE = (
        "Shares of Example Corp rose 5% on Tuesday after the company reported "
        "quarterly revenue above analyst expectations and raised its full-year "
        "guidance, citing strong demand for its cloud services."
    )

input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')
# Beam search with 5 beams; the generated summary is capped at 55 tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)
summary

In [None]:
# Symbols to track; each one is used as a search term and as a dict key
# in every per-ticker mapping built below.
monitored_tickers = [
    'GME',
    'TSLA',
    'BTC',
]

In [None]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Return every link target found on a Google News search for a ticker.

    The query is "yahoo finance <ticker>" restricted to the News tab
    (``tbm=nws``). No filtering happens here: the result still contains
    Google's own navigation links and relative paths.

    Args:
        ticker: Search term appended to "yahoo finance", e.g. ``'GME'``.
        timeout: Seconds to wait for the response before ``requests`` raises
            a timeout error instead of blocking the notebook indefinitely.

    Returns:
        list[str]: The ``href`` value of each ``<a>`` tag, in page order.
    """
    # Let requests build the query string so tickers containing characters
    # such as '&' or spaces are URL-encoded instead of corrupting the URL.
    r = requests.get(
        "https://www.google.com/search",
        params={"q": "yahoo finance {}".format(ticker), "tbm": "nws"},
        timeout=timeout,
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those
    # with link['href'] raised KeyError.
    return [link['href'] for link in soup.find_all('a', href=True)]

In [None]:
# Unfiltered search-result links, keyed by ticker.
raw_urls = dict(
    zip(monitored_tickers, map(search_for_stock_news_urls, monitored_tickers))
)
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwimmtDP5cT9AhUXF1kFHWJuCa0QOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=hYsEZObDN5eu5NoP4tyl6Ao',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwimmtDP5cT9AhUXF1kFHWJuCa0Q_AUIBSgA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwimmtDP5cT9AhUXF1kFHWJuCa0Q_AUIBygC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwimmtDP5cT9AhUXF1kFHWJuCa0Q_AUICCgD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwimmtDP5cT9AhUXF1kFHWJuCa0Q_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwimmtDP5cT9AhUXF1kFHWJuCa0Q_AUICigF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwimmtDP5cT9AhUXF1kFHWJuCa0Q_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwimmtDP5cT9AhUXF1kFHWJuCa0QpwUIDQ',
  '/search?q=yahoo+finance+GME&ie=U

In [None]:
#raw_urls['GME']

In [None]:
import re
# Substrings that mark a link as Google chrome rather than a news article.
# 'www.google.com' is included because Google's own search links (e.g.
# https://www.google.com/search?q%3D...) otherwise pass the filter and get
# scraped and summarized as if they were articles.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support', 'www.google.com']
def strip_unwanted_urls(urls, exclude_list):
    """Extract the unique article URLs embedded in raw search-result hrefs.

    A raw href is kept only if it contains ``https://`` and none of the
    substrings in ``exclude_list``. From each kept href, the first
    ``http(s)://...`` run of non-whitespace characters is taken and cut at
    the first ``&`` to drop trailing tracking parameters.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings; an href containing any of them is dropped.

    Returns:
        list[str]: De-duplicated URLs in order of first appearance.
    """
    # A dict is used as an ordered set: the result is deterministic across
    # kernel restarts, unlike list(set(...)).
    cleaned = {}
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'(https?://\S+)', url)
        # A bare 'https://' with nothing after it has no match; skip it
        # instead of failing with IndexError.
        if match is None:
            continue
        cleaned[match.group(1).split('&')[0]] = None
    return list(cleaned)
# Filtered, de-duplicated article URLs, keyed by ticker.
cleaned_urls = {
    symbol: strip_unwanted_urls(raw_urls[symbol], exclude_list)
    for symbol in monitored_tickers
}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/wall-street-watchdog-set-adopt-054326135.html',
  'https://finance.yahoo.com/news/amc-entertainment-holdings-amc-stock-100647547.html',
  'https://finance.yahoo.com/news/wall-street-legend-bob-farrells-135201840.html',
  'https://finance.yahoo.com/news/activist-investor-ryan-cohen-buys-124223435.html',
  'https://finance.yahoo.com/news/clock-ticking-louder-stock-rally-212410089.html',
  'https://finance.yahoo.com/news/low-beta-high-performance-3-153503366.html',
  'https://finance.yahoo.com/news/hedge-funds-caught-bigger-squeeze-133434194.html',
  'https://finance.yahoo.com/news/why-the-ai-stock-rally-isnt-another-meme-craze-according-to-a-top-investor-193843100.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/2-high-risk-high-reward-143514698.html',
  'https://finance.yahoo.com/news/bed-bath-beyond-spike-sparks-134101172.html'],
 'TSLA': ['https://finance.yahoo.com/news

In [None]:
def scrape_and_process(urls, timeout=10):
  """Download each URL and return the first 350 words of its paragraph text.

  For every page, the text of all ``<p>`` tags is joined with single spaces,
  split on single spaces, truncated to 350 pieces and re-joined.

  Note: a page whose extracted text equals the placeholder message below is
  skipped, so the returned list can be shorter than ``urls`` and is then no
  longer index-aligned with it. create_output_array pairs summaries with
  URLs by index and is affected by this.

  Args:
    urls: Iterable of page URLs to fetch.
    timeout: Seconds to wait for each response before ``requests`` raises a
      timeout error instead of blocking the notebook indefinitely.

  Returns:
    list[str]: One truncated article text per page that was not skipped.
  """
  # Placeholder text that is filtered out (presumably an error page served
  # in place of the article -- confirm). Defined once, outside the loop.
  no = "Thank you for your patience. Our engineers are working quickly to resolve the issue."
  articles = []
  for url in urls:
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    text = [paragraph.text for paragraph in soup.find_all('p')]
    # Cap the input at 350 space-separated pieces to keep it short for the summarizer.
    words = ' '.join(text).split(' ')[:350]
    article = ' '.join(words)
    if article != no:
      articles.append(article)
  return articles

In [None]:
# Truncated article texts, keyed by ticker (same comprehension form as the
# other per-ticker mappings in this notebook).
articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ['By Douglas Gillison and Chris Prentice (Reuters) -Wall Street\'s top regulator on Wednesday adopted rules tightening the time-frame for stock trades in an effort to tamp down the kind of risk seen in 2021\'s GameStop fiasco, when retail investors suffered heavy losses. The U.S. Securities and Exchange Commission (SEC) also proposed changing rules protecting client assets held by investment managers, a move that could hinder cryptocurrency platforms from serving a key marketplace role. In a 3-2 vote, the SEC opted to shorten the time between when a securities order is placed and when a trade concludes -something officials say can lessen the kind of "systemic risk" spotlighted in early 2021 when the share price of the consumer electronics retailer GameStop Corp plummeted amid intense market volatility. Trade groups have broadly welcomed the commission\'s proposal to cut the so-called settlement cycle to a single business day from two, six years after an earlier SEC rule shorten

In [None]:
def summarize(articles):
    """Generate one short summary per article with the loaded seq2seq model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        articles: Iterable of article texts.

    Returns:
        list[str]: Decoded summaries, in the same order as ``articles``.
    """
    summaries = []
    for article in articles:
        # truncation=True clips over-long inputs to the tokenizer's configured
        # maximum length instead of feeding the model more tokens than it
        # accepts.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # Beam search with 5 beams; each summary is capped at 55 tokens.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [None]:
# One list of summaries per ticker, in the same order as that ticker's articles.
summaries = {
    symbol: summarize(articles[symbol])
    for symbol in monitored_tickers
}
summaries

{'GME': ['Agency also proposes new rules for advisers to protect client assets.',
  'Bireme Capital highlighted AMC Entertainment Holdings, Inc. in its Q4 2022 investor letter.',
  'Bob Farrell is a veteran Wall Street analyst and value investor. Below are his 10 Rules for success:',
  'Cohen is a top-five nonfamily shareholder of Nordstrom. Activist investor wants to replace at least one director',
  'S&P 500 nears half-way-point level that spelled doom for bulls in August. Hedge funds trim positions the most in two years',
  'Low beta stocks can be optimal for investors. High beta stocks can be optimal for some portfolios',
  'Goldman says hedge funds exited long positions at fastest pace since 2015.',
  "BlackRock's Rieder says market opportunity is bigger than meme stocks.",
  'All images are copyrighted.',
  'Two high-risk, high-reward stocks are on Cathie Wood’s radar. Verve Therapeutics’ pipeline includes two heart therapies',
  'Bed Bath & Beyond long a favorite of ‘meme stock’

In [None]:
from transformers import pipeline

# Pin the checkpoint and revision that the pipeline previously picked by
# default (both are named in its "No model was supplied" warning), so results
# stay reproducible if the library's default changes.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
# Spot check: classify the summaries of a single ticker. The pipeline returns
# one {'label', 'score'} dict per input summary.
sentiment(summaries['BTC'])

[{'label': 'NEGATIVE', 'score': 0.9867881536483765},
 {'label': 'POSITIVE', 'score': 0.9756914973258972},
 {'label': 'NEGATIVE', 'score': 0.9993788003921509},
 {'label': 'POSITIVE', 'score': 0.9870191812515259},
 {'label': 'NEGATIVE', 'score': 0.9880996346473694},
 {'label': 'NEGATIVE', 'score': 0.5586729049682617},
 {'label': 'NEGATIVE', 'score': 0.999204695224762},
 {'label': 'NEGATIVE', 'score': 0.9824326038360596}]

In [None]:
# Sentiment label and confidence for every summary, keyed by ticker.
scores = {
    symbol: sentiment(summaries[symbol])
    for symbol in monitored_tickers
}

scores

{'GME': [{'label': 'POSITIVE', 'score': 0.7255050539970398},
  {'label': 'POSITIVE', 'score': 0.9899953603744507},
  {'label': 'POSITIVE', 'score': 0.9988483190536499},
  {'label': 'POSITIVE', 'score': 0.9836245775222778},
  {'label': 'NEGATIVE', 'score': 0.998445451259613},
  {'label': 'NEGATIVE', 'score': 0.6279425621032715},
  {'label': 'NEGATIVE', 'score': 0.9970281720161438},
  {'label': 'POSITIVE', 'score': 0.9949323534965515},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.9682140946388245},
  {'label': 'POSITIVE', 'score': 0.9846209287643433}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.999440610408783},
  {'label': 'NEGATIVE', 'score': 0.9995900988578796},
  {'label': 'POSITIVE', 'score': 0.966428816318512},
  {'label': 'POSITIVE', 'score': 0.9978176355361938},
  {'label': 'NEGATIVE', 'score': 0.9988155364990234},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9607514142990112},
  {'label': 'NE

In [None]:
def create_output_array(summaries, scores, urls):
    """Flatten the per-ticker results into one row per summary.

    Tickers are taken from the keys of ``summaries`` (in insertion order), so
    the function depends only on its arguments rather than on a notebook
    global.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys, index-aligned with ``summaries[ticker]``.
        urls: dict mapping ticker -> list of URLs. Entry ``i`` is paired with
            summary ``i``; this assumes both lists are index-aligned, which
            the caller must ensure.

    Returns:
        list[list]: Rows of ``[ticker, summary, label, score, url]``.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or ``urls``.
        IndexError: If ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        for counter, summary in enumerate(ticker_summaries):
            result = ticker_scores[counter]
            output.append([
                ticker,
                summary,
                result['label'],
                result['score'],
                ticker_urls[counter],
            ])
    return output

In [None]:
# Header row first, then one row per summarized article.
final_output = [
    ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
    *create_output_array(summaries, scores, cleaned_urls),
]
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Agency also proposes new rules for advisers to protect client assets.',
  'POSITIVE',
  0.7255050539970398,
  'https://finance.yahoo.com/news/wall-street-watchdog-set-adopt-054326135.html'],
 ['GME',
  'Bireme Capital highlighted AMC Entertainment Holdings, Inc. in its Q4 2022 investor letter.',
  'POSITIVE',
  0.9899953603744507,
  'https://finance.yahoo.com/news/amc-entertainment-holdings-amc-stock-100647547.html'],
 ['GME',
  'Bob Farrell is a veteran Wall Street analyst and value investor. Below are his 10 Rules for success:',
  'POSITIVE',
  0.9988483190536499,
  'https://finance.yahoo.com/news/wall-street-legend-bob-farrells-135201840.html'],
 ['GME',
  'Cohen is a top-five nonfamily shareholder of Nordstrom. Activist investor wants to replace at least one director',
  'POSITIVE',
  0.9836245775222778,
  'https://finance.yahoo.com/news/activist-investor-ryan-cohen-buys-124223435.html'],
 ['GME',
  'S&P 500 nears ha

In [None]:
import pandas as pd

# final_output[0] is the header row: use it as the column names instead of
# storing it as a data row.
df = pd.DataFrame(final_output[1:], columns=final_output[0])
# index=False keeps pandas' integer row index out of the file; previously it
# was written as an extra leading column (including on the header line).
df.to_csv('file2.csv', index=False)