In [1]:
# Load the autoreload extension (re-imports edited modules) and autotime
# (prints the "time: ..." line that appears under every executed cell).
%load_ext autoreload
%load_ext autotime
# Mode 2: reload all modules before executing each cell.
%autoreload 2

time: 491 µs (started: 2021-11-02 14:27:04 -07:00)


In [2]:
import requests, math
from bs4 import BeautifulSoup
import spacy
import yfinance as yf
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import time

time: 1.85 s (started: 2021-11-02 14:27:04 -07:00)


## requests: Grabbing Headlines

In [3]:
# Fetch the MarketWatch "Top Stories" feed.
# The timeout keeps the notebook from hanging indefinitely on a stalled
# connection, and raise_for_status() fails loudly on a 4xx/5xx response
# instead of letting an error page be parsed as if it were the feed.
resp = requests.get("http://feeds.marketwatch.com/marketwatch/topstories/", timeout=10)
resp.raise_for_status()
resp

<Response [200]>

time: 426 ms (started: 2021-11-02 14:27:06 -07:00)


## bs4: Saving Headlines to a List

In [4]:
# Parse the feed response and collect every <title> element into a list.
# NOTE: entries 0 and 1 are the feed's own title ("MarketWatch.com - Top
# Stories"), so the first real headline sits at index 2.
soup = BeautifulSoup(resp.content, features='lxml')
# find_all is the current bs4 method name; findAll is a deprecated alias
# for the same search.
headlines = soup.find_all('title')
headlines

[<title>MarketWatch.com - Top Stories</title>,
 <title>MarketWatch.com - Top Stories</title>,
 <title>Washington Watch: ‘Big mistake’ for China to skip COP26 in-person climate summit, Biden says</title>,
 <title>Key Words: BlackRock’s Larry Fink warns that oil assets will shift to private hands to avoid scrutiny — and that’s ‘greenwashing’</title>,
 <title>: Bed Bath &amp; Beyond stock jumps 80% after online, in-store retail partnership with Kroger</title>,
 <title>NewsWatch: Zillow to stop flipping homes for good as it stands to lose more than $550 million, will lay off a quarter of staff</title>,
 <title>Earnings Results: Zillow to stop flipping homes for good as it stands to lose more than $550 million, will lay off a quarter of staff</title>,
 <title>Crypto: Can you buy Shiba Inu on Robinhood?</title>,
 <title>Earnings Results: T-Mobile tops earnings expectations but misses on revenue</title>,
 <title>: Biden promises decision ‘fairly quickly’ on Powell renomination, more Fed picks

time: 7.4 ms (started: 2021-11-02 14:27:07 -07:00)


## spaCy: Tests
Sanity-checking the spaCy pipeline on a single headline string before processing every headline.

### Loading Model

In [5]:
# Load the spaCy pipeline once; `nlp` is the callable applied to headline text
# in the cells that follow.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

time: 498 ms (started: 2021-11-02 14:27:07 -07:00)


### Processing a Sample Headline

In [6]:
# Run one headline through the spaCy pipeline and display the resulting Doc.
# Index 2 is used because entries 0 and 1 of `headlines` are the feed's own title.
sample_text = headlines[2].text
processed_hline = nlp(sample_text)
processed_hline

Washington Watch: ‘Big mistake’ for China to skip COP26 in-person climate summit, Biden says

time: 12.2 ms (started: 2021-11-02 14:27:07 -07:00)


### Tokenizing String

In [7]:
# Tokenization check: echo the raw <title> tag, then print each token spaCy
# produced for its text, one per line.
sample_tag = headlines[2]
processed_hline = nlp(sample_tag.text)
print(sample_tag)
for tok in processed_hline:
    print(tok)

<title>Washington Watch: ‘Big mistake’ for China to skip COP26 in-person climate summit, Biden says</title>
Washington
Watch
:
‘
Big
mistake
’
for
China
to
skip
COP26
in
-
person
climate
summit
,
Biden
says
time: 10.1 ms (started: 2021-11-02 14:27:07 -07:00)


## Getting Org Names From Headlines

In [8]:
# Collect the distinct entity strings that spaCy labels as organizations (ORG)
# across all headline titles. The set comprehension builds the de-duplicated
# result directly, so `companies` is a set from the start instead of a list
# that is later rebound to a set.
# NOTE(review): the displayed result includes non-companies ('Biden',
# 'Washington Watch') and a truncated name ('Bath & Beyond'), so treat these
# as candidates rather than a clean company list.
companies = {
    ent.text
    for title in headlines
    for ent in nlp(title.text).ents
    if ent.label_ == 'ORG'
}
companies

{'Bath & Beyond', 'Biden', 'Fed', 'Kroger', 'Washington Watch'}

time: 91.5 ms (started: 2021-11-02 14:27:07 -07:00)


## Scraping the S&P 500 List with BeautifulSoup

In [9]:
# Download the Wikipedia list of S&P 500 companies and locate its table.
# NOTE: this rebinds `resp` and `soup`, replacing the headline-feed objects
# created earlier in the notebook.
resp = requests.get(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
    timeout=10,  # do not hang indefinitely on a stalled connection
)
resp.raise_for_status()  # stop on an HTTP error instead of parsing an error page
soup = BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # find() returns None when nothing matches; raise a clear error here
    # rather than an AttributeError on the line below.
    raise ValueError("no <table class='wikitable sortable'> found on the S&P 500 page")
# checking first stock (row 0 is skipped -- presumably the header row)
print(table.find_all('tr')[1:2])

[<tr>
<td><a class="external text" href="https://www.nyse.com/quote/XNYS:MMM" rel="nofollow">MMM</a>
</td>
<td><a href="/wiki/3M" title="3M">3M</a></td>
<td><a class="external text" href="https://www.sec.gov/cgi-bin/browse-edgar?CIK=MMM&amp;action=getcompany" rel="nofollow">reports</a></td>
<td>Industrials</td>
<td>Industrial Conglomerates</td>
<td><a href="/wiki/Saint_Paul,_Minnesota" title="Saint Paul, Minnesota">Saint Paul, Minnesota</a></td>
<td>1976-08-09</td>
<td>0000066740</td>
<td>1902
</td></tr>]
time: 670 ms (started: 2021-11-02 14:27:07 -07:00)


### Ticker Symbols

In [10]:
# Ticker symbol = text of the first <td> in each row after row 0, with the
# surrounding whitespace (the cell text ends in a newline) stripped.
# find_all replaces the deprecated findAll alias; stripping happens in the
# same comprehension instead of a second list(map(lambda ...)) pass.
symbols = [row.find_all('td')[0].text.strip() for row in table.find_all('tr')[1:]]
symbols[:5]

['MMM', 'ABT', 'ABBV', 'ABMD', 'ACN']

time: 16.5 ms (started: 2021-11-02 14:27:08 -07:00)


### Names

In [11]:
# Company name = text of the second <td> in each row after row 0, with
# surrounding whitespace stripped.
# find_all replaces the deprecated findAll alias; stripping happens in the
# same comprehension instead of a second list(map(lambda ...)) pass.
names = [row.find_all('td')[1].text.strip() for row in table.find_all('tr')[1:]]
names[:5]

['3M', 'Abbott Laboratories', 'AbbVie', 'Abiomed', 'Accenture']

time: 16 ms (started: 2021-11-02 14:27:08 -07:00)
