In [23]:
# autoreload: lets edits to imported project modules take effect without a kernel restart
%load_ext autoreload
# autotime: prints the "time: ..." line shown after every executed cell
%load_ext autotime
# mode 2: reload all modules before running each cell
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1.35 ms (started: 2021-11-11 12:17:22 -08:00)


In [24]:
import sys, os, math
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
import yfinance as yf
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import time

# making the directory above this notebook importable so the `src` package resolves
# (despite its name, `gparent` is the parent directory, i.e. os.pardir)
gparent = os.pardir
sys.path.append(gparent)

from src import helper_functions as h

time: 1.01 ms (started: 2021-11-11 12:17:22 -08:00)


## requests: Grabbing Headlines

In [25]:
# grabbing headlines from the MarketWatch top-stories feed
# timeout: without one, requests waits indefinitely if the server never answers
r = requests.get("http://feeds.marketwatch.com/marketwatch/topstories/", timeout=10)
# stop here on a 4xx/5xx response instead of parsing an error page in the next cell
r.raise_for_status()
r

<Response [200]>

time: 118 ms (started: 2021-11-11 12:17:22 -08:00)


## bs4: Saving Headlines

In [26]:
# parsing the feed and collecting every <title> tag
# note: the first two entries are the feed's own title, not story headlines
soup = BeautifulSoup(r.content, features='lxml')
headlines = soup.find_all('title')
print(type(headlines), end='\n\n')
print(headlines)

<class 'bs4.element.ResultSet'>

[<title>MarketWatch.com - Top Stories</title>, <title>MarketWatch.com - Top Stories</title>, <title>Market Extra: Why the hottest inflation in 3 decades isn’t rattling stock-market bulls</title>, <title>BookWatch: Businesses subject job candidates to so many indignities — and then they wonder why they can’t find people?</title>, <title>Futures Movers: Oil logs a partial rebound as traders weigh prospects for supply and demand</title>, <title>The Margin: Long before it became a trial-interrupting ringtone, ‘God Bless the U.S.A.’ was known as a political and patriotic musical staple — and a Trump favorite</title>, <title>IPO Report: Brilliant Earth shines with analysts who say a digital and sustainable approach to jewelry appeals to millennials and Gen Z</title>, <title>The Ratings Game: Disney’s stock drops after most disappointing earnings report in 10 years prompts price target cuts, even by bullish analysts</title>, <title>The Ratings Game: McDonald’s

### Grabbing Headline String
Grabbing one headline and printing both the full tag and its isolated text.

In [27]:
# picking one story title to experiment on (index 2 skips the two feed-level titles)
test_headline = headlines[2]
# full tag first, then only the text inside it
print(test_headline, end='\n\n')
print(test_headline.text)

<title>Market Extra: Why the hottest inflation in 3 decades isn’t rattling stock-market bulls</title>

Market Extra: Why the hottest inflation in 3 decades isn’t rattling stock-market bulls
time: 1.24 ms (started: 2021-11-11 12:17:22 -08:00)


## spaCy: Tests
Testing tokenization and name extraction on a single string.

### Loading Model

In [28]:
# loading spacy model
# `nlp` is the pipeline object reused by every later cell that processes headline text
nlp = spacy.load('en_core_web_sm')

time: 482 ms (started: 2021-11-11 12:17:22 -08:00)


### Tokenizing String
Tokenizing the test headline.

In [29]:
# running the test headline through the pipeline and listing its tokens, one per line
processed_hline = nlp(test_headline.text)
print(test_headline, end='\n\n')
for tok in processed_hline:
    print(tok)

<title>Market Extra: Why the hottest inflation in 3 decades isn’t rattling stock-market bulls</title>

Market
Extra
:
Why
the
hottest
inflation
in
3
decades
is
n’t
rattling
stock
-
market
bulls
time: 9.53 ms (started: 2021-11-11 12:17:23 -08:00)


## Saving List of Tokenized Headlines

In [30]:
# running every scraped title through the pipeline (one processed document per title)
processed_hlines = [nlp(headline.text) for headline in headlines]
for hline in processed_hlines:
    print(hline)

MarketWatch.com - Top Stories
MarketWatch.com - Top Stories
Market Extra: Why the hottest inflation in 3 decades isn’t rattling stock-market bulls
BookWatch: Businesses subject job candidates to so many indignities — and then they wonder why they can’t find people?
Futures Movers: Oil logs a partial rebound as traders weigh prospects for supply and demand
The Margin: Long before it became a trial-interrupting ringtone, ‘God Bless the U.S.A.’ was known as a political and patriotic musical staple — and a Trump favorite
IPO Report: Brilliant Earth shines with analysts who say a digital and sustainable approach to jewelry appeals to millennials and Gen Z
The Ratings Game: Disney’s stock drops after most disappointing earnings report in 10 years prompts price target cuts, even by bullish analysts
The Ratings Game: McDonald’s was the only positive in a quarter full of negatives at Beyond Meat, analysts say
Distributed Ledger: Some hedge funds are seeing ether as an inflation hedge. A crypto 

## Getting Org Names From Headlines
Visualizing named entities (real-world objects) in the headlines and creating a set of organizations.

In [31]:
# pulling organization names (entities labelled 'ORG') from the headlines
companies = set()
for doc in processed_hlines:
    # every item in `processed_hlines` has already been through `nlp`, so its
    # entities are read directly instead of parsing the same text a second time
    if doc.ents:
        # render the headline with its entities highlighted
        displacy.render(doc, style='ent')
    # a set keeps each organization once, however many headlines mention it
    companies.update(ent.text for ent in doc.ents if ent.label_ == 'ORG')
print(companies)

{'McDonald', 'Bless the U.S.A.’'}
time: 114 ms (started: 2021-11-11 12:17:23 -08:00)


## Scraping S&P 500 Stock Table w/Requests & BeautifulSoup

In [32]:
# scraping S&P wikipedia page
# note: `r` and `soup` are rebound here, so the feed response and soup from the
# headline cells are no longer reachable under these names after this cell runs
# timeout: without one, requests waits indefinitely if the server never answers
r = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', timeout=10)
# stop here on a 4xx/5xx response instead of failing later with a confusing parse error
r.raise_for_status()

# parsing the html
soup = BeautifulSoup(r.text, 'lxml')

# extracting the table
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the line below
    raise ValueError("no 'wikitable sortable' table found; the page layout may have changed")

# printing row with first stock
print(table.find_all('tr')[1:2])

[<tr>
<td><a class="external text" href="https://www.nyse.com/quote/XNYS:MMM" rel="nofollow">MMM</a>
</td>
<td><a href="/wiki/3M" title="3M">3M</a></td>
<td><a class="external text" href="https://www.sec.gov/edgar/browse/?CIK=66740" rel="nofollow">reports</a></td>
<td>Industrials</td>
<td>Industrial Conglomerates</td>
<td><a href="/wiki/Saint_Paul,_Minnesota" title="Saint Paul, Minnesota">Saint Paul, Minnesota</a></td>
<td>1976-08-09</td>
<td>0000066740</td>
<td>1902
</td></tr>]
time: 496 ms (started: 2021-11-11 12:17:23 -08:00)


### Ticker Symbols
Grabbing the ticker symbol from the first cell of each row.

In [33]:
# making list of symbols: text of the first cell in every row after the header row
symbols = [tr.find_all('td')[0].text for tr in table.find_all('tr')[1:]]

# checking length
print(f'List Length: {len(symbols)} \n')

# checking first 5 symbols (still carrying the trailing newline from the cell text)
print(f'First five symbols: {symbols[:5]}')

List Length: 505 

First five symbols: ['MMM\n', 'ABT\n', 'ABBV\n', 'ABMD\n', 'ACN\n']
time: 14.8 ms (started: 2021-11-11 12:17:24 -08:00)


In [34]:
# removing surrounding whitespace (the trailing newline) from every symbol
symbols = [symbol.strip() for symbol in symbols]

# checking first 5 symbols
print(symbols[:5])

['MMM', 'ABT', 'ABBV', 'ABMD', 'ACN']
time: 1.04 ms (started: 2021-11-11 12:17:24 -08:00)


### Names
Grabbing the company name from the second cell of each row.

In [35]:
# making list of names: text of the second cell in every row after the header row
names = [tr.find_all('td')[1].text for tr in table.find_all('tr')[1:]]

# checking first five names
print(names[:5])

['3M', 'Abbott Laboratories', 'AbbVie', 'Abiomed', 'Accenture']
time: 14.9 ms (started: 2021-11-11 12:17:24 -08:00)


## S&P 500 Dataframe
Creating a data frame of stocks in the S&P 500 index.

In [36]:
# making a data dictionary: one column per scraped list
data = {
    'Company Name': names,
    'Symbol': symbols,
}

# building the data frame straight from the dictionary
stocks_df = pd.DataFrame(data)

# checking shape and first five rows
print(stocks_df.shape)
stocks_df.head()

(505, 2)


Unnamed: 0,Company Name,Symbol
0,3M,MMM
1,Abbott Laboratories,ABT
2,AbbVie,ABBV
3,Abiomed,ABMD
4,Accenture,ACN


time: 5.72 ms (started: 2021-11-11 12:17:24 -08:00)


### Checking yf Stock Info Dictionary Keys
Checking the dictionary keys available for pulling stock information.

In [37]:
# instantiating a ticker object
# (one symbol from the scraped list, used here only to inspect what yfinance exposes)
ACN = yf.Ticker('ACN')

time: 3.16 ms (started: 2021-11-11 12:17:24 -08:00)


In [None]:
# checking keys
# lists the field names available in the `.info` dictionary; the stock data cell
# further down reads several of them by name
ACN.info.keys()

### Data for Stocks in the News
Creating a data frame of price and dividend information for S&P 500 stocks in the news in the following steps:

- Create a data dictionary from the list of companies in the headlines.

- Create a data frame from the data dictionary.

In [None]:
# output column -> key holding that value in the yfinance `.info` dictionary
info_keys = {
    'currentPrice': 'currentPrice',
    'dayHigh': 'dayHigh',
    'dayLow': 'dayLow',
    '52wkHigh': 'fiftyTwoWeekHigh',
    '52wkLow': 'fiftyTwoWeekLow',
}

# creating empty stock info dictionary: one list per output column
stock_data = {
    column: [] for column in ['Company', 'Symbol', *info_keys, 'dividendRate']
}

# loading stocks from s&p dataframe and appending data from yf
for company in companies:
    # regex=False: the entity text is matched literally, so names containing
    # characters such as '.', '(' or '+' are not misread as a regular expression
    matches = stocks_df[stocks_df['Company Name'].str.contains(company, regex=False)]
    if matches.empty:
        continue

    # when several index members match, the first one is used
    first_match = matches.iloc[0]
    symbol = first_match['Symbol']

    try:
        stock_info = yf.Ticker(symbol).info
        # the whole row is assembled before anything is stored, so a failed
        # lookup can never leave the column lists with different lengths
        row = {'Company': first_match['Company Name'], 'Symbol': symbol}
        for column, key in info_keys.items():
            row[column] = stock_info[key]

        # converting dividend None types (or a missing key) to floats
        dividend = stock_info.get('dividendRate')
        row['dividendRate'] = 0.0 if dividend is None else dividend
    except Exception as err:
        # best effort: report which ticker failed and carry on with the next one
        print(f'Skipping {symbol}: {err!r}')
        continue

    for column, value in row.items():
        stock_data[column].append(value)

In [None]:
# checking dict
# displays the collected values (one list per column) before they are turned into a frame
stock_data

### Data Frame of S&P 500 Stocks in the News

In [None]:
# building a data frame from the collected stock data and previewing the first rows
in_the_news = pd.DataFrame(stock_data)
in_the_news.head()

In [None]:
# checking Dtypes
# (column dtypes and non-null counts of the in-the-news frame)
in_the_news.info()

## Individual Stock Price Helper Function
Checking the helper function.

In [None]:
# calling the `prices` helper from src.helper_functions with one ticker symbol
h.prices('MMM')

In [None]:
# calling the `prices` helper from src.helper_functions with a second ticker symbol
h.prices('AAPL')