In [1]:
%load_ext autoreload
%load_ext autotime
%autoreload 2

time: 444 µs (started: 2021-11-10 13:06:35 -08:00)


In [2]:
import sys, os, math
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
import yfinance as yf
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import time

# Put the parent directory (relative to the current working directory) on the
# module search path so the project-local `src` package imported below resolves.
gparent = os.pardir
# Guard the append: re-running this cell should not stack duplicate entries.
if gparent not in sys.path:
    sys.path.append(gparent)

from src import helper_functions as h

time: 15.1 s (started: 2021-11-10 13:06:35 -08:00)


## requests: Grabbing Headlines

In [3]:
# grabbing headlines from the MarketWatch top-stories feed
# timeout: without one, requests can wait indefinitely on a stalled connection
r = requests.get("http://feeds.marketwatch.com/marketwatch/topstories/", timeout=10)
# stop here on a 4xx/5xx reply instead of parsing an error page downstream
r.raise_for_status()
r

<Response [200]>

time: 380 ms (started: 2021-11-10 13:06:50 -08:00)


## bs4: Saving Headlines

In [4]:
# parse the response body and collect every <title> element it contains
soup = BeautifulSoup(r.content, features='lxml')
headlines = soup.find_all('title')
# show the container type, a blank line, then the collected tags
print(type(headlines), end='\n\n')
print(headlines)

<class 'bs4.element.ResultSet'>

[<title>MarketWatch.com - Top Stories</title>, <title>MarketWatch.com - Top Stories</title>, <title>The Ratings Game: Poshmark shares plunge as analysts express optimism about resale but call quarterly results ‘disappointing’</title>, <title>: WW banks on new weight loss program after third-quarter slump</title>, <title>Bond Report: Ten-year Treasury yield sees largest one-day rise in a year as U.S. inflation hits a 31-year high</title>, <title>Living With Climate Change: Solar tax credits and heat pump rebates: All the ways Build Back Better would incentivize cleaner energy at home</title>, <title>: SEC blocks inflation-protected crypto token</title>, <title>: ‘Inflation hurts Americans’ pocketbooks’: How to stretch your dollar as CPI hits 31-year high — and how to negotiate lower prices</title>, <title>: Unvaccinated people will hear you out on the COVID-19 shot — as long as you don’t work for the government</title>, <title>: Mastercard delivers ‘stro

### Grabbing Headline String
Grabbing a single headline and printing both the raw tag and its isolated text.

In [5]:
# take the entry at index 2 as the test case, then show the full tag
# followed by only the text inside it
test_headline = headlines[2]
print(test_headline, end='\n\n')
print(test_headline.get_text())

<title>The Ratings Game: Poshmark shares plunge as analysts express optimism about resale but call quarterly results ‘disappointing’</title>

The Ratings Game: Poshmark shares plunge as analysts express optimism about resale but call quarterly results ‘disappointing’
time: 1.05 ms (started: 2021-11-10 13:06:51 -08:00)


## spaCy: Tests
Testing tokenization and name extraction on a single string.

### Loading Model

In [6]:
# loading spacy model
# `nlp` is the pipeline object that later cells call on headline text
nlp = spacy.load('en_core_web_sm')

time: 684 ms (started: 2021-11-10 13:06:51 -08:00)


### Tokenizing String
Tokenizing the test headline.

In [7]:
# run the test headline through the pipeline, echo the raw tag,
# then list the resulting tokens one per line
processed_hline = nlp(test_headline.text)
print(test_headline, end='\n\n')
for tok in processed_hline:
    print(tok)

<title>The Ratings Game: Poshmark shares plunge as analysts express optimism about resale but call quarterly results ‘disappointing’</title>

The
Ratings
Game
:
Poshmark
shares
plunge
as
analysts
express
optimism
about
resale
but
call
quarterly
results
‘
disappointing
’
time: 13.2 ms (started: 2021-11-10 13:06:52 -08:00)


## Saving List of Tokenized Headlines

In [8]:
# run every headline's text through the pipeline; nlp.pipe consumes the texts
# as a stream instead of one separate nlp() call per index
processed_hlines = list(nlp.pipe(tag.text for tag in headlines))
for line in processed_hlines:
    print(line)

MarketWatch.com - Top Stories
MarketWatch.com - Top Stories
Where Should I Retire?: I never expected to retire to Panama — but we are living ‘very comfortably’ on $1,200 a month
Living With Climate Change: 3 ways to shrink your carbon footprint the next time you’re grocery shopping
The Conversation: What the Fed’s taper means for your pocketbook
Tax Guy: ‘Don’t give away loser shares’: It’s not too late to reduce your 2021 tax bill. Here’s how.
Outside the Box: A growing number of retirees are heading to work — as entrepreneurs
Project Syndicate: The policy pendulum has swung wildly from austerity to euphoria and back to austerity again
: Mortgage rates fall for the first time in weeks — even as the Fed decides to roll back stimulus
NewsWatch: ‘It’s a melt-up’: U.S. stocks are on an unusually strong run heading into the holidays
Next Avenue: Many employers are offering new and better benefits—how to choose the right ones for next year
Encore: Are teachers overpaid or underpaid?
time: 9

## Getting Org Names From Headlines
Visualizing named entities (real-world objects) in the headlines and creating a set of organizations.

In [8]:
# pulling organization names from the headlines
# `processed_hlines` already holds processed spaCy documents, so their
# entities are read directly rather than re-running nlp() on each text
companies = set()
for doc in processed_hlines:
    # only render headlines that actually contain named entities
    if doc.ents:
        displacy.render(doc, style='ent')
    # keep the text of every entity labelled as an organization
    companies.update(ent.text for ent in doc.ents if ent.label_ == 'ORG')
print(companies)

{'SEC', 'Mastercard', 'Treasury', 'McDonald'}
time: 127 ms (started: 2021-11-10 13:06:52 -08:00)


## Scraping S&P 500 Stock Table w/Requests & BeautifulSoup

In [9]:
# scraping S&P wikipedia page
# timeout keeps a stalled connection from hanging the notebook
r = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', timeout=10)
# stop on an HTTP error status instead of parsing an error page
r.raise_for_status()

# parsing the html
soup = BeautifulSoup(r.text, 'lxml')

# extracting the table
table = soup.find('table', {'class': 'wikitable sortable'})
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell
if table is None:
    raise ValueError("no 'wikitable sortable' table found on the S&P 500 page")

# printing row with first stock
print(table.find_all('tr')[1:2])

[<tr>
<td><a class="external text" href="https://www.nyse.com/quote/XNYS:MMM" rel="nofollow">MMM</a>
</td>
<td><a href="/wiki/3M" title="3M">3M</a></td>
<td><a class="external text" href="https://www.sec.gov/edgar/browse/?CIK=66740" rel="nofollow">reports</a></td>
<td>Industrials</td>
<td>Industrial Conglomerates</td>
<td><a href="/wiki/Saint_Paul,_Minnesota" title="Saint Paul, Minnesota">Saint Paul, Minnesota</a></td>
<td>1976-08-09</td>
<td>0000066740</td>
<td>1902
</td></tr>]
time: 900 ms (started: 2021-11-10 13:06:52 -08:00)


### Ticker Symbols
Grabbing the ticker symbol from the first cell of each row.

In [10]:
# ticker symbol = text of the first <td> in every row after the first <tr>
symbols = []
for tr in table.find_all('tr')[1:]:
    first_cell = tr.find_all('td')[0]
    symbols.append(first_cell.text)

# how many symbols were collected
print(f'List Length: {len(symbols)} \n')

# preview of the first five
print(f'First five symbols: {symbols[:5]}')

['MMM\n', 'ABT\n', 'ABBV\n', 'ABMD\n', 'ACN\n']
time: 16.9 ms (started: 2021-11-10 13:06:53 -08:00)


In [11]:
# remove leading/trailing whitespace (the trailing newline) from each symbol
symbols = [sym.strip() for sym in symbols]

# preview of the first five
print(symbols[:5])

['MMM', 'ABT', 'ABBV', 'ABMD', 'ACN']
time: 1.26 ms (started: 2021-11-10 13:06:53 -08:00)


### Names
Grabbing the company name from the second cell of each row.

In [12]:
# company name = text of the second <td> in every row after the first <tr>
names = []
for tr in table.find_all('tr')[1:]:
    names.append(tr.find_all('td')[1].text)

# preview of the first five names
print(names[:5])

['3M', 'Abbott Laboratories', 'AbbVie', 'Abiomed', 'Accenture']
time: 14.5 ms (started: 2021-11-10 13:06:53 -08:00)


### Industries
Grabbing the company industry from the fifth cell of each row.

In [13]:
# industry = text of the fifth <td> in every row after the first <tr>
industries = []
for tr in table.find_all('tr')[1:]:
    industries.append(tr.find_all('td')[4].text)

# preview of the first five industries
print(industries[:5])

['Industrial Conglomerates', 'Health Care Equipment', 'Pharmaceuticals', 'Health Care Equipment', 'IT Consulting & Other Services']
time: 13.5 ms (started: 2021-11-10 13:06:53 -08:00)


## S&P 500 Dataframe
Creating a data frame of stocks in the S&P 500 index.

In [14]:
# column label -> list of scraped values
data = {
    'Company Name': names,
    'Symbol': symbols,
    'Industry': industries,
}

# one column per dictionary key
stocks_df = pd.DataFrame(data)

# shape first, then a five-row preview
print(stocks_df.shape)
stocks_df.head()

(505, 3)


Unnamed: 0,Company Name,Symbol,Industry
0,3M,MMM,Industrial Conglomerates
1,Abbott Laboratories,ABT,Health Care Equipment
2,AbbVie,ABBV,Pharmaceuticals
3,Abiomed,ABMD,Health Care Equipment
4,Accenture,ACN,IT Consulting & Other Services


time: 17.3 ms (started: 2021-11-10 13:06:53 -08:00)


### Checking yf Stock Info Dictionary Keys
Checking the dictionary keys available for pulling stock information.

In [15]:
# instantiating a ticker object
# 'ACN' is one of the symbols in the scraped table; the object is used in the
# next cell to inspect which keys its `.info` dictionary exposes
ACN = yf.Ticker('ACN')

time: 2.93 ms (started: 2021-11-10 13:06:53 -08:00)


In [16]:
# checking keys
# these key names (e.g. 'currentPrice') are what the stock-data cell below
# looks up in each ticker's `.info` dictionary
ACN.info.keys()

dict_keys(['zip', 'sector', 'fullTimeEmployees', 'longBusinessSummary', 'city', 'phone', 'country', 'companyOfficers', 'website', 'maxAge', 'address1', 'fax', 'industry', 'address2', 'ebitdaMargins', 'profitMargins', 'grossMargins', 'operatingCashflow', 'revenueGrowth', 'operatingMargins', 'ebitda', 'targetLowPrice', 'recommendationKey', 'grossProfits', 'freeCashflow', 'targetMedianPrice', 'currentPrice', 'earningsGrowth', 'currentRatio', 'returnOnAssets', 'numberOfAnalystOpinions', 'targetMeanPrice', 'debtToEquity', 'returnOnEquity', 'targetHighPrice', 'totalCash', 'totalDebt', 'totalRevenue', 'totalCashPerShare', 'financialCurrency', 'revenuePerShare', 'quickRatio', 'recommendationMean', 'exchange', 'shortName', 'longName', 'exchangeTimezoneName', 'exchangeTimezoneShortName', 'isEsgPopulated', 'gmtOffSetMilliseconds', 'quoteType', 'symbol', 'messageBoardId', 'market', 'annualHoldingsTurnover', 'enterpriseToRevenue', 'beta3Year', 'enterpriseToEbitda', '52WeekChange', 'morningStarRiskR

time: 3.63 s (started: 2021-11-10 13:06:53 -08:00)


### Data for Stocks in the News
Creating a data frame of price and dividend information for S&P 500 stocks in the news in the following steps:

- Create a data dictionary from the list of companies in the headlines.

- Create a data frame from data dictionary.

In [17]:
# creating empty stock info dictionary: one list per output column
stock_data = {
    'Company': [],
    'Symbol': [],
    'currentPrice': [],
    'dayHigh': [],
    'dayLow': [],
    '52wkHigh': [],
    '52wkLow': [],
    'dividendRate': []
}

# output column -> key to read from the yfinance `.info` dictionary
info_keys = {
    'currentPrice': 'currentPrice',
    'dayHigh': 'dayHigh',
    'dayLow': 'dayLow',
    '52wkHigh': 'fiftyTwoWeekHigh',
    '52wkLow': 'fiftyTwoWeekLow',
}

# loading stocks from s&p dataframe and appending data from yf
for company in companies:
    # literal substring match: the entity text is not a regular expression,
    # and the lookup only needs to run once per company
    matches = stocks_df[
        stocks_df['Company Name'].str.contains(company, regex=False, na=False)
    ]
    if matches.empty:
        continue

    # as before, the first matching row wins
    first_match = matches.iloc[0]
    symbol = first_match['Symbol']
    org_name = first_match['Company Name']

    # Build the complete record before touching `stock_data`, so a failure
    # part-way through cannot leave the column lists with unequal lengths.
    try:
        stock_info = yf.Ticker(symbol).info
        record = {'Company': org_name, 'Symbol': symbol}
        for column, key in info_keys.items():
            record[column] = stock_info[key]

        # converting a missing/None dividend to 0
        dividend = stock_info.get('dividendRate')
        record['dividendRate'] = dividend if dividend is not None else 0
    except Exception as err:
        # best effort: skip this company, but say so instead of hiding it
        print(f'Skipping {company} ({symbol}): {err!r}')
        continue

    for column, value in record.items():
        stock_data[column].append(value)

time: 6.46 s (started: 2021-11-10 13:06:57 -08:00)


In [18]:
# checking dict
# each key maps to a list; the lists are turned into data frame columns in the
# next section, so they are expected to be of equal length
stock_data

{'Company': ['Mastercard', "McDonald's"],
 'Symbol': ['MA', 'MCD'],
 'currentPrice': [357.7, 253.13],
 'dayHigh': [359.6, 253.7],
 'dayLow': [349.51, 250.81],
 '52wkHigh': [401.5, 257.53],
 '52wkLow': [312.38, 202.73],
 'dividendRate': [1.76, 5.52]}

time: 3.04 ms (started: 2021-11-10 13:07:03 -08:00)


### Data Frame of S&P 500 Stocks in the News

In [19]:
# build the frame from the column lists collected in `stock_data`,
# then preview the first rows
in_the_news = pd.DataFrame(data=stock_data)
in_the_news.head()

Unnamed: 0,Company,Symbol,currentPrice,dayHigh,dayLow,52wkHigh,52wkLow,dividendRate
0,Mastercard,MA,357.7,359.6,349.51,401.5,312.38,1.76
1,McDonald's,MCD,253.13,253.7,250.81,257.53,202.73,5.52


time: 9.05 ms (started: 2021-11-10 13:07:03 -08:00)


In [20]:
# checking Dtypes
# .info() prints each column's non-null count and dtype
in_the_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Company       2 non-null      object 
 1   Symbol        2 non-null      object 
 2   currentPrice  2 non-null      float64
 3   dayHigh       2 non-null      float64
 4   dayLow        2 non-null      float64
 5   52wkHigh      2 non-null      float64
 6   52wkLow       2 non-null      float64
 7   dividendRate  2 non-null      float64
dtypes: float64(6), object(2)
memory usage: 256.0+ bytes
time: 5.92 ms (started: 2021-11-10 13:07:03 -08:00)


## Individual Stock Price Helper Function
Checking the helper function.

In [21]:
# `h` is the project's src.helper_functions module; look up a single ticker
# NOTE(review): presumably fetches live quote data — confirm in the helper
h.prices('MMM')

Unnamed: 0,Company,Symbol,Current Price,Intraday High,Intraday Low,52wkHigh,52wkLow,Dividend
0,3M Company,MMM,182.42,182.83,180.71,208.95,163.38,5.92


time: 3.23 s (started: 2021-11-10 13:07:03 -08:00)


In [22]:
# same helper call with a different ticker symbol
h.prices('AAPL')

Unnamed: 0,Company,Symbol,Current Price,Intraday High,Intraday Low,52wkHigh,52wkLow,Dividend
0,Apple Inc.,AAPL,147.92,150.1297,147.85,157.26,112.59,0.88


time: 3.06 s (started: 2021-11-10 13:07:07 -08:00)
