In [1]:
%load_ext autoreload
%load_ext autotime
%autoreload 2

time: 619 µs (started: 2021-11-03 14:42:56 -07:00)


In [2]:
import requests, math, sys, os

#  setting path

gparent = os.path.join(os.pardir)
sys.path.append(gparent)

from bs4 import BeautifulSoup
import spacy
import yfinance as yf
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import time

from src import helper_functions as h

time: 2.54 s (started: 2021-11-03 14:42:56 -07:00)


## requests: Grabbing Headlines

In [3]:
# grabbing headlines
resp = requests.get("http://feeds.marketwatch.com/marketwatch/topstories/")
resp

<Response [200]>

time: 392 ms (started: 2021-11-03 14:42:58 -07:00)


## bs4: Saving Headlines to a List

In [4]:
# saving headlines to a list
soup = BeautifulSoup(resp.content, features='lxml')
headlines = soup.findAll('title')
headlines

[<title>MarketWatch.com - Top Stories</title>,
 <title>MarketWatch.com - Top Stories</title>,
 <title>Dow Jones Newswires: Hyatt Hotels Q3 revenue more than doubles</title>,
 <title>The Moneyist: ‘I just feel like I’m behind’: I’m 29 years old. I have $4,000 in the bank and a $20,000 emergency fund. Am I doing OK financially?</title>,
 <title>Earnings Results: Qorvo stock rocked by weak holiday forecast due to semiconductor shortage</title>,
 <title>The Moneyist: ‘I believe my ex-husband influenced my father to write me out of his will. He’s a narcissist and has turned everyone against me. Should I contest it?’</title>,
 <title>The Moneyist: I rented a 3-bedroom apartment for $1,500 and charged my roommates $800 per room. I got a $100 cut each month. Was I wrong?</title>,
 <title>Earnings Results: Take-Two stock ticks higher after bookings forecast gets a boost</title>,
 <title>NewsWatch: Why the Fed’s long-awaited taper announcement isn’t rattling the stock market</title>,
 <title>Ear

time: 7.48 ms (started: 2021-11-03 14:42:59 -07:00)


## spaCy: Tests
Testing things on a single string.

### Loading Model

In [5]:
# loading spacy model
nlp = spacy.load('en_core_web_sm')

time: 495 ms (started: 2021-11-03 14:42:59 -07:00)


### Saving Headline String

In [6]:
# grabbing test headline string
processed_hline = nlp(headlines[2].text)
processed_hline

Dow Jones Newswires: Hyatt Hotels Q3 revenue more than doubles

time: 10.8 ms (started: 2021-11-03 14:42:59 -07:00)


### Tokenizing String

In [7]:
# checking the test case
processed_hline = nlp(headlines[2].text)
print(headlines[2])
for token in processed_hline:
  print(token)

<title>Dow Jones Newswires: Hyatt Hotels Q3 revenue more than doubles</title>
Dow
Jones
Newswires
:
Hyatt
Hotels
Q3
revenue
more
than
doubles
time: 7.79 ms (started: 2021-11-03 14:42:59 -07:00)


## Getting Org Names From Headlines

In [8]:
# pulling company name tokens from headlines
companies = []
for title in headlines:
    doc = nlp(title.text)
    for token in doc.ents:
        if token.label_ == 'ORG':
            companies.append(token.text)
        else:
            pass
companies = set(companies)
companies

{'Fed', 'Fisker'}

time: 94.7 ms (started: 2021-11-03 14:42:59 -07:00)


## Scraping S&P 500 w/BeautifulSoup

In [9]:
# scraping S&P wikipedia page
resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

# parsing the html
soup = BeautifulSoup(resp.text, 'lxml')

# extracting the table
table = soup.find('table', {'class': 'wikitable sortable'})

# checking row with first stock
print(table.findAll('tr')[1:2])

[<tr>
<td><a class="external text" href="https://www.nyse.com/quote/XNYS:MMM" rel="nofollow">MMM</a>
</td>
<td><a href="/wiki/3M" title="3M">3M</a></td>
<td><a class="external text" href="https://www.sec.gov/cgi-bin/browse-edgar?CIK=MMM&amp;action=getcompany" rel="nofollow">reports</a></td>
<td>Industrials</td>
<td>Industrial Conglomerates</td>
<td><a href="/wiki/Saint_Paul,_Minnesota" title="Saint Paul, Minnesota">Saint Paul, Minnesota</a></td>
<td>1976-08-09</td>
<td>0000066740</td>
<td>1902
</td></tr>]
time: 618 ms (started: 2021-11-03 14:43:00 -07:00)


### Ticker Symbols

In [10]:
# making list of symbols
symbols = [row.findAll('td')[0].text for row in table.findAll('tr')[1:]]

# stripping whitespace
symbols = list(map(lambda s: s.strip(), symbols))

# checking first 5 symbols
symbols[:5]

['MMM', 'ABT', 'ABBV', 'ABMD', 'ACN']

time: 17.1 ms (started: 2021-11-03 14:43:00 -07:00)


### Names

In [11]:
# making list of names
names = [row.findAll('td')[1].text for row in table.findAll('tr')[1:]]

# stripping whitespace
names = list(map(lambda n: n.strip(), names))

# checking first five names
names[:5]

['3M', 'Abbott Laboratories', 'AbbVie', 'Abiomed', 'Accenture']

time: 15.8 ms (started: 2021-11-03 14:43:00 -07:00)


### Industries

In [12]:
# making list of industries

industries = [row.findAll('td')[4].text for row in table.findAll('tr')[1:]]

# stripping whitespace
industries = list(map(lambda i: i.strip(), industries))

# checking first five industries
industries[:5]

['Industrial Conglomerates',
 'Health Care Equipment',
 'Pharmaceuticals',
 'Health Care Equipment',
 'IT Consulting & Other Services']

time: 16.3 ms (started: 2021-11-03 14:43:00 -07:00)


## S&P 500 Dataframe

In [13]:
# making data dictionary
data = {'Company Name': names, 'Symbol': symbols, 'Industry': industries}

# creating data frame
stocks_df = pd.DataFrame.from_dict(data)

# checking shape and first five rows
print(stocks_df.shape)
stocks_df.head()

(505, 3)


Unnamed: 0,Company Name,Symbol,Industry
0,3M,MMM,Industrial Conglomerates
1,Abbott Laboratories,ABT,Health Care Equipment
2,AbbVie,ABBV,Pharmaceuticals
3,Abiomed,ABMD,Health Care Equipment
4,Accenture,ACN,IT Consulting & Other Services


time: 9.8 ms (started: 2021-11-03 14:43:00 -07:00)


## Checking yf Stock Info Dictionary Keys

In [14]:
# instantiating a ticker object
ACN = h.get_info('ACN')

time: 3.6 ms (started: 2021-11-03 14:43:00 -07:00)


In [15]:
# checking keys
ACN.info.keys()

dict_keys(['zip', 'sector', 'fullTimeEmployees', 'longBusinessSummary', 'city', 'phone', 'country', 'companyOfficers', 'website', 'maxAge', 'address1', 'fax', 'industry', 'address2', 'ebitdaMargins', 'profitMargins', 'grossMargins', 'operatingCashflow', 'revenueGrowth', 'operatingMargins', 'ebitda', 'targetLowPrice', 'recommendationKey', 'grossProfits', 'freeCashflow', 'targetMedianPrice', 'currentPrice', 'earningsGrowth', 'currentRatio', 'returnOnAssets', 'numberOfAnalystOpinions', 'targetMeanPrice', 'debtToEquity', 'returnOnEquity', 'targetHighPrice', 'totalCash', 'totalDebt', 'totalRevenue', 'totalCashPerShare', 'financialCurrency', 'revenuePerShare', 'quickRatio', 'recommendationMean', 'exchange', 'shortName', 'longName', 'exchangeTimezoneName', 'exchangeTimezoneShortName', 'isEsgPopulated', 'gmtOffSetMilliseconds', 'quoteType', 'symbol', 'messageBoardId', 'market', 'annualHoldingsTurnover', 'enterpriseToRevenue', 'beta3Year', 'enterpriseToEbitda', '52WeekChange', 'morningStarRiskR

time: 3.07 s (started: 2021-11-03 14:43:01 -07:00)


### Data for Stocks in the News

In [16]:
# creating empty stock info dictionary
stock_data = {
    'Org': [],
    'Symbol': [],
    'currentPrice': [],
    'dayHigh': [],
    'dayLow': [],
    '52wkHigh': [],
    '52wkLow': [],
    'dividendRate': []
    
}


# loading stocks from s&p dataframe and appending data from yf
for company in companies:

    try:
        if stocks_df['Company Name'].str.contains(company).sum():
            symbol = stocks_df[stocks_df['Company Name'].\
                                str.contains(company)]['Symbol'].values[0]
            org_name = stocks_df[stocks_df['Company Name'].\
                                str.contains(company)]['Company Name'].values[0]
            stock_data['Org'].append(org_name)
            stock_data['Symbol'].append(symbol)
            stock_info = yf.Ticker(symbol).info
            stock_data['currentPrice'].append(stock_info['currentPrice'])
            stock_data['dayHigh'].append(stock_info['dayHigh'])
            stock_data['dayLow'].append(stock_info['dayLow'])
            stock_data['52wkHigh'].append(stock_info['fiftyTwoWeekHigh'])
            stock_data['52wkLow'].append(stock_info['fiftyTwoWeekLow'])            
            
            # converting dividend None types to floats
            dividend = stock_info['dividendRate']
            if dividend != None:
                dividend = dividend
            else:
                dividend = 0
            stock_data['dividendRate'].append(dividend)
        else:
            pass
    except:
        pass

time: 2.95 s (started: 2021-11-03 14:43:04 -07:00)


In [17]:
# checking dict
stock_data

{'Org': ['Federal Realty Investment Trust'],
 'Symbol': ['FRT'],
 'currentPrice': [128.46],
 'dayHigh': [128.86],
 'dayLow': [124.58],
 '52wkHigh': [128.86],
 '52wkLow': [67.9],
 'dividendRate': [4.28]}

time: 1.98 ms (started: 2021-11-03 14:43:07 -07:00)


### Data Frame of S&P 500 Stocks in the News

In [18]:
in_the_news = pd.DataFrame(stock_data)
in_the_news.head()

Unnamed: 0,Org,Symbol,currentPrice,dayHigh,dayLow,52wkHigh,52wkLow,dividendRate
0,Federal Realty Investment Trust,FRT,128.46,128.86,124.58,128.86,67.9,4.28


time: 7.34 ms (started: 2021-11-03 14:43:07 -07:00)


In [19]:
# checking Dtypes
in_the_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Org           1 non-null      object 
 1   Symbol        1 non-null      object 
 2   currentPrice  1 non-null      float64
 3   dayHigh       1 non-null      float64
 4   dayLow        1 non-null      float64
 5   52wkHigh      1 non-null      float64
 6   52wkLow       1 non-null      float64
 7   dividendRate  1 non-null      float64
dtypes: float64(6), object(2)
memory usage: 192.0+ bytes
time: 6.42 ms (started: 2021-11-03 14:43:07 -07:00)
