# Project

## Packages

In [20]:
from bs4 import BeautifulSoup # to parse external data
import yfinance as yf
import pandas as pd # to read CSV files
import requests # to get data
import spacy # to extract entities

## RSS feeds

In [21]:
# Check that the Yahoo Finance RSS feed can be downloaded (expect <Response [200]>).
# requests waits indefinitely by default, so pass a timeout (in seconds) to keep
# the cell from hanging forever when the server does not answer.
yahoo = requests.get("https://finance.yahoo.com/news/rssindex", timeout=10)
yahoo

<Response [200]>

In [22]:
# Get headlines: parse the downloaded feed as XML and collect every <title> element.
# find_all is the current BeautifulSoup 4 method name; findAll is a deprecated alias.
# Note that the leading entries are the feed's own title, not article headlines.
Tyahoo = BeautifulSoup(yahoo.content, features='xml')
TY = Tyahoo.find_all('title')
TY # list of titles

[<title>Yahoo Finance</title>,
 <title>Yahoo Finance</title>,
 <title>Forgotten BlackRock ETF Posts Mystery $3.7 Billion Inflow in Day</title>,
 <title>RMDs May Soon Start Even Later for Retirement Plan Savers</title>,
 <title>Eight High Dividend Stocks You Can Count On</title>,
 <title>The Next Recession Is Coming. Here’s How to Time It.</title>,
 <title>Tech Is Still Getting Crushed. Here Are 15 Stocks to Buy in a Changed World.</title>,
 <title>Intel stock hits lowest price in more than a year, and there is another shoe to drop</title>,
 <title>Here are the odds you’ll outlive your money</title>,
 <title>Apple, QCOM Stock Among The 5 Best Tech Stocks To Buy Or Watch Now</title>,
 <title>Stocks Are on a Wild Ride. 20 Bargains to Buy Now, According to Barron’s Roundtable Experts.</title>,
 <title>A Guide to the Capital Gains Tax Rate: Short-term vs. Long-term Capital Gains Taxes</title>,
 <title>Apple could have another blowout year amid ‘scary’ demand for its products</title>,
 <titl

In [23]:
# Download the WSJ Markets RSS feed (expect <Response [200]>).
# The timeout (in seconds) prevents the cell from blocking forever on a stalled connection.
# NOTE(review): the `fbclid` query parameter looks like a click-tracking ID and is
# presumably not required by the feed — confirm and drop it.
wsj = requests.get(
    "https://feeds.a.dj.com/rss/RSSMarketsMain.xml?fbclid=IwAR17gY8vV2SdoTLP_35v7zGYmPireg5xIX_y1VEgPYRoXVd5jVouoKRlXAc",
    timeout=10,
)
wsj

<Response [200]>

In [24]:
# Parse the WSJ feed as XML and collect every <title> element.
# find_all replaces the deprecated findAll alias; the result is the same.
Twsj = BeautifulSoup(wsj.content, features='xml')
TW = Twsj.find_all('title')
TW

[<title>WSJ.com: Markets</title>,
 <title>WSJ.com: Markets</title>,
 <title>Wall Street's Green Push Exposes New Conflicts of Interest</title>,
 <title>Who Really Got Rich From the GameStop Revolution?</title>,
 <title>Whirlpool, Kroger, Apple: Stocks That Defined the Week</title>,
 <title>S&amp;P 500 Rises, Snaps Three-Week Losing Streak</title>,
 <title>Goldman Doubles David Solomon's Pay to $35 Million</title>,
 <title>Robinhood Shares Swing in Volatile Session After Earnings Miss</title>,
 <title>Chevron Still Has Gas in Its Tank</title>,
 <title>SoftBank Operating Chief Marcelo Claure to Leave After Pay Dispute</title>,
 <title>The Two Things to Do When the Stock Market Gets Crazy</title>,
 <title>Elliott Management to Sell Stake in Hong Kong Bank, Ending Activist Campaign</title>,
 <title>Steel Market Cools as Supplies Expand</title>,
 <title>Hedge Fund Melvin Lost $6.8 Billion in a Month. Winning It Back Is Taking a Lot Longer.</title>,
 <title>Are Foreign Investors in China Sec

In [25]:
# Download the CNBC earnings RSS feed (expect <Response [200]>).
# The timeout (in seconds) prevents the cell from blocking forever on a stalled connection.
# NOTE(review): the `fbclid` query parameter looks like a click-tracking ID and is
# presumably not required by the feed — confirm and drop it.
cnbc = requests.get(
    "https://www.cnbc.com/id/15839135/device/rss/rss.html?fbclid=IwAR2o0zeWtmgEwZob45_F6e02pkTVo9uBGL0VI1GQv8mPyScEFY-hn9t089Y",
    timeout=10,
)
cnbc

<Response [200]>

In [26]:
# Parse the CNBC feed as XML and collect every <title> element.
# find_all replaces the deprecated findAll alias; the result is the same.
# TC[0] is the feed's own title ("Earnings"); article headlines start at TC[1].
Tcnbc = BeautifulSoup(cnbc.content, features='xml')
TC = Tcnbc.find_all('title')
TC

[<title>Earnings</title>,
 <title>Chevron shares retreat from a record after fourth-quarter profit falls short of expectations</title>,
 <title>Atlassian jumps as quarterly results top estimates and company raises guidance for 2022 subscriptions</title>,
 <title>Apple revenue pops 11% to $123.9 billion, Cook says supply chain improving</title>,
 <title>Robinhood loses active users in the fourth quarter, forecasts weak revenue</title>,
 <title>Southwest Airlines says omicron will drive a first-quarter loss but expects 2022 profits</title>,
 <title>Comcast beats earnings expectations, but falls short on new internet customers</title>,
 <title>McDonald's earnings miss estimates as rising costs weigh on profits</title>,
 <title>Deutsche Bank posts a surprise profit on strong investment bank performance</title>,
 <title>Beaten-down cloud software stocks get boost as ServiceNow and Qualtrics top estimates</title>,
 <title>Tesla beats on earnings and revenue, says supply chain issues were 'ma

## Extract entities

In [27]:
# Prerequisite (run once in a shell): python -m spacy download en_core_web_sm
# en_core_web_sm is the pipeline used for basic NLP tasks here: it processes the
# headline text extracted from the feeds.
nlp = spacy.load("en_core_web_sm")

In [28]:
# Run the pipeline on the first article headline (TC[1]) and list, for every token,
# its text, part-of-speech tag and dependency label in human-readable form.
first_headline = TC[1]
processed_hline = nlp(first_headline.text)
print(first_headline)
for tok in processed_hline:
    pos_description = spacy.explain(tok.pos_)
    dep_description = spacy.explain(tok.dep_)
    print(tok.text, "-----", pos_description, '-----', dep_description)

<title>Chevron shares retreat from a record after fourth-quarter profit falls short of expectations</title>
Chevron ----- proper noun ----- compound
shares ----- noun ----- nominal subject
retreat ----- verb ----- None
from ----- adposition ----- prepositional modifier
a ----- determiner ----- determiner
record ----- noun ----- object of preposition
after ----- subordinating conjunction ----- marker
fourth ----- adjective ----- adjectival modifier
- ----- punctuation ----- punctuation
quarter ----- noun ----- compound
profit ----- noun ----- nominal subject
falls ----- verb ----- adverbial clause modifier
short ----- adjective ----- adverbial modifier
of ----- adposition ----- prepositional modifier
expectations ----- noun ----- object of preposition


In [29]:
# Draw the dependency tree of the processed headline inline in the notebook.
spacy.displacy.render(
    processed_hline,
    style='dep',
    jupyter=True,
    options=dict(distance=110),
)

In [30]:
# Highlight the named entities of the processed headline; the ORG label marks companies.
# NOTE(review): 'distance' is an option of the dependency visualizer and is presumably
# ignored by the 'ent' style — confirm before relying on it.
spacy.displacy.render(
    processed_hline,
    style='ent',
    jupyter=True,
    options=dict(distance=120),
)

In [31]:
# Collect the text of every entity labelled ORG across all CNBC titles,
# keeping headline order and duplicates.
companies = [
    entity.text
    for headline in TC
    for entity in nlp(headline.text).ents
    if entity.label_ == 'ORG'
]

companies

['Chevron',
 'Apple',
 'Southwest Airlines',
 "McDonald's",
 'Deutsche Bank',
 'ServiceNow',
 'Qualtrics',
 'Intel',
 'Boeing',
 'Microsoft',
 'J&J',
 'IBM',
 'American Airlines',
 'United',
 'P&G',
 'Morgan Stanley',
 'Bank of America',
 'Goldman',
 "Wells Fargo's",
 'Citigroup',
 'JPMorgan',
 'Apple',
 'TSMC',
 "Bed Bath & Beyond's",
 'Nike']

In [32]:
# Load the S&P 500 constituents table and preview its first five rows.
# Data source: https://github.com/datasets/s-and-p-500-companies
stocks_df = pd.read_csv(filepath_or_buffer='./SP500.csv')
stocks_df.head(n=5)

Unnamed: 0,Name,Symbol,Sector
0,3M,MMM,Industrials
1,A. O. Smith,AOS,Industrials
2,Abbott,ABT,Health Care
3,AbbVie,ABBV,Health Care
4,Abiomed,ABMD,Health Care


In [33]:
# Attributes read from the yfinance `info` mapping for every matched stock.
info_fields = ['currentPrice', 'dayHigh', 'dayLow', 'forwardPE', 'dividendYield']

# Column-oriented store: one list per output column, all kept at equal length.
stock_dict = {column: [] for column in ['Org', 'Symbol', *info_fields]}

# Collect information for each company name found in the headlines.
for company in companies:
    # Match the entity text literally (regex=False) so characters such as '&', '.'
    # or '(' are not interpreted as a pattern, and treat missing names as non-matches
    # (na=False) so the mask stays purely boolean.
    name_mask = stocks_df['Name'].str.contains(company, regex=False, na=False)
    if not name_mask.any():
        continue  # not an S&P 500 constituent (or spelled differently): skip

    # Several names may contain the entity text; keep the first one, as before.
    first_match = stocks_df[name_mask].iloc[0]
    symbol = first_match['Symbol']

    try:
        stock_info = yf.Ticker(symbol).info
    except Exception as err:
        # Best effort: one failed lookup must not abort the whole loop,
        # but it should be visible instead of being swallowed silently.
        print(f"Skipping {symbol} ({company}): {err!r}")
        continue

    # Build the complete row first and append it in one go, so a missing attribute
    # can never leave the column lists with different lengths.
    row = {'Org': first_match['Name'], 'Symbol': symbol}
    for field in info_fields:
        row[field] = stock_info.get(field)  # missing attribute -> empty cell
    for column, value in row.items():
        stock_dict[column].append(value)

# dataframe
pd.DataFrame(stock_dict)


Unnamed: 0,Org,Symbol,currentPrice,dayHigh,dayLow,forwardPE,dividendYield
0,Chevron,CVX,130.61,132.06,128.07,12.996019,0.0435
1,Apple,AAPL,170.33,170.35,162.8,27.472582,0.0052
2,Southwest Airlines,LUV,42.93,42.96,40.63,11.99162,
3,McDonald's,MCD,256.09,256.3453,247.16,23.07117,0.0222
4,ServiceNow,NOW,561.08,563.4979,507.8,60.20172,
5,Intel,INTC,47.73,48.2,46.3,12.830645,0.0291
6,Boeing,BA,190.57,190.72,183.77,25.308102,
7,Microsoft,MSFT,308.26,308.48,294.45,28.809347,0.008
8,IBM,IBM,134.5,134.53,131.79,12.858508,0.0495
9,American Airlines Group,AAL,15.64,15.775,14.905,7.629269,
