# Project

## Packages

In [23]:
from bs4 import BeautifulSoup # to parse external data
import yfinance as yf
import pandas as pd # to read CSV files
import requests # to get data
import spacy #to extraxt entities

## RSS feeds

In [24]:
# Check if data from the page are downloadable
yahoo = requests.get("https://finance.yahoo.com/news/rssindex")
yahoo

<Response [200]>

In [25]:
# Get headlines
Tyahoo = BeautifulSoup(yahoo.content, features='xml')
TY = Tyahoo.findAll('title')
TY # list of titles

[<title>Yahoo Finance</title>,
 <title>Yahoo Finance</title>,
 <title>Goldman Sachs sees gains of up to 60% in these 3 beaten-down stocks</title>,
 <title>Jim Cramer: The metaverse is coming — and these 4 stocks will make it real</title>,
 <title>Walmart heirs to candy conglomerates: Here are the 25 richest families in the US</title>,
 <title>Strong Insider Buying Could Indicate a Bottom in These 2 Stocks</title>,
 <title>Ray Dalio says your cash savings are not safe and will be ‘taxed by inflation’ — build a hedge with 3 alternative places to stash your money</title>,
 <title>Five stocks to buy because company insiders love them as they get hit by year-end tax-loss selling</title>,
 <title>Cathie Wood says there’s a stock bubble but it’s not in tech</title>,
 <title>I’ll have $5 million for retirement when I sell my dental practice next year — but my wife and kids don’t want me to retire</title>,
 <title>3 “Strong Buy” Stocks Raymond James Predicts Will Surge Over 50% in 2022</title>,

In [26]:
wsj = requests.get("https://feeds.a.dj.com/rss/RSSMarketsMain.xml?fbclid=IwAR17gY8vV2SdoTLP_35v7zGYmPireg5xIX_y1VEgPYRoXVd5jVouoKRlXAc")
wsj

<Response [200]>

In [27]:
Twsj = BeautifulSoup(wsj.content, features='xml')
TW = Twsj.findAll('title')
TW

[<title>WSJ.com: Markets</title>,
 <title>WSJ.com: Markets</title>,
 <title>Stocks Add Gains After S&amp;P 500, Dow Rally</title>,
 <title>China Evergrande Says State-Backed Risk Team Will Engage With Creditors</title>,
 <title>Private-Equity Boss's Win in Virginia Stokes Political Dreams on Wall Street</title>,
 <title>Turkey's Plan to Save the Lira Is a Risky Bluff</title>,
 <title>Fed Regulatory Agenda Could Swing Bank Stocks</title>,
 <title>European Banks Prepare for Pullback in ECB Stimulus</title>,
 <title>BlackBerry, Tesla, Cassava Sciences, Paychex: What to Watch When the Stock Market Opens Today</title>,
 <title>Death Business Is Booming---For Now</title>,
 <title>China's Tech-Investment Paradox</title>,
 <title>SoftBank Finalizing $4 Billion Loan From Apollo-Led Group</title>,
 <title>Five Big Tech Stocks Are Driving Markets. That Worries Some Investors</title>,
 <title>U.S. Stocks Finish Sharply Higher, Ending Losing Streak</title>,
 <title>Wall Street Had a Red-Hot Year, B

In [28]:
cnbc = requests.get("https://www.cnbc.com/id/15839135/device/rss/rss.html?fbclid=IwAR2o0zeWtmgEwZob45_F6e02pkTVo9uBGL0VI1GQv8mPyScEFY-hn9t089Y")
cnbc

<Response [200]>

In [29]:
Tcnbc = BeautifulSoup(cnbc.content, features='xml')
TC = Tcnbc.findAll('title')
TC

[<title>Earnings</title>,
 <title>Nike shares rise as earnings, sales top estimates, fueled by strong North American demand</title>,
 <title>Darden shares fall as CEO announces plans to retire; Olive Garden parent raises forecast despite planned wage hike</title>,
 <title>Adobe plunges 10% and has second-worst day in past decade on weak guidance</title>,
 <title>Lowe's says pandemic-fueled home improvement demand could cool in year ahead</title>,
 <title>Lululemon earnings top estimates, but shares fall after retailer cuts forecast for Mirror sales</title>,
 <title>Oracle swings to loss because of payment tied to dispute over former CEO Hurd's employment</title>,
 <title>GameStop shares fall as video game retailer reports widening losses in third quarter</title>,
 <title>Rent the Runway posts widening losses, as subscribers have yet to return to pre-pandemic levels</title>,
 <title>Stitch Fix shares crater as retailer cuts forecast, despite topping earnings estimates</title>,
 <title>M

## Extract entities

In [30]:
# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm") # en_core_web_sm - basic NLP task (to process extracted text data)

In [34]:
processed_hline = nlp(TC[1].text) #Get first headline
print(TC[1])
for token in processed_hline:
    print(token.text, "-----", spacy.explain(token.pos_),'-----', spacy.explain(token.dep_)) # tokens + tags + dependencies

<title>Nike shares rise as earnings, sales top estimates, fueled by strong North American demand</title>
Nike ----- proper noun ----- compound
shares ----- noun ----- nominal subject
rise ----- verb ----- None
as ----- adposition ----- prepositional modifier
earnings ----- noun ----- object of preposition
, ----- punctuation ----- punctuation
sales ----- noun ----- modifier of nominal
top ----- adjective ----- adjectival modifier
estimates ----- noun ----- noun phrase as adverbial modifier
, ----- punctuation ----- punctuation
fueled ----- verb ----- clausal modifier of noun (adjectival clause)
by ----- adposition ----- agent
strong ----- adjective ----- adjectival modifier
North ----- adjective ----- adjectival modifier
American ----- adjective ----- adjectival modifier
demand ----- noun ----- object of preposition


In [35]:
# See the dependencies
spacy.displacy.render(processed_hline, style='dep', jupyter=True, options={'distance': 110}) 

In [36]:
# Show important entities; tag ORG = Companies
spacy.displacy.render(processed_hline, style='ent', jupyter=True, options={'distance': 120}) 

In [37]:
# Extract ORG from headlines
companies = []
for title in TC:
    doc = nlp(title.text)
    for token in doc.ents:
        if token.label_ == 'ORG':
            companies.append(token.text)
        else: 
            pass
        
companies

['Nike',
 'Darden',
 'Olive Garden',
 'Lowe',
 'Mirror',
 'Runway',
 'Kay Jewelers',
 'Nordstrom',
 "Dick's Sporting Goods",
 'Macy',
 'Lowe',
 'Target',
 'Home Depot']

In [18]:
stocks_df = pd.read_csv('./SP500.csv') ## https://github.com/datasets/s-and-p-500-companies
stocks_df.head() 

Unnamed: 0,Symbol,Name,Sector
0,MMM,3M,Industrials
1,AOS,A. O. Smith,Industrials
2,ABT,Abbott Laboratories,Health Care
3,ABBV,AbbVie,Health Care
4,ABMD,Abiomed,Health Care


In [40]:
# attributes of stocks
stock_dict = {
    'Org': [],
    'Symbol': [],
    'currentPrice': [],
    'dayHigh': [],
    'dayLow': [],
    'forwardPE': [],
    'dividendYield': []
}

# collecting information
for company in companies:
    try:
        if stocks_df['Name'].str.contains(company).sum():
            symbol = stocks_df[stocks_df['Name'].\
                                str.contains(company)]['Symbol'].values[0]
            org_name = stocks_df[stocks_df['Name'].\
                                str.contains(company)]['Name'].values[0]
            stock_dict['Org'].append(org_name)
            stock_dict['Symbol'].append(symbol)
            stock_info = yf.Ticker(symbol).info
            stock_dict['currentPrice'].append(stock_info['currentPrice'])
            stock_dict['dayHigh'].append(stock_info['dayHigh'])
            stock_dict['dayLow'].append(stock_info['dayLow'])
            stock_dict['forwardPE'].append(stock_info['forwardPE'])
            stock_dict['dividendYield'].append(stock_info['dividendYield'])
        else:
            pass
    except:
        pass

# dataframe
pd.DataFrame(stock_dict)

Unnamed: 0,Org,Symbol,currentPrice,dayHigh,dayLow,forwardPE,dividendYield
0,Nike,NKE,165.2,168.9997,164.1,34.345116,0.0073
1,Darden Restaurants,DRI,144.83,146.27,143.235,17.159954,0.0308
2,Lowe's,LOW,249.63,249.789,246.14,19.351164,0.013
3,Lowe's,LOW,249.63,249.789,246.14,19.351164,0.013
4,Target Corporation,TGT,217.505,221.27,217.01,16.25598,0.0164
5,Home Depot,HD,394.98,395.67,388.8567,24.396542,0.0169
