# Project

## Packages

In [1]:
from bs4 import BeautifulSoup # to parse external data
import yfinance as yf
import pandas as pd # to read CSV files
import requests # to get data
import spacy #to extraxt entities
import matplotlib.pyplot as plt

## RSS feeds

In [2]:
# Check if data from the page are downloadable
yahoo = requests.get("https://finance.yahoo.com/news/rssindex")
yahoo

<Response [200]>

In [3]:
# Get headlines
Tyahoo = BeautifulSoup(yahoo.content, features='xml')
TY = Tyahoo.findAll('title')
TY # list of titles

[<title>Yahoo Finance</title>,
 <title>Yahoo Finance</title>,
 <title>No, the US doesn’t have $30 trillion in debt</title>,
 <title>2 “Strong Buy” Stocks Flashing Signs of Strong Insider Buying</title>,
 <title>‘We do not want him to receive anything from our estate’: How do we ensure our son-in-law does not get his hands on our money?</title>,
 <title>How to Avoid Capital Gains Tax on Real Estate</title>,
 <title>‘No, no, no, no, no!’ My wife and I are close to retirement, but we want to buy a house. Should I empty my 401(k) for the down payment?</title>,
 <title>Amazon stock soars 15% after earnings, will hike Prime membership fee</title>,
 <title>AT&amp;T’s Dividend Cut Puts It in an Unenviable Club</title>,
 <title>Facebook wasn’t Thursday’s only big loser — these 16 other Nasdaq-100 stocks dropped at least 5%</title>,
 <title>I inherited ‘a sizeable amount’ from my mother.  A financial adviser took me out for a free meal at an investment seminar, and made ‘some good, interesting p

In [4]:
wsj = requests.get("https://feeds.a.dj.com/rss/RSSMarketsMain.xml?fbclid=IwAR17gY8vV2SdoTLP_35v7zGYmPireg5xIX_y1VEgPYRoXVd5jVouoKRlXAc")
wsj

<Response [200]>

In [5]:
Twsj = BeautifulSoup(wsj.content, features='xml')
TW = Twsj.findAll('title')
TW

[<title>WSJ.com: Markets</title>,
 <title>WSJ.com: Markets</title>,
 <title>Europe's Hawkish Pivot Could Be More Bark Than Bite</title>,
 <title>Stocks Waver After Jobs Report, Tech Selloff</title>,
 <title>Omicron Was No Match for the Job Market</title>,
 <title>You Can Get Crypto Right and Still Play It Wrong</title>,
 <title>Investing in a Global Equity Stock Fund? Find Out Where the Fund Manager Is From</title>,
 <title>Amazon Shares Surge After Bumper Earnings</title>,
 <title>Snap, Amazon, Pinterest, Ford, Clorox: What to Watch in the Stock Market Today</title>,
 <title>Bond Yields Surge in Europe as Old Jitters Return</title>,
 <title>U.S. Treasury Yields Climb After Strong Jobs Report</title>,
 <title>Saudi Aramco Looks to Sell $50 Billion Stake in Fresh Share Listing</title>,
 <title>Why the Beijing Olympics Are Awkward for Corporate Do-Gooders</title>,
 <title>The Zuckerberg Effect Fading in Social Media</title>,
 <title>Texas Storm Forecast Is Painful Déjà Vu</title>,
 <titl

In [6]:
cnbc = requests.get("https://www.cnbc.com/id/15839135/device/rss/rss.html?fbclid=IwAR2o0zeWtmgEwZob45_F6e02pkTVo9uBGL0VI1GQv8mPyScEFY-hn9t089Y")
cnbc

<Response [200]>

In [7]:
Tcnbc = BeautifulSoup(cnbc.content, features='xml')
TC = Tcnbc.findAll('title')
TC

[<title>Earnings</title>,
 <title>Amazon shares soar on cloud revenue beat and huge profit gain from Rivian stake</title>,
 <title>Ford shares fall after fourth-quarter earnings significantly miss Wall Street's expectations</title>,
 <title>Oil major Shell reports sharp upswing in full-year profit, raises dividend and buybacks</title>,
 <title>Facebook shares plunge more than 20% on weak earnings, big forecast miss</title>,
 <title>Qualcomm beats expectations for revenue and earnings, sales up 30%</title>,
 <title>Spotify stock plunges on middling user growth projections</title>,
 <title>Alphabet reports big fourth-quarter beat; stock pops</title>,
 <title>AMD rises 10% after issuing strong 2022 sales outlook</title>,
 <title>GM forecasts earnings 'at or near record levels' in 2022 as chip shortage eases</title>,
 <title>Starbucks earnings miss as higher costs weigh on profits, coffee chain cuts earnings outlook</title>,
 <title>PayPal stock plunges on weak earnings guidance </title>,


## Extract entities

In [8]:
# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm") # en_core_web_sm - basic NLP task (to process extracted text data)

In [9]:
processed_hline = nlp(TC[1].text) #Get first headline
print(TC[1])
for token in processed_hline:
    print(token.text, "-----", spacy.explain(token.pos_),'-----', spacy.explain(token.dep_)) # tokens + tags + dependencies

<title>Amazon shares soar on cloud revenue beat and huge profit gain from Rivian stake</title>
Amazon ----- proper noun ----- compound
shares ----- noun ----- nominal subject
soar ----- verb ----- None
on ----- adposition ----- prepositional modifier
cloud ----- noun ----- compound
revenue ----- noun ----- compound
beat ----- noun ----- object of preposition
and ----- coordinating conjunction ----- coordinating conjunction
huge ----- adjective ----- adjectival modifier
profit ----- noun ----- compound
gain ----- noun ----- conjunct
from ----- adposition ----- prepositional modifier
Rivian ----- adjective ----- adjectival modifier
stake ----- noun ----- object of preposition


In [10]:
# See the dependencies
spacy.displacy.render(processed_hline, style='dep', jupyter=True, options={'distance': 110}) 

In [11]:
# Show important entities; tag ORG = Companies
spacy.displacy.render(processed_hline, style='ent', jupyter=True, options={'distance': 120}) 

In [12]:
# All headlines
headlines = TC + TW + TY

In [13]:
# Extract ORG from headlines
companies = []
for title in headlines:
    doc = nlp(title.text)
    for token in doc.ents:
        if token.label_ == 'ORG':
            companies.append(token.text)
        else: 
            pass
        
companies

['Amazon',
 'Ford',
 'Shell',
 'AMD',
 'GM',
 'PayPal',
 "Exxon Mobil's",
 'UBS',
 'Chevron',
 'Apple',
 'Southwest Airlines',
 "McDonald's",
 'Deutsche Bank',
 'ServiceNow',
 'Qualtrics',
 'Intel',
 'Boeing',
 'Microsoft',
 'J&J',
 'IBM',
 'American Airlines',
 'Global Equity Stock Fund',
 'Amazon Shares Surge After Bumper Earnings',
 'Amazon',
 'Pinterest',
 'Ford',
 'Clorox',
 'Bond Yields Surge',
 'U.S. Treasury',
 'the Beijing Olympics Are Awkward for Corporate Do-Gooders',
 'The Zuckerberg Effect Fading',
 'Social Media',
 'Texas Storm Forecast Is Painful Déjà Vu',
 'Sony',
 'Nintendo',
 'Supply-Chain Pile-Up',
 'Ford',
 'GM Trade Places',
 'PayPal Earnings Send Shudders Through Stock Market',
 'Amazon Flexes Its Pricing Muscles',
 'Amazon',
 'AT&T’s Dividend Cut Puts',
 'Unenviable Club',
 'AMC',
 'Shell',
 'PayPal Shuts',
 'ARK',
 'Ford Stock Falls After Earnings',
 'Ignore',
 'PayPal',
 'Ford',
 'Meta',
 'Meta',
 'Nokia Reinstates Dividend',
 'Sets New',
 'India’s',
 'Adani',


## Scraping S&P500

In [14]:
rSP = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soupSP = BeautifulSoup(rSP.text, 'lxml')
tableSP=soupSP.find('table',{'id':'constituents'}).find('tbody').findAll('tr')[1:]

stocks_df = pd.DataFrame()
for row in tableSP:
    title = row.findAll('td')[1].text.strip()
    symbol = row.findAll('td')[0].text.strip()
    sector = row.findAll('td')[3].text.strip()
    row_ = pd.Series({"Name": title, "Symbol":symbol,"Sector":sector})
    stocks_df = pd.concat([stocks_df, row_], axis=1)

stocks_df = stocks_df.T

In [15]:
stocks_df.head() 

Unnamed: 0,Name,Symbol,Sector
0,3M,MMM,Industrials
0,A. O. Smith,AOS,Industrials
0,Abbott,ABT,Health Care
0,AbbVie,ABBV,Health Care
0,Abiomed,ABMD,Health Care


In [16]:
# attributes of stocks
stock_dict = {
    'Org': [],
    'Symbol': [],
    'Sector': [],
    'currentPrice': [],
    'dayHigh': [],
    'dayLow': [],
    'forwardPE': [],
    'dividendYield': []
}

# collecting information
for company in companies:
    try:
        if stocks_df['Name'].str.contains(company).sum():
            symbol = stocks_df[stocks_df['Name'].\
                                str.contains(company)]['Symbol'].values[0]
            org_name = stocks_df[stocks_df['Name'].\
                                str.contains(company)]['Name'].values[0]
            stock_dict['Org'].append(org_name)
            stock_dict['Symbol'].append(symbol)
            stock_info = yf.Ticker(symbol).info
            stock_dict['Sector'].append(stock_info['sector'])
            stock_dict['currentPrice'].append(stock_info['currentPrice'])
            stock_dict['dayHigh'].append(stock_info['dayHigh'])
            stock_dict['dayLow'].append(stock_info['dayLow'])
            stock_dict['forwardPE'].append(stock_info['forwardPE'])
            stock_dict['dividendYield'].append(stock_info['dividendYield'])
        else:
            pass
    except:
        pass

# dataframe
pd.DataFrame(stock_dict)


Unnamed: 0,Org,Symbol,Sector,currentPrice,dayHigh,dayLow,forwardPE,dividendYield
0,Amazon,AMZN,Consumer Cyclical,2776.91,2884.95,2766.66,53.88919,
1,Ford,F,Consumer Cyclical,19.885,20.62,19.87,9.992462,0.0165
2,AMD,AMD,Technology,120.08,125.37,118.816,35.9521,
3,GM,GM,Consumer Cyclical,53.11,54.42,52.55,7.775989,
4,PayPal,PYPL,Financial Services,124.3,131.45,123.85,23.586338,
5,Chevron,CVX,Energy,134.17,135.35,133.19,14.108307,0.0414
6,Apple,AAPL,Technology,172.9,176.2399,172.12,27.977346,0.0049
7,Southwest Airlines,LUV,Industrials,43.44,44.4,43.33,22.39175,
8,McDonald's,MCD,Consumer Cyclical,260.64,262.786,260.09,25.704144,0.0206
9,ServiceNow,NOW,Technology,560.85,582.0399,559.48,77.68005,


In [152]:
df = pd.DataFrame(stock_dict)