# Project

## Packages

In [1]:
from bs4 import BeautifulSoup # to parse external data
import yfinance as yf # financial data from Yahoo
import pandas as pd # to read CSV files
import requests # to get data
import spacy #to extraxt entities
import matplotlib.pyplot as plt

## RSS feeds

Scrapable RSS feeds must give us a successful response with HTTP code 200. Using the BeautifulSoup class, the code parses the XML document. It provides all the headlines inside Python lists. 

In [2]:
# Check if data from the page are downloadable
yahoo = requests.get("https://finance.yahoo.com/news/rssindex")
yahoo

<Response [200]>

In [3]:
# Get headlines
Tyahoo = BeautifulSoup(yahoo.content, features='xml')
TY = Tyahoo.findAll('title')
TY # list of titles

[<title>Yahoo Finance</title>,
 <title>Yahoo Finance</title>,
 <title>3 ‘Strong Buy’ Small-Cap Stocks That Are Too Cheap to Ignore</title>,
 <title>Amazon Prime Fee Rising to $180, Not $139, for Many Members</title>,
 <title>Jamie Dimon says he no longer uses the word “cryptocurrency”</title>,
 <title>This way of picking value stocks has actually worked — and Berkshire Hathaway screens the best</title>,
 <title>Five Oil Stocks At Or Near Buy Range As Oil Surges Above $90</title>,
 <title>How to Handle Taxes After AT&amp;T’s Spinoff of WarnerMedia</title>,
 <title>14 Crashing Stocks You May Own Are Already In Serious Trouble</title>,
 <title>I retired at 50, went back to work at 53, and then a medical issue left me jobless: ‘There’s no such thing as a safe amount of money’</title>,
 <title>Is Ford Stock A Buy After Earnings?</title>,
 <title>AT&amp;T’s Dividend Cut Puts It in an Unenviable Club</title>,
 <title>Amazon May Have Saved the Stock Market. But It’s Looking Even Worse for Meta

In [4]:
wsj = requests.get("https://feeds.a.dj.com/rss/RSSMarketsMain.xml?fbclid=IwAR17gY8vV2SdoTLP_35v7zGYmPireg5xIX_y1VEgPYRoXVd5jVouoKRlXAc")
wsj

<Response [200]>

In [5]:
Twsj = BeautifulSoup(wsj.content, features='xml')
TW = Twsj.findAll('title')
TW

[<title>WSJ.com: Markets</title>,
 <title>WSJ.com: Markets</title>,
 <title>Amazon Breaks Record for One-Day Gain in Market Cap</title>,
 <title>How to Prepare for Student-Loan Payments Restarting</title>,
 <title>Proxy Contest Has a Better Chance of Success if the Target CEO Is a Woman</title>,
 <title>GameStop Investors Await Riches From Short Squeeze</title>,
 <title>Rising Battery Prices Add Uncertainty to Electric-Vehicle Costs</title>,
 <title>Meta, Exxon, Snap: Stocks That Defined the Week</title>,
 <title>Amazon, Other Potential Suitors Circle Peloton</title>,
 <title>BofA Lifts CEO's Pay 31% for 2021 to $32 Million</title>,
 <title>S&amp;P 500 Rises After Amazon Earnings, Jobs Report</title>,
 <title>10-Year Treasury Yield Surges to Highest Since 2019</title>,
 <title>Kohl's Rejects $9 Billion Takeover Bid, Adopts Poison Pill</title>,
 <title>Omicron Was No Match for the Job Market</title>,
 <title>Europe's Hawkish Pivot Could Be More Bark Than Bite</title>,
 <title>You Can Ge

In [6]:
cnbc = requests.get("https://www.cnbc.com/id/15839135/device/rss/rss.html?fbclid=IwAR2o0zeWtmgEwZob45_F6e02pkTVo9uBGL0VI1GQv8mPyScEFY-hn9t089Y")
cnbc

<Response [200]>

In [7]:
Tcnbc = BeautifulSoup(cnbc.content, features='xml')
TC = Tcnbc.findAll('title')
TC

[<title>Earnings</title>,
 <title>Amazon shares soar on cloud revenue beat and huge profit gain from Rivian stake</title>,
 <title>Ford shares fall after fourth-quarter earnings significantly miss Wall Street's expectations</title>,
 <title>Oil major Shell reports sharp upswing in full-year profit, raises dividend and buybacks</title>,
 <title>Facebook shares plunge more than 20% on weak earnings, big forecast miss</title>,
 <title>Qualcomm beats expectations for revenue and earnings, sales up 30%</title>,
 <title>Spotify stock plunges on middling user growth projections</title>,
 <title>Alphabet reports big fourth-quarter beat; stock pops</title>,
 <title>AMD rises 10% after issuing strong 2022 sales outlook</title>,
 <title>GM forecasts earnings 'at or near record levels' in 2022 as chip shortage eases</title>,
 <title>Starbucks earnings miss as higher costs weigh on profits, coffee chain cuts earnings outlook</title>,
 <title>PayPal stock plunges on weak earnings guidance </title>,


## Extract entities

To extract companies' names from headlines, the works with an open-source NLP library, spaCy. The en_core_web_sm should be able to detect not only organizations in any text. In the following chunks of code, we demonstrate how it works.

In [8]:
# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm") # en_core_web_sm - basic NLP task (to process extracted text data)

In [20]:
processed_hline = nlp(TY[3].text) #Get first headline
print(TY[3])
for token in processed_hline:
    print(token.text, "-----", spacy.explain(token.pos_),'-----', spacy.explain(token.dep_)) # tokens + tags + dependencies

<title>Amazon Prime Fee Rising to $180, Not $139, for Many Members</title>
Amazon ----- proper noun ----- compound
Prime ----- proper noun ----- compound
Fee ----- proper noun ----- compound
Rising ----- proper noun ----- None
to ----- adposition ----- prepositional modifier
$ ----- symbol ----- modifier of nominal
180 ----- numeral ----- object of preposition
, ----- punctuation ----- punctuation
Not ----- particle ----- negation modifier
$ ----- symbol ----- modifier of nominal
139 ----- numeral ----- appositional modifier
, ----- punctuation ----- punctuation
for ----- adposition ----- prepositional modifier
Many ----- adjective ----- adjectival modifier
Members ----- noun ----- object of preposition


In [21]:
# See the dependencies
spacy.displacy.render(processed_hline, style='dep', jupyter=True, options={'distance': 110}) 

In [22]:
# Show important entities; tag ORG = Companies
spacy.displacy.render(processed_hline, style='ent', jupyter=True, options={'distance': 120}) 

In [23]:
# All headlines
headlines = TC + TW + TY

In [13]:
# Extract ORG from headlines
companies = []
for title in headlines:
    doc = nlp(title.text)
    for token in doc.ents:
        if token.label_ == 'ORG':
            companies.append(token.text)
        else: 
            pass
        
companies

['Amazon',
 'Ford',
 'Shell',
 'AMD',
 'GM',
 'PayPal',
 "Exxon Mobil's",
 'UBS',
 'Chevron',
 'Apple',
 'Southwest Airlines',
 "McDonald's",
 'Deutsche Bank',
 'ServiceNow',
 'Qualtrics',
 'Intel',
 'Boeing',
 'Microsoft',
 'J&J',
 'IBM',
 'American Airlines',
 'Rising Battery Prices Add Uncertainty',
 'Meta',
 'Exxon',
 'Amazon',
 'Amazon Earnings',
 'Amazon Breaks Record',
 'Treasury',
 'Yield Surges to Highest',
 "Kohl's",
 'Global Equity Stock Fund',
 'Amazon',
 'Pinterest',
 'Ford',
 'Clorox',
 'U.S. Treasury',
 'the Beijing Olympics Are Awkward for Corporate Do-Gooders',
 'The Zuckerberg Effect Fading',
 'Social Media',
 'Texas Storm Forecast Is Painful Déjà Vu',
 'Sony',
 'Nintendo',
 'Supply-Chain Pile-Up',
 'U.S. Shale Surge',
 'Amazon',
 'AT&T Spent',
 'AMC',
 'Critics',
 'Ignore',
 'BYD',
 'Intel',
 'the U.S. Shale Boom',
 'Inflation Are Sinking Solar',
 'Apple',
 'QCOM Stock',
 'Fidelity Opens Direct Indexing Option',
 'AT&T’s Dividend Cut Puts',
 'Unenviable Club',
 'Ford

## Scraping S&P500

We have obtained list of companies' names. We will need the company’s trading stock symbol to get their trading details. Since we are extracting the news mainly from the U.S. and the U.S. exchange stock market is the largest one, we are going to use an external database of S&P 500 index tracking the performance of 500 large firms listed on stock exchanges in the United States.

In [14]:
rSP = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soupSP = BeautifulSoup(rSP.text, 'lxml')
tableSP=soupSP.find('table',{'id':'constituents'}).find('tbody').findAll('tr')[1:]

stocks_df = pd.DataFrame()
for row in tableSP:
    title = row.findAll('td')[1].text.strip()
    symbol = row.findAll('td')[0].text.strip()
    sector = row.findAll('td')[3].text.strip()
    row_ = pd.Series({"Name": title, "Symbol":symbol,"Sector":sector})
    stocks_df = pd.concat([stocks_df, row_], axis=1)

stocks_df = stocks_df.T

In [15]:
stocks_df.head() 

Unnamed: 0,Name,Symbol,Sector
0,3M,MMM,Industrials
0,A. O. Smith,AOS,Industrials
0,Abbott,ABT,Health Care
0,AbbVie,ABBV,Health Care
0,Abiomed,ABMD,Health Care


For each company, we will look it up in the list of companies using pandas. Then, we will gather the stock market statistics using the Yahoo.

In [19]:
# attributes of stocks
stock_dict = {
    'Org': [],
    'Symbol': [],
    'Sector': [],
    'currentPrice': [],
    'dayHigh': [],
    'dayLow': [],
    'forwardPE': [],
    'dividendYield': []
}

# collecting information
for company in companies:
    try:
        if stocks_df['Name'].str.contains(company).sum() and yf.Ticker(stocks_df[stocks_df['Name'].\
                                str.contains(company)]['Symbol'].values[0]).info['currentPrice'] > 0:
            symbol = stocks_df[stocks_df['Name'].\
                                str.contains(company)]['Symbol'].values[0]
            org_name = stocks_df[stocks_df['Name'].\
                                str.contains(company)]['Name'].values[0]
            stock_dict['Org'].append(org_name)
            stock_dict['Symbol'].append(symbol)
            stock_info = yf.Ticker(symbol).info
            stock_dict['Sector'].append(stock_info['sector'])
            stock_dict['currentPrice'].append(stock_info['currentPrice'])
            stock_dict['dayHigh'].append(stock_info['dayHigh'])
            stock_dict['dayLow'].append(stock_info['dayLow'])
            stock_dict['forwardPE'].append(stock_info['forwardPE'])
            stock_dict['dividendYield'].append(stock_info['dividendYield'])
        else:
            pass
    except:
        pass

# dataframe
pd.DataFrame(stock_dict)


Unnamed: 0,Org,Symbol,Sector,currentPrice,dayHigh,dayLow,forwardPE,dividendYield
0,Amazon,AMZN,Consumer Cyclical,3152.79,2884.95,2766.66,61.183586,
1,Ford,F,Consumer Cyclical,17.96,20.62,19.87,9.025125,0.0165
2,AMD,AMD,Technology,123.6,124.96,118.58,37.00599,
3,GM,GM,Consumer Cyclical,51.29,51.85,50.515,7.509517,
4,PayPal,PYPL,Financial Services,126.08,126.7,121.4,23.924099,
5,Chevron,CVX,Energy,135.88,135.35,133.19,14.288118,0.0414
6,Apple,AAPL,Technology,172.39,174.0974,170.68,27.894823,0.0049
7,Southwest Airlines,LUV,Industrials,43.66,43.95,42.76,22.505154,
8,McDonald's,MCD,Consumer Cyclical,260.06,261.73,257.79,25.646942,0.0206
9,ServiceNow,NOW,Technology,577.52,582.0399,559.3,79.98892,


In [152]:
df = pd.DataFrame(stock_dict)