# Project

## Packages

In [2]:
from bs4 import BeautifulSoup # to parse external data
import yfinance as yf
import pandas as pd # to read CSV files
import requests # to get data
import spacy # to extract entities

## RSS feeds

In [3]:
# Check if data from the page are downloadable.
# An explicit timeout is passed because requests.get() otherwise waits
# indefinitely on an unresponsive server, which would hang the notebook.
yahoo = requests.get("https://finance.yahoo.com/news/rssindex", timeout=30)
yahoo  # displays the response object; status 200 means the download succeeded

<Response [200]>

In [4]:
# Get headlines: parse the RSS payload as XML and collect every <title> tag.
# find_all() is the current Beautiful Soup 4 method name; findAll() is only the
# legacy BS3 spelling kept as a compatibility alias.
Tyahoo = BeautifulSoup(yahoo.content, features='xml')
TY = Tyahoo.find_all('title')
TY # list of <title> tags; the leading entries are the feed's own title, not headlines

[<title>Yahoo Finance</title>,
 <title>Yahoo Finance</title>,
 <title>Turkey’s lira leaps by more than 40% in a day after President Erdogan unveils unorthodox plan to lure Turks away from dollars</title>,
 <title>Bank Of America Names Top 11 Stock Picks For 2022</title>,
 <title>2 Big Dividend Stocks Yielding at Least 9%; RBC Says ‘Buy’</title>,
 <title>Transfer IRA Money to an HSA</title>,
 <title>Diversify your portfolio the right way ⁠— here are 5 assets with little connection to the stock market’s wild swings</title>,
 <title>Here Are Barron’s 10 Top Stocks for the New Year</title>,
 <title>Tesla is at risk of losing its market dominance: analyst</title>,
 <title>Super-Rich Americans Feel Relief as Tax Hikes Are Canceled for Now</title>,
 <title>Average Retirement Savings by Group</title>,
 <title>Analysts Say You Should Sell 9 Big Stock Winners Right Now</title>,
 <title>Flipping Houses for Beginners: Top 5 Mistakes</title>,
 <title>The Rise of the Semi-Retired Life</title>,
 <tit

In [5]:
# Download the WSJ Markets RSS feed; the timeout keeps a stalled server from
# hanging the notebook.
# NOTE(review): the "fbclid" query parameter looks like a leftover tracking id —
# confirm whether it can be dropped from the URL.
wsj = requests.get("https://feeds.a.dj.com/rss/RSSMarketsMain.xml?fbclid=IwAR17gY8vV2SdoTLP_35v7zGYmPireg5xIX_y1VEgPYRoXVd5jVouoKRlXAc", timeout=30)
wsj  # displays the response object; status 200 means the download succeeded

<Response [200]>

In [6]:
# Parse the WSJ feed as XML and collect every <title> tag.
# find_all() is the current Beautiful Soup 4 method name (findAll() is the legacy alias).
Twsj = BeautifulSoup(wsj.content, features='xml')
TW = Twsj.find_all('title')
TW  # list of <title> tags; the leading entries are the feed's own title, not headlines

[<title>WSJ.com: Markets</title>,
 <title>WSJ.com: Markets</title>,
 <title>NatWest Unit Pleads Guilty to Manipulating Treasury Markets</title>,
 <title>SoftBank Finalizing $4 Billion Loan From Apollo-Led Group</title>,
 <title>Cruise Industry Keeps Calm and Sails On</title>,
 <title>Food Companies Are Having Trouble Keeping Up With Inflation</title>,
 <title>Stocks Rise, Recouping Some Losses After Selloff</title>,
 <title>Micron, Nike, BlackBerry, Braze: What to Watch in the Stock Market Today</title>,
 <title>Nikola to Pay $125 Million in SEC Settlement</title>,
 <title>Property Logistics Specialist GLP Plans IPO for Investment Arm</title>,
 <title>Turkey Rolls Out Economic Rescue Plan, Reversing Lira Spiral</title>,
 <title>China's Yuan May Have Peaked</title>,
 <title>Five Big Tech Stocks Are Driving Markets. That Worries Some Investors</title>,
 <title>Wall Street Had a Red-Hot Year, But Can It Last?</title>,
 <title>China Mobile Plans to Raise $7.64 Billion Through Shanghai List

In [7]:
# Download the CNBC RSS feed; the timeout keeps a stalled server from
# hanging the notebook.
# NOTE(review): the "fbclid" query parameter looks like a leftover tracking id —
# confirm whether it can be dropped from the URL.
cnbc = requests.get("https://www.cnbc.com/id/15839135/device/rss/rss.html?fbclid=IwAR2o0zeWtmgEwZob45_F6e02pkTVo9uBGL0VI1GQv8mPyScEFY-hn9t089Y", timeout=30)
cnbc  # displays the response object; status 200 means the download succeeded

<Response [200]>

In [8]:
# Parse the CNBC feed as XML and collect every <title> tag.
# find_all() is the current Beautiful Soup 4 method name (findAll() is the legacy alias).
Tcnbc = BeautifulSoup(cnbc.content, features='xml')
TC = Tcnbc.find_all('title')
TC  # list of <title> tags; index 0 is the feed's own title, headlines start at index 1

[<title>Earnings</title>,
 <title>Nike shares rise as earnings, sales top estimates, fueled by strong North American demand</title>,
 <title>Darden shares fall as CEO announces plans to retire; Olive Garden parent raises forecast despite planned wage hike</title>,
 <title>Adobe plunges 10% and has second-worst day in past decade on weak guidance</title>,
 <title>Lowe's says pandemic-fueled home improvement demand could cool in year ahead</title>,
 <title>Lululemon earnings top estimates, but shares fall after retailer cuts forecast for Mirror sales</title>,
 <title>Oracle swings to loss because of payment tied to dispute over former CEO Hurd's employment</title>,
 <title>GameStop shares fall as video game retailer reports widening losses in third quarter</title>,
 <title>Rent the Runway posts widening losses, as subscribers have yet to return to pre-pandemic levels</title>,
 <title>Stitch Fix shares crater as retailer cuts forecast, despite topping earnings estimates</title>,
 <title>M

## Extract entities

In [14]:
# The model has to be installed once beforehand with: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm") # en_core_web_sm - pipeline used below to process the headline text (POS tags, dependencies, entities)

In [17]:
# Run the pipeline on one sample headline: element 3 of the CNBC titles
# (element 0 is the feed title, so this is not the first headline).
processed_hline = nlp(TC[3].text)
print(TC[3])
# One line per token: text, part-of-speech description, dependency description
for token in processed_hline:
    print(
        token.text,
        spacy.explain(token.pos_),
        spacy.explain(token.dep_),
        sep=" ----- ",
    )

<title>Adobe plunges 10% and has second-worst day in past decade on weak guidance</title>
Adobe ----- proper noun ----- compound
plunges ----- noun ----- None
10 ----- numeral ----- numeric modifier
% ----- noun ----- noun phrase as adverbial modifier
and ----- coordinating conjunction ----- coordinating conjunction
has ----- verb ----- conjunct
second ----- adverb ----- adverbial modifier
- ----- punctuation ----- punctuation
worst ----- adjective ----- adjectival modifier
day ----- noun ----- direct object
in ----- adposition ----- prepositional modifier
past ----- adjective ----- adjectival modifier
decade ----- noun ----- object of preposition
on ----- adposition ----- prepositional modifier
weak ----- adjective ----- adjectival modifier
guidance ----- noun ----- object of preposition


In [18]:
# See the dependencies: draw the dependency parse of the sample headline inline in the notebook.
# 'distance' is the dependency visualizer's spacing between words.
spacy.displacy.render(processed_hline, style='dep', jupyter=True, options={'distance': 110}) 

In [19]:
# Show important entities in the sample headline; tag ORG = Companies.
# No options are passed: 'distance' is a dependency-visualizer setting and has
# no effect on style='ent'.
spacy.displacy.render(processed_hline, style='ent', jupyter=True)

In [20]:
# Extract ORG entities (company names) from every CNBC title.
# Repeated mentions are kept, so a company appears once per detected mention.
companies = []
for title in TC:
    doc = nlp(title.text)
    # doc.ents yields entity spans (not single tokens); keep only organisations
    companies.extend(ent.text for ent in doc.ents if ent.label_ == 'ORG')

companies

['Nike',
 'Darden',
 'Olive Garden',
 'Lowe',
 'Mirror',
 'Runway',
 'Kay Jewelers',
 'Nordstrom',
 "Dick's Sporting Goods",
 'Macy',
 'Lowe',
 'Target',
 'Home Depot']