# Webscraping from Bloomberg.

This notebook scrapes data from bloomberg using Selenium

\* requests + BeautifulSoup doesn't work

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

In [107]:
url = "https://www.bloomberg.com/news/articles/2024-08-06/stock-market-today-dow-s-p-live-updates?embedded-checkout=true"

In [109]:
driver = webdriver.Chrome()
raw_html = driver.get(url)

#### Find the header wrapper

In [110]:
header_wrapper = driver.find_elements(By.CLASS_NAME, "basicHeader_headAndDek__1QUPg")
header_wrapper = header_wrapper[0] if header_wrapper else header_wrapper

#### Find the headline.

In [87]:
h1_elem = header_wrapper.find_elements(By.TAG_NAME, "h1")
news_headline = h1_elem[0].text

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=126.0.6478.127)
Stacktrace:
	GetHandleVerifier [0x00007FF615A0EEB2+31554]
	(No symbol) [0x00007FF615987EE9]
	(No symbol) [0x00007FF61584872A]
	(No symbol) [0x00007FF61581D995]
	(No symbol) [0x00007FF6158C44D7]
	(No symbol) [0x00007FF6158DC051]
	(No symbol) [0x00007FF6158BCDD3]
	(No symbol) [0x00007FF61588A33B]
	(No symbol) [0x00007FF61588AED1]
	GetHandleVerifier [0x00007FF615D18B2D+3217341]
	GetHandleVerifier [0x00007FF615D65AF3+3532675]
	GetHandleVerifier [0x00007FF615D5B0F0+3489152]
	GetHandleVerifier [0x00007FF615ABE786+750614]
	(No symbol) [0x00007FF61599376F]
	(No symbol) [0x00007FF61598EB24]
	(No symbol) [0x00007FF61598ECB2]
	(No symbol) [0x00007FF61597E17F]
	BaseThreadInitThunk [0x00007FFB81AD7374+20]
	RtlUserThreadStart [0x00007FFB83A1CC91+33]


#### Find the subheadlines.

In [80]:
header_wrapper.find_elements(By.TAG_NAME, "li")[0].text

'High-grade issuers offer $31.8 billion of bonds, most in 2024'

#### Find the article time.

In [51]:
times = driver.find_elements(By.TAG_NAME, "time")
[time.text for time in times]

['August 7, 2024 at 6:38 AM GMT+8', 'August 8, 2024 at 5:47 AM GMT+8']

#### Close the driver or else bloomberg will become sus of you if you open it again.

In [7]:
driver.close()

### Wrap into a function.

In [88]:
def get_headline(date: str) -> dict:
    """Get news headline and date from Bloomberg for a particular date.

    Args:
        date (str): date string to put in the URL

    Returns:
        dict: contains headline and time of the article.
    """
    res = {}
    url = f"https://www.bloomberg.com/news/articles/{date}/stock-market-today-dow-s-p-live-updates?embedded-checkout=true"
    driver = webdriver.Chrome()
    driver.get(url)

    header_wrapper_list: list = driver.find_elements(By.CLASS_NAME, "basicHeader_headAndDek__1QUPg") 
    header_wrapper = header_wrapper_list[0] if len(header_wrapper_list) > 0 else None
    
    if header_wrapper:
        h1_elem: list = header_wrapper.find_elements(By.TAG_NAME, "h1")
        news_headline = h1_elem[0].text if len(h1_elem) > 0 else ""
        res['headline'] = news_headline

        subheaders: list = header_wrapper.find_elements(By.TAG_NAME, "li")
        subheader_texts = [x.text for x in subheaders]
        res['subheaders'] = subheader_texts

    time_elem: list = driver.find_elements(By.TAG_NAME, "time")
    time = time_elem[0].text if len(time_elem) > 0 else ""
    res['time'] = time

    driver.close()

    return res

In [82]:
sample_date = '2024-08-05'
print(get_headline(sample_date))

{'headline': 'S&P 500 Climbs 1% After Wall Street’s Wild Rout: Markets Wrap', 'time': 'August 6, 2024 at 7:25 AM GMT+8', 'subheaders': ['Goldman says buying S&P after 5% drop is usually profitable', 'US Treasuries slip as waning haven bid smooths auction result']}


## Get SPY ticker so that we can get the trading dates

In [10]:
import yfinance as yf
import pandas as pd

In [103]:
spy_ticker = yf.Ticker('SPY')
SPY_prices = spy_ticker.history(period='1y')

len(SPY_prices)

252

### Test out by iterating through the trading dates

In [111]:
for index, (i, row) in enumerate(SPY_prices.iterrows()):
    try:
        formatted_date = dt.strftime(i, format)
        retrieved_data = get_headline(formatted_date)

        SPY_prices.at[i, 'headline'] = retrieved_data.get('headline', pd.NA)
        SPY_prices.at[i, 'article_time'] = retrieved_data.get('time', pd.NA)
        SPY_prices.at[i, 'subheaders'] = retrieved_data.get('subheaders', pd.NA)
    except Exception as e:
        continue

In [113]:
SPY_prices

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,headline,article_time,subheaders
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-08-31 00:00:00-04:00,445.419720,446.583435,443.950283,444.137665,66084600,0.0,0.0,0.0,Stock Rally Wanes at End of Best Week Since Ju...,"September 1, 2023 at 6:22 AM GMT+8",
2023-09-01 00:00:00-04:00,446.918793,447.411896,443.476915,444.966095,58875700,0.0,0.0,0.0,,,
2023-09-05 00:00:00-04:00,444.512435,444.837870,442.973957,443.042969,55166200,0.0,0.0,0.0,Stocks Retreat After Hot ISM Fuels Fed-Hike Wa...,"September 6, 2023 at 6:15 AM GMT+8","[US service gauge rises to six-month high, top..."
2023-09-06 00:00:00-04:00,442.214557,442.323055,437.687877,440.064636,70758500,0.0,0.0,0.0,Traders Shun Risk Amid Apple’s $190 Billion Ro...,"September 7, 2023 at 6:39 AM GMT+8",[Apple extends two-day drop on concern over Ch...
2023-09-07 00:00:00-04:00,436.997513,439.403857,436.642494,438.713531,70355400,0.0,0.0,0.0,Dollar Bulls Vindicated in Longest Run Since 2...,"September 8, 2023 at 6:27 AM GMT+8",[Greenback posts eighth straight up week amid ...
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-26 00:00:00-04:00,563.179993,563.909973,559.049988,560.789978,35788600,0.0,0.0,0.0,"Stocks Churn as Nvidia’s 1,000% Rally to Face ...","August 27, 2024 at 6:36 AM GMT+8",[Chipmaker is due to report results after the ...
2024-08-27 00:00:00-04:00,559.489990,562.059998,558.320007,561.559998,32693900,0.0,0.0,0.0,Tech Hit in Late Hours as Nvidia Fails to Insp...,"August 28, 2024 at 6:28 AM GMT+8",[Chipmaker’s revenue forecast fails to meet lo...
2024-08-28 00:00:00-04:00,561.210022,561.650024,555.039978,558.299988,41066000,0.0,0.0,0.0,"S&P 500 Rally Loses Steam, But Most US Stocks ...","August 29, 2024 at 6:58 AM GMT+8",[US economy expands at revised 3% rate on resi...
2024-08-29 00:00:00-04:00,560.309998,563.679993,557.179993,558.349976,38715200,0.0,0.0,0.0,S&P 500 Spikes in Last 10 Minutes of US Tradin...,"August 30, 2024 at 6:47 AM GMT+8",[Fed favored inflation gauge’s mild gain sets ...


In [114]:
SPY_prices.to_csv("scraped_headlines.csv")

In [70]:
for i in input_dates:
    print(i, get_headline(i))

2024-08-06 {'headline': 'Stocks Swoon After Weak $42 Billion Treasury Sale: Markets Wrap', 'time': 'August 7, 2024 at 6:38 AM GMT+8'}
2024-08-07 {'headline': 'S&P 500 Notches Biggest Rally Since November 2022: Markets Wrap', 'time': 'August 8, 2024 at 6:19 AM GMT+8'}
2024-08-08 {'headline': 'Stocks Make Comeback at End of Dizzying Week: Markets Wrap', 'time': 'August 9, 2024 at 6:20 AM GMT+8'}
2024-08-09 {'headline': 'Our apologies\nWe’re unable to find the\npage you’re looking for.', 'time': ''}
2024-08-12 {'headline': 'Stocks Rally in Countdown to CPI as US Yields Sink: Markets Wrap', 'time': 'August 13, 2024 at 6:48 AM GMT+8'}
2024-08-13 {'headline': 'Stocks Rise as CPI ‘Checks The Box’ for Fed Cuts: Markets Wrap', 'time': 'August 14, 2024 at 6:30 AM GMT+8'}
2024-08-14 {'headline': 'Our apologies\nWe’re unable to find the\npage you’re looking for.', 'time': ''}
2024-08-15 {'headline': 'Stocks See Best Week in ’24 as Buyers Race Back In: Markets Wrap', 'time': 'August 16, 2024 at 6:2