# Webscraping from Bloomberg.

This notebook explores how to scrape the daily market-news headlines from Bloomberg.

## First we try requests + beautifulsoup

In [55]:
from datetime import datetime as dt

In [4]:
import requests
from bs4 import BeautifulSoup

In [7]:
# Sample article used to check whether a plain HTTP fetch is enough.
url = (
    "https://www.bloomberg.com/news/articles/2024-08-06/"
    "stock-market-today-dow-s-p-live-updates"
)

In [8]:
# Always pass a timeout: requests applies none by default, so a server that
# never answers would otherwise hang this cell indefinitely.
response = requests.get(url, timeout=30)
content = response.content  # raw response body (bytes)

In [11]:
# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed (and emits GuessedAtParserWarning), so the parsed
# tree could differ from one environment to another.
soup = BeautifulSoup(content, "html.parser")
soup

<!DOCTYPE html>
<html lang="en">
<head>
<title>Bloomberg - Are you a robot?</title>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="noindex" name="robots"/>
<style rel="stylesheet">
        @font-face {
            font-family: BWHaasGroteskWeb;
            font-display: swap;
            font-weight: 400;
            font-style: normal;
            src: url('https://assets.bwbx.io/s3/fontservice/fonts/BWHaasGrotesk-55Roman-Web-7998fbcba8.eot');
            src: local('?'), url('https://assets.bwbx.io/s3/fontservice/fonts/BWHaasGrotesk-55Roman-Web-5cf5733cd9.woff2') format('woff2'), url('https://assets.bwbx.io/s3/fontservice/fonts/BWHaasGrotesk-55Roman-Web-a1ea4fd367.woff') format('woff')
        }

        @font-face {
            font-family: BWHaasGroteskWeb;
            font-display: swap;
            font-weight: 700;
            font-style: normal;
            src: url('https://assets.bwbx.io/s3/fontservice/fonts/BWHaasGrotesk-75Bold-Web-de8c7

#### Looking at the HTML (the page title is "Bloomberg - Are you a robot?"), it seems Bloomberg blocks scraping with plain `requests`.

## Next we try Selenium.

In [20]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

In [36]:
# Same sample article, this time with the embedded-checkout query parameter.
url = (
    "https://www.bloomberg.com/news/articles/2024-08-06/"
    "stock-market-today-dow-s-p-live-updates"
    "?embedded-checkout=true"
)

In [64]:
driver = webdriver.Chrome()
# WebDriver.get() only navigates and returns None; the rendered markup is
# available afterwards through the page_source property.
driver.get(url)
raw_html = driver.page_source

In [26]:
h1_elem = driver.find_elements(By.TAG_NAME, "h1")
# find_elements returns an empty list when nothing matches, so guard the
# index instead of raising IndexError (same fallback as get_headline).
news_headline = h1_elem[0].text if h1_elem else ""

In [66]:
# Collect every <time> element on the page and show the text of each one.
times = driver.find_elements(By.TAG_NAME, "time")
[time_elem.text for time_elem in times]

In [28]:
# quit() ends the whole WebDriver session (browser and chromedriver process);
# close() would only close the current window and can leave the driver running.
driver.quit()

### Now try looping over URLs.

In [68]:
def get_headline(date: str) -> dict:
    """Get news headline and date from Bloomberg for a particular date.

    A fresh Chrome session is opened for every call and is always shut down
    before returning, even if loading or querying the page fails.

    Args:
        date (str): date string to put in the URL (the notebook passes
            'YYYY-MM-DD' strings).

    Returns:
        dict: ``{'headline': str, 'time': str}`` taken from the first ``<h1>``
        and first ``<time>`` element on the page. Either value is ``""`` when
        the element is missing. No check is made that the page is a real
        article, so for dates without one the headline is whatever the
        served page puts in its ``<h1>`` (e.g. a "not found" message).
    """
    url = f"https://www.bloomberg.com/news/articles/{date}/stock-market-today-dow-s-p-live-updates?embedded-checkout=true"

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # find_elements returns an empty list (rather than raising) on no match.
        h1_elems = driver.find_elements(By.TAG_NAME, "h1")
        news_headline = h1_elems[0].text if h1_elems else ""

        time_elems = driver.find_elements(By.TAG_NAME, "time")
        article_time = time_elems[0].text if time_elems else ""
    finally:
        # Always end the session so no browser/chromedriver process is left
        # behind, including when an exception is raised above. Lingering
        # sessions might also make Bloomberg suspicious.
        driver.quit()

    return {'headline': news_headline, 'time': article_time}

In [71]:
# Single-date check before looping over many dates.
sample_date = "2024-08-05"

print(get_headline(date=sample_date))

{'headline': 'S&P 500 Climbs 1% After Wall Street’s Wild Rout: Markets Wrap', 'time': 'August 6, 2024 at 7:25 AM GMT+8'}


## Get SPY ticker so that we can get the trading dates

In [40]:
import yfinance as yf

In [45]:
# Pull the full available price history for SPY; only its date index is
# needed further down.
spy_ticker = yf.Ticker("SPY")
SPY_prices = spy_ticker.history(period="max")

In [59]:
# Named DATE_FORMAT rather than `format`, which would shadow the built-in
# format() for every later cell in the kernel.
DATE_FORMAT = "%Y-%m-%d"
# Call strftime on each index entry directly, so this cell no longer depends
# on the separate `datetime as dt` import cell having been run first.
input_dates = [timestamp.strftime(DATE_FORMAT) for timestamp in SPY_prices.index]

In [60]:
# Number of date strings built from the SPY price index.
len(input_dates)

7950

### Test out by iterating through the trading dates

In [70]:
# Try the scraper on the 15 most recent dates; each call opens its own browser.
for trading_date in input_dates[-15:]:
    print(trading_date, get_headline(trading_date))

2024-08-06 {'headline': 'Stocks Swoon After Weak $42 Billion Treasury Sale: Markets Wrap', 'time': 'August 7, 2024 at 6:38 AM GMT+8'}
2024-08-07 {'headline': 'S&P 500 Notches Biggest Rally Since November 2022: Markets Wrap', 'time': 'August 8, 2024 at 6:19 AM GMT+8'}
2024-08-08 {'headline': 'Stocks Make Comeback at End of Dizzying Week: Markets Wrap', 'time': 'August 9, 2024 at 6:20 AM GMT+8'}
2024-08-09 {'headline': 'Our apologies\nWe’re unable to find the\npage you’re looking for.', 'time': ''}
2024-08-12 {'headline': 'Stocks Rally in Countdown to CPI as US Yields Sink: Markets Wrap', 'time': 'August 13, 2024 at 6:48 AM GMT+8'}
2024-08-13 {'headline': 'Stocks Rise as CPI ‘Checks The Box’ for Fed Cuts: Markets Wrap', 'time': 'August 14, 2024 at 6:30 AM GMT+8'}
2024-08-14 {'headline': 'Our apologies\nWe’re unable to find the\npage you’re looking for.', 'time': ''}
2024-08-15 {'headline': 'Stocks See Best Week in ’24 as Buyers Race Back In: Markets Wrap', 'time': 'August 16, 2024 at 6:2