## Scraping Børsen for articles about Novo Nordisk

In [26]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

import time
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd

In [28]:
# Open Børsen in Chrome and submit a site search for "Novo Nordisk".
#
# Explicit waits are used instead of fixed sleeps: each step continues as soon
# as the element it needs is ready, and raises a TimeoutException after
# WAIT_TIMEOUT seconds if the element never shows up.
WAIT_TIMEOUT = 10  # seconds to wait for any single element

# Set up Selenium with Chrome driver
driver = webdriver.Chrome()
driver.maximize_window()

# Specify the URL
url = "https://borsen.dk/"
driver.get(url)

wait = WebDriverWait(driver, WAIT_TIMEOUT)

# Dismiss the cookie pop-up, then wait until its button is no longer visible
# so the next click is not sent while the pop-up is still on screen.
cookie = wait.until(EC.element_to_be_clickable((By.ID, 'acceptButton')))
cookie.click()
wait.until(EC.invisibility_of_element_located((By.ID, 'acceptButton')))

# Click on the search button
search_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'icon-search')))
search_button.click()

# Wait for the search input field, type in "Novo Nordisk" and submit
search_input = wait.until(EC.element_to_be_clickable((By.XPATH, '//input[@type="text"]')))
search_input.send_keys('Novo Nordisk')
search_input.send_keys(Keys.RETURN)


In [None]:
# Collect the text of every headline in the search results, following the
# "Next" link until the current page no longer contains one.
all_headlines = []

has_next_page = True
while has_next_page:
    # Title elements on the page that is currently displayed
    headlines = driver.find_elements(By.CSS_SELECTOR, 'div.title.mb-3')
    all_headlines.extend(element.text for element in headlines)

    try:
        next_button = driver.find_element(By.XPATH, '//div[@class="col-3 next"]/a')
    except NoSuchElementException:
        # No "Next" link on this page: stop paginating
        has_next_page = False
    else:
        next_button.click()
        # Fixed pause so the next page can load before it is read
        time.sleep(2)

# all_headlines now holds the headlines gathered from every visited page
print(all_headlines)

In [48]:
# Scrape the title and date of each search hit, one result page at a time,
# and collect them in a DataFrame.
#
# The loop starts from whatever page the browser is currently showing and
# clicks "Next" after every page it scrapes, including the last one, so
# running this cell again continues with the following batch of pages.

# Lists to hold extracted data
titles = []
dates = []

# Number of result pages to scrape in one run of this cell.
# (Equivalent to the earlier counter that ran from -50 up to and including 1.)
page_limit = 52
current_page = 1

while current_page <= page_limit:
    # Parse the HTML of the page currently loaded in the browser
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    # Title and date elements are looked up independently and paired by position
    title_divs = soup.find_all('div', class_='title mb-3')
    date_divs = soup.find_all('div', class_='gta-condensed date')

    # zip() stops at the shorter list without complaint; make a mismatch visible
    # because it means titles and dates on this page may be paired incorrectly.
    if len(title_divs) != len(date_divs):
        print(
            f"Warning: page {current_page} has {len(title_divs)} titles "
            f"but {len(date_divs)} dates; rows may be misaligned"
        )

    for title_div, date_div in zip(title_divs, date_divs):
        titles.append(title_div.text.strip())
        dates.append(date_div.text.strip())

    # Try to find the "Next" button/link and click it to go to the next page
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, ".col-3.next a")
        next_button.click()
        current_page += 1
        time.sleep(2)  # Wait for the next page to load
    except NoSuchElementException:
        break  # Exit the loop if there's no "Next" button

# Convert the lists to a pandas DataFrame
df = pd.DataFrame({
    'Title': titles,
    'Date': dates
})

df



Unnamed: 0,Title,Date
0,Lars Rebien afløser Sten Scheibye hos Novo-ejer,18. MAJ. 2018
1,KORT NYT - NAVNE,18. MAJ. 2018
2,Lars Rebien om ny formandsrolle i Novo Fonden:...,17. MAJ. 2018
3,Scheibye stopper brat i Novo-fonden: Tidligere...,17. MAJ. 2018
4,Det skriver medierne: Novo Nordisk får topadvo...,17. MAJ. 2018
...,...,...
2595,Novo Nordisk risikerer prissmæk i USA i 2017,1. AUG. 2016
2596,En positiv uge for C20 Cap giver et solidt for...,1. AUG. 2016
2597,KORT NYT - NAVNE,1. AUG. 2016
2598,Ugen der kommer: Prismiljøet er i fokus ved No...,30. JUL. 2016


Unnamed: 0,Title,Date
0,Markedsåbning: Novo Nordisk åbner i plus trods...,7. SEP. 2020
1,Novo er stævnet i ny whistleblowersag i USA,7. SEP. 2020
2,Danske aktier får store kurshug: C25 lukker me...,4. SEP. 2020
3,Novo vil se på positive effekter af GLP-1-beha...,4. SEP. 2020
4,Røde tal dominerer C25 efter store kursfald i USA,4. SEP. 2020
...,...,...
2495,"""Vi ser masser af købsmuligheder""",22. MAJ. 2018
2496,Sådan smager aktiehajens egen medicin: Her er ...,22. MAJ. 2018
2497,KORT NYT - MARKEDER UDLAND,22. MAJ. 2018
2498,Dette er Nordens pengemaskine - et dansk selsk...,20. MAJ. 2018
