## Scraping Børsen for articles about Novo Nordisk

In [72]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

import time
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import re

In [62]:
# Set up Selenium with Chrome driver
driver = webdriver.Chrome()
driver.maximize_window()

# Explicit wait (up to 10 s per condition). Polling for the element we need
# replaces the fixed sleeps, which were too short on a slow page load
# (NoSuchElementException) and wasted time on a fast one.
wait = WebDriverWait(driver, 10)

# Specify the URL
url = "https://borsen.dk/"
driver.get(url)

# Accept cookies
cookie = wait.until(EC.element_to_be_clickable((By.ID, 'acceptButton')))
cookie.click()
# Let the consent banner go away before interacting with the page behind it
wait.until(EC.invisibility_of_element_located((By.ID, 'acceptButton')))

# Click on the search button
search_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'icon-search')))
search_button.click()

# Search for "Novo Nordisk" once the search field has become visible
search_input = wait.until(
    EC.visibility_of_element_located((By.XPATH, '//input[@type="text"]'))
)
search_input.send_keys('Novo Nordisk')
search_input.send_keys(Keys.RETURN)



In [63]:
# Accumulators for the headline text and the date label of every search hit
titles = []
dates = []

# Walk through the result pages until no "Next" link can be found
has_next_page = True
while has_next_page:
    # Parse whatever the browser is currently showing
    soup = BeautifulSoup(driver.page_source, "html.parser")
    title_divs = soup.find_all('div', class_='title mb-3')
    date_divs = soup.find_all('div', class_='gta-condensed date')

    # Titles and dates are paired purely by position; zip stops at the
    # shorter of the two lists, so both accumulators always grow together.
    paired = list(zip(title_divs, date_divs))
    titles.extend(title.text.strip() for title, _ in paired)
    dates.extend(date.text.strip() for _, date in paired)

    # Move on to the next result page, giving it two seconds to load
    try:
        driver.find_element(By.CSS_SELECTOR, ".col-3.next a").click()
        time.sleep(2)
    except NoSuchElementException:
        # No "Next" link on this page: this was the last one
        has_next_page = False

# One row per scraped article
df = pd.DataFrame({'Title': titles, 'Date': dates})

df

Unnamed: 0,Title,Date
0,Investorer høster aktieprofitter – her er akti...,9.15
1,Før markedet åbner: Trykket stemning efter hæs...,8.36
2,Novos udviklingschef efter vild uge: Flere suc...,10. AUG.
3,"Fruergaard: “Vi investerer, hvor vi kan flytte...",10. AUG.
4,Inflationstal udløste C25-optur: Kursprygl til...,10. AUG.
...,...,...
26825,Centerkontrakter,28. JUN. 1996
26826,Fremgang for Micro Matic,27. JUN. 1996
26827,Øser af sin miljø-erfaring,26. JUN. 1996
26828,Danske virksomheder tvinges til nedskæringer,24. JUN. 1996


In [79]:
# To filter the articles, we make a list of keywords.
# Initially, this list must include the names of all Novo Nordisk medical
# products. These can be obtained as follows:

# URL to scrape from
url = "https://min.medicin.dk/Adresser/Medicinvirksomheder/126/"

# Send a GET request. The timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status stops us from parsing an error page.
response = requests.get(
    url,
    headers={'name': 'Rosemarie Blicher', 'e-mail':'mzc44@alumni.ku.dk', 'purpose':'University of Copenhagen Social Data Science exam project'},
    timeout=30,
)
response.raise_for_status()

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'lxml')

# Find all product elements
product_elements = soup.find_all('a', class_='glob-search_link')

# Extract the product names from the title attribute
product_names = [product['title'] for product in product_elements]

# Clean each name with a regular expression (drop everything that is not an
# ASCII letter or whitespace) and split it into words. Splitting also removes
# the trailing whitespace left behind when e.g. a strength number is stripped,
# which would otherwise end up inside a keyword.
# The first word stays at the product's position; any further words (such as a
# device name following the product name) are collected and appended at the
# end. This replaces a split that was hard-coded to list position 28 and
# therefore broke as soon as the listing on the website changed.
product_names_cleaned = []
extra_words = []
for name in product_names:
    words = re.sub(r'[^a-zA-Z\s]', '', name).split()
    if not words:
        # Nothing but digits/symbols in this title: no usable keyword
        continue
    product_names_cleaned.append(words[0])
    for word in words[1:]:
        if word not in extra_words:
            extra_words.append(word)
product_names_cleaned.extend(extra_words)

# Show the list
product_names_cleaned

['Activelle',
 'Actrapid',
 'Esperoct',
 'Estrofem',
 'Fiasp',
 'GlucaGen',
 'Insulatard',
 'Kliogest',
 'Levemir',
 'Mixtard ',
 'Norditropin',
 'NovoEight',
 'Novofem',
 'NovoMix',
 'NovoNorm',
 'NovoRapid',
 'Novoseven',
 'NovoThirteen',
 'Ozempic',
 'Refixia',
 'Rybelsus',
 'Ryzodeg',
 'Saxenda',
 'Sogroya',
 'Tresiba',
 'Trisekvens',
 'Vagifem',
 'Victoza',
 'Wegovy',
 'Xultophy',
 'Flextouch']

In [80]:
# Hand-picked search terms, determined manually by research on Novo Nordisk
# medical expertise
manual_keywords = ["Novo", "Nordisk", "insulin", "fedme", "diabetes", "vægttab", "præparat", "overvægt", "slank", "medicin", "pille"]

# Full keyword list: the manual terms first, followed by every scraped
# product name in its original order
keywords = [*manual_keywords, *product_names_cleaned]

print(keywords)

['Novo', 'Nordisk', 'insulin', 'fedme', 'diabetes', 'vægttab', 'præparat', 'overvægt', 'slank', 'medicin', 'pille', 'Activelle', 'Actrapid', 'Esperoct', 'Estrofem', 'Fiasp', 'GlucaGen', 'Insulatard', 'Kliogest', 'Levemir', 'Mixtard ', 'Norditropin', 'NovoEight', 'Novofem', 'NovoMix', 'NovoNorm', 'NovoRapid', 'Novoseven', 'NovoThirteen', 'Ozempic', 'Refixia', 'Rybelsus', 'Ryzodeg', 'Saxenda', 'Sogroya', 'Tresiba', 'Trisekvens', 'Vagifem', 'Victoza', 'Wegovy', 'Xultophy', 'Flextouch']


In [81]:
# Convert the keywords to lowercase for case-insensitive matching.
# Surrounding whitespace is stripped (and empty entries dropped) so that a
# keyword such as 'Mixtard ' still matches a title where the word is followed
# by punctuation or sits at the end of the title.
keywords_lower = [word.strip().lower() for word in keywords if word.strip()]

# Build one alternation pattern. Every keyword is escaped so that it is
# matched literally even if it contains regex metacharacters.
keyword_pattern = '|'.join(re.escape(word) for word in keywords_lower)

# Keep the rows whose lowercased title contains at least one keyword.
# na=False treats a missing title as "no match" instead of raising when the
# boolean mask is applied. The index is reset so rows are numbered from 0.
df_filtered = df[
    df['Title'].str.lower().str.contains(keyword_pattern, regex=True, na=False)
].reset_index(drop=True)

# Show the filtered articles
df_filtered

Unnamed: 0,Title,Date
0,Novos udviklingschef efter vild uge: Flere suc...,10. AUG.
1,Et nationalregnskab uden Novo Nordisk? Ikke nø...,10. AUG.
2,Nu splittes Novo Nordisk-aktien: Sådan påvirke...,10. AUG.
3,Analytiker: Novo-regnskab svagere end ventet,10. AUG.
4,Debat | LA: Skal næste generations Novo se dag...,10. AUG.
...,...,...
7954,Novo på vej mod rekordresultat,15. AUG. 1996
7955,Novo trak KFX frem til rekord,15. AUG. 1996
7956,Novo-resultat bliver som lovet,14. AUG. 1996
7957,Historien om insulin-krigen i Novo Nordisk,18. JUL. 1996
