## Scraping Berlingske for Novo Nordisk Articles

In [43]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException


from bs4 import BeautifulSoup
import time
import re
import requests
import pandas as pd

In [44]:
# Open Berlingske in Chrome, dismiss the cookie dialog, open the menu and
# submit a site search for "Novo Nordisk". The resulting search page is what
# the following cells scroll through and parse.
url = 'https://www.berlingske.dk'
driver = webdriver.Chrome()  # Ensure you have the ChromeDriver set up correctly
driver.get(url)

# The cookie dialog, menu and search field are not guaranteed to be present
# the instant driver.get() returns, so wait explicitly (up to 10 s each)
# instead of looking them up immediately.
wait = WebDriverWait(driver, 10)

# Dismiss the cookie dialog. Note that the selector targets the *Decline*
# button; the variable name is kept as-is for compatibility with other cells.
allow_cookies_button = wait.until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "#CybotCookiebotDialogBodyButtonDecline"))
)
allow_cookies_button.click()

# Find and click the menu button
menu_data_id = "menuToggler"
menu_element = wait.until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, f"[data-id='{menu_data_id}']"))
)
menu_element.click()

# Locate the search input element by its ID once it has become visible
search_input_id = "header-search"
search_input = wait.until(
    EC.visibility_of_element_located((By.ID, search_input_id))
)

# Perform a search action in the search input
search_input.send_keys("Novo Nordisk")
search_input.submit()

In [45]:
# Repeatedly click the "Load More" button until it can no longer be found or
# clicked, so that all search results end up in the page source.
MAX_LOAD_MORE_CLICKS = 1000   # safety cap so the loop always terminates
LOAD_MORE_PAUSE_SECONDS = 0.2  # short pause between clicks
LOAD_MORE_XPATH = '/html/body/div[2]/div[4]/div[2]/div/div/div/div[2]/button'

for _ in range(MAX_LOAD_MORE_CLICKS):
    try:
        load_button = driver.find_element(By.XPATH, LOAD_MORE_XPATH)
        load_button.click()
        time.sleep(LOAD_MORE_PAUSE_SECONDS)
    except (NoSuchElementException, ElementNotInteractableException):
        print("Load More button not found or not clickable. Exiting loop.")
        break
else:
    # Only reached when the cap was exhausted without the button disappearing:
    # the result list may be incomplete, so say so instead of ending silently.
    print(f"Stopped after {MAX_LOAD_MORE_CLICKS} clicks; more results may remain unloaded.")

In [46]:
# Because we drive the browser with Selenium rather than sending the request
# ourselves, we cannot attach a header with contact information. Given the
# structure of the website, Selenium was the most efficient way to obtain the data.

# Get the page source of the fully expanded search result page
page_source = driver.page_source

# Use Beautiful Soup to parse the HTML
soup = BeautifulSoup(page_source, "html.parser")

# Find all article containers with the specified class string
article_containers = soup.find_all('div', class_='teaser teaser--border-bottom teaser--search d-flex flex-row w-100')

# Collect one record per teaser. A teaser missing its title link or date
# element is skipped (and counted) instead of raising AttributeError on
# `None.text` and aborting the whole parse.
records = []
skipped = 0
for container in article_containers:
    title_tag = container.find('a', class_='teaser__title-link')
    date_tag = container.find('div', class_='teaser__date')
    if title_tag is None or date_tag is None:
        skipped += 1
        continue
    records.append({'Title': title_tag.text.strip(), 'Date': date_tag.text.strip()})

if skipped:
    print(f"Skipped {skipped} teaser(s) without a title or date.")

# Build the DataFrame; naming the columns keeps them present even with no records
df = pd.DataFrame(records, columns=['Title', 'Date'])

df


Unnamed: 0,Title,Date
0,Han er topchef for 55.000 mennesker og introve...,11. august 2023
1,Business-overblik: Lauritz-manøvre skjulte dun...,11. august 2023
2,Novo vokser og vokser. Er nu mere værd end Por...,10. august 2023
3,Lars Fruergaard affejer kritik af mangel på di...,10. august 2023
4,Business-update: »En narkobande af Mærsk-ansat...,10. august 2023
...,...,...
9456,Kendt dansk kommunikationsbureau går i opløsning,1. oktober 2015
9457,"»Jeg kan godt knibe mig i armen og spørge, hvo...",1. oktober 2015
9458,Minister: Ny prisportal kan skærpe konkurrence...,1. oktober 2015
9459,Fra mælkemand til kødkonge: Her er Danish Crow...,1. oktober 2015


In [59]:
# To filter the articles, we build a list of keywords.
# Initially, this list must include the names of all Novo Nordisk medical
# products. These can be obtained as follows:

# URL to scrape from
url = "https://min.medicin.dk/Adresser/Medicinvirksomheder/126/"

# Send a GET request that identifies us and the purpose of the scrape.
# The timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(
    url,
    headers={'name': 'Lea Lund Jeppesen', 'e-mail':'snk956@alumni.ku.dk', 'purpose':'University of Copenhagen Social Data Science exam project'},
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page
response.raise_for_status()

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'lxml')

# Find all product elements
product_elements = soup.find_all('a', class_='glob-search_link')

# Extract the product names from the title attribute
product_names = [product['title'] for product in product_elements]

# Remove everything except ASCII letters and whitespace from each product name.
# The original cell iterated over `product_names_updated`, which is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
# NOTE(review): if `product_names_updated` came from a since-deleted cell
# (e.g. a de-duplication step), restore that step here; the hard-coded index
# below assumes the list it was originally run against.
product_names_cleaned = [re.sub(r'[^a-zA-Z\s]', '', name) for name in product_names]

# The element at index 28 holds two words; split it on whitespace
words = product_names_cleaned[28].split()

# Keep the first word at index 28
product_names_cleaned[28] = words[0]

# Insert the second word at index 30 (the end of the list in the recorded output)
product_names_cleaned.insert(30, words[1])

# Show the list
product_names_cleaned


['Activelle',
 'Actrapid',
 'Esperoct',
 'Estrofem',
 'Fiasp',
 'GlucaGen',
 'Insulatard',
 'Kliogest',
 'Levemir',
 'Mixtard ',
 'Norditropin',
 'NovoEight',
 'Novofem',
 'NovoMix',
 'NovoNorm',
 'NovoRapid',
 'Novoseven',
 'NovoThirtee',
 'Ozempic',
 'Refixia',
 'Rybelsus',
 'Ryzodeg',
 'Saxenda',
 'Sogroya',
 'Tresiba',
 'Trisekvens',
 'Vagifem',
 'Victoza',
 'Wegovy',
 'Xultophy',
 'Flextouch']

In [61]:
# Hand-picked search terms, chosen from our own research into Novo Nordisk's
# medical focus areas.
keywords = [
    "Novo", "Nordisk", "insulin", "fedme", "diabetes", "vægttab",
    "præparat", "overvægt", "slank", "medicin", "pille",
]

# Extend the list in place with every scraped product name
keywords += product_names_cleaned

print(keywords)

['Novo', 'Nordisk', 'insulin', 'fedme', 'diabetes', 'vægttab', 'præparat', 'overvægt', 'slank', 'medicin', 'pille', 'Activelle', 'Actrapid', 'Esperoct', 'Estrofem', 'Fiasp', 'GlucaGen', 'Insulatard', 'Kliogest', 'Levemir', 'Mixtard ', 'Norditropin', 'NovoEight', 'Novofem', 'NovoMix', 'NovoNorm', 'NovoRapid', 'Novoseven', 'NovoThirtee', 'Ozempic', 'Refixia', 'Rybelsus', 'Ryzodeg', 'Saxenda', 'Sogroya', 'Tresiba', 'Trisekvens', 'Vagifem', 'Victoza', 'Wegovy', 'Xultophy', 'Flextouch']


In [64]:
# Normalise the keywords for case-insensitive matching: strip surrounding
# whitespace (e.g. 'Mixtard ' would otherwise only match when followed by a
# space) and drop empty strings (an empty alternative in the pattern would
# match every title).
keywords_lower = [word.strip().lower() for word in keywords if word.strip()]

# Escape each keyword so it is matched literally rather than as regex syntax
keyword_pattern = '|'.join(re.escape(word) for word in keywords_lower)

# Keep the rows whose lowercased title contains at least one keyword.
# na=False treats a missing title as "no match" instead of producing NaN in the mask.
df_filtered = df[df['Title'].str.lower().str.contains(keyword_pattern, na=False)]

# Reset index
df_filtered = df_filtered.reset_index(drop=True)

# Display the filtered frame
df_filtered

Unnamed: 0,Title,Date
0,Novo vokser og vokser. Er nu mere værd end Por...,10. august 2023
1,Lars Fruergaard affejer kritik af mangel på di...,10. august 2023
2,Novo Nordisk kæmper med at følge den store eft...,10. august 2023
3,Salg af Novo-slankemiddel boomer: Mere end sek...,10. august 2023
4,Novo Nordisk leverer atter milliardoverskud og...,10. august 2023
...,...,...
1713,Novo Nordisk aktien ramt af nedtur,14. oktober 2015
1714,Novo-boss kåret som verdens bedste topchef,13. oktober 2015
1715,Novo-boss kåret som verdens bedste topchef,13. oktober 2015
1716,"Novozymes-topchef: »Ja, vi har fået en ridse i...",11. oktober 2015
