In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

from difflib import SequenceMatcher

import pandas as pd

import random
import time
import requests
import re

In [2]:
"""This function provides a random user agent string to use for a web scraping session, 
helping to reduce the chances of being blocked by a website server."""

def get_new_useragent():
    list_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"]
    USER_AGENT = random.choice(list_agents)
    return {"user-agent": USER_AGENT}

In [3]:
"""This is a Python function that performs web scraping on the website of the Dutch public broadcasting organization, 
NPO, to extract information about content in a specified category."""

def scrap_npo(category, keys):
    results = pd.DataFrame(columns = ["Name", "Description", "Category", "URL", "Small_Image", "Large_Image"])

    # Set the options for the Chrome webdriver
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-notifications")

    # Start the Chrome webdriver
    driver = webdriver.Chrome('/Users/abril/Documents/Utrecht_University/Block_3/Personalisation_for_Public_Media/Final_Project/back/scrapping/chromedriver', options=options)

    # Navigate to the webpage with the list of TV shows
    driver.get(keys["url"])

    # Handle cookie consent box (if it exists)
    try:
        wait = WebDriverWait(driver, 10)
        cookie_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#ccm_close")))
        cookie_button.click()
    except:
        pass

    # Wait for the "load more" button to be available
    wait = WebDriverWait(driver, 10)
    load_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".load-more-button")))

    # Click the "load more" button until it disappears
    while load_more_button.is_displayed():
        try:
            load_more_button.click()
            load_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".load-more-button")))
        except:
            break

    # Parse the HTML content of the webpage using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Find the parent div that contains the list of all TV shows
    parent_div = soup.find("div", {"class": keys["classes"]["parent_div"]})

    # Find all the child divs that contain the TV shows
    tvshow_divs = parent_div.find_all("div", {"class": keys["classes"]["tile-container"]})

    # Loop through each TV show in the list and extract its name, image, and URL
    for tvshow in tvshow_divs:
        url = tvshow.find("a")["href"]
        name = tvshow.find("div", {"class": keys["classes"]["tile-title"]}).text.strip()
        small_image = tvshow.find("div", {"class": keys["classes"]["tile-image"]}).find("img")["src"]

        # Send a GET request to the TV show's URL
        time.sleep(2)
        headers = get_new_useragent() # Adding a random user agent
        response = requests.get(url, headers=headers)

        # Parse the HTML content of the TV show's page using BeautifulSoup
        soup2 = BeautifulSoup(response.content, "html.parser")

        """ The description and image of a show are included in the meta tags of a web page because 
        they provide useful information to search engines and social media platform. Including this information
        help to improve the visibility of the webpage and engagement on both search engines and social media platforms."""
        
        # Find the description of the item in the meta tag that contains the description
        try: description = soup2.find("meta", property="description")["content"]
        except: description = ""

        # Find the image of the item in the meta tag
        try: background_image_url = soup2.find("meta", property="og:image")["content"]
        except: background_image_url = small_image

        new_result = pd.DataFrame({"Name" : name, "Description" : description, "Category": category, "URL" : url, "Small_Image": small_image, "Large_Image": background_image_url}, index=[0])
        results = pd.concat([new_result, results.loc[:]]).reset_index(drop=True)

    # Close the browsers
    driver.quit()
    return results

In [3]:
# These are the parameters that we need to webscrap different pages in the NPOStart Platform
# The classes change in the programmas section, that's why we need to differentiate them
programmas_classes = {
    "parent_div": "npo-grid-teaser",
    "tile-container": "npo-ankeiler-tile-container",
    "tile-image": "npo-ankeiler-tile-image",
    "tile-title": "npo-ankeiler-tile-image"
}

other_classes = {
    "parent_div": "npo-grid-asset",
    "tile-container": "npo-asset-tile-container",
    "tile-image": "npo-asset-tile-image",
    "tile-title": "npo-asset-tile-title"
}

# For each category the section of the webpage that we want to scrap changes, that's why we are including different urls in this dictionary
categories = {
    "Programmas": {"url": "https://www.npostart.nl/programmas", "classes": programmas_classes},
    "Series": {"url": "https://www.npostart.nl/collectie/POMS_S_NPO_3712010", "classes": other_classes},
    "Documentaires": {"url": "https://www.npostart.nl/collectie/POMS_S_NPO_8166639", "classes": other_classes},
    "Films": {"url": "https://www.npostart.nl/collectie/POMS_S_NPO_8282004", "classes": other_classes}    
}

In [5]:
for key, value in categories.items():
    result = scrap_npo(key, value)
    result.to_csv("{}.csv".format(key))
    time.sleep(10)

  driver = webdriver.Chrome('/Users/abril/Documents/Utrecht_University/Block_3/Personalisation_for_Public_Media/Final_Project/back/scrapping/chromedriver', options=options)


In [4]:
"""This function computes the similarity ratio between two strings a and b using the SequenceMatcher class 
from the difflib module in Python. The SequenceMatcher class takes two sequences and returns a measure of their similarity 
as a ratio between 0 and 1, with 0 indicating no similarity between the two strings, and 1 indicating that the two strings 
are identical."""

def similar(a, b):
    return SequenceMatcher(None, str(a), str(b)).ratio()

In [5]:
""" This code is a Python function named get_more_info that scrapes information about TV shows from the Movie Meter website. 
The function takes a row as an argument, which is a Pandas dataframe row that contains information about a TV show such as 
its name, description, small image and large image. We will use this function to complete the information that could not be
found in the NPO website."""

def get_more_info(row, driver):
    # Set the options for the Chrome webdriver
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-notifications")

    # Navigate to Movie Meter search page
    driver.get("https://www.moviemeter.nl/site/zoeken/{}".format(row["Name"]))

    # Wait for the results to load
    driver.implicitly_wait(1)

    # Find the first search result and click on it
    try:
        first_result = driver.find_element(By.CSS_SELECTOR, "div.sResults a")
        first_result.click()
    except:
        return {"tags": [], "description": row["Description"], "small_image": row["Small_Image"], "large_image": row["Large_Image"]}
    
    # If we continue it's because we got results in the search website
    # Send a GET request to the TV show's URL
    time.sleep(1)
    headers = get_new_useragent() # adding a random user agent
    response = requests.get(driver.current_url, headers=headers)

    # Parse the HTML content of the TV show's page using BeautifulSoup
    soup2 = BeautifulSoup(response.content, "html.parser")

    # Get title of the webpage and check how similar it is from the title that we are trying to search
    try:
        title = soup2.find("div", {"class": "title"}).find("h1").text.strip()
    except:
        title = ""
    similarity = similar(title, row["Name"])

    # If the found title and the show we are searching are very similar, we perform the webscrapping
    if similarity >= 0.65:
        # Find the tags in the HTML using a CSS selector
        try: tags = soup2.find("div", {"class": "genre_holder"}).find_all("a")
        except: tags = []

        result = []
        for tag in tags:
            result.append(tag.text.strip())

        if pd.isna(row["Description"]):
            description_div = soup2.find("div", {"class": "blog-bar"}).find("p")
            if description_div:
                description = description_div.text.strip()
            else:
                description = row["Description"]
        else: description = row["Description"]

        # I we didn't find a small image in the NPO website, we will try to find one in the movie meter platform
        if not row["Small_Image"].startswith("https://"):
            small_image = soup2.find("div", {"class": "figure"}).find("img")["src"]
        else: small_image = row["Small_Image"]
    
        if not row["Large_Image"].startswith("https://"):
            # Find the div containing the background image using a CSS selector
            div = driver.find_element(By.CSS_SELECTOR, "div.film-header-bar")
    
            # Get the background image URL using JavaScript
            background_image = driver.execute_script("return getComputedStyle(arguments[0]).backgroundImage;", div)
            # Define the regular expression pattern
            pattern = r"url\(\"(.*)\"\)"

            # Find the URL using the regular expression
            match = re.search(pattern, background_image)

            # Extract the URL from the match object
            background_image_url = match.group(1)
        else:
            background_image_url = small_image
    else:
        return {"tags": [], "description": row["Description"], "small_image": row["Small_Image"], "large_image": row["Large_Image"]}

    return {"tags": result, "description": description, "small_image": small_image, "large_image": background_image_url}

In [7]:
# Set the options for the Chrome webdriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

for key in categories:
    df = pd.read_csv("{}.csv".format(key))
    df = df.drop_duplicates(subset=["Name"], keep="last")
    df = df.drop("Unnamed: 0", axis=1)
    df["Tags"] = ""

    # Start the Chrome webdriver
    driver = webdriver.Chrome('/Users/abril/Documents/Utrecht_University/Block_3/Personalisation_for_Public_Media/Final_Project/back/scrapping/chromedriver', options=options)
    
    for index, row in df.iterrows():
        results = get_more_info(row, driver)
        df.at[index, "Tags"] = results["tags"]
        df.at[index, "Description"] = results["description"]
        df.at[index, "Small_Image"] = results["small_image"]
        df.at[index, "Large_Image"] = results["large_image"]
    
    # Close the browser
    driver.quit()
    
    df.to_csv("{}_complete.csv".format(key))

  driver = webdriver.Chrome('/Users/abril/Documents/Utrecht_University/Block_3/Personalisation_for_Public_Media/Final_Project/back/scrapping/chromedriver', options=options)
