#### Jupyter Notebook Foundations of Data Science 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Receiving Data with CDX API

In [None]:
# Name of the .json file to process. The scraping cell further down opens
# whatever path is stored in `file_name`, so fill it in before running it.
file_name = ""

#### Scraping words from the received URLs

- Open every URL and scrape the `<title>` element from the page's HTML

- Save the original URL as the key and the extracted title text as the value in a dictionary, if the title contains 'Chega' or 'CHEGA'
- Write the dictionary to a new .json file

In [None]:
import json
import requests
from bs4 import BeautifulSoup
import time

# Default input file; used only when `file_name` (set in the configuration
# cell above) has been left empty.
json_file_path = 'test_for_scraping_cdx_results.json'

# Prefer the user-supplied `file_name`; fall back to the default path so an
# unset name does not end in open('') raising FileNotFoundError.
input_path = file_name or json_file_path

# Load the records to scrape. The file is read explicitly as UTF-8 so the
# result does not depend on the platform's default text encoding.
# The scraping loop iterates over `data` and reads the 'url' key of each item.
with open(input_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Filled by the scraping loop: maps a URL to the text of its <title> element.
titles = {}

# Loop through the records in `data`: fetch each URL, read just enough of the
# page to see its <title> element, and store the title in `titles` when it
# contains "Chega" or "CHEGA".
for v in data:
    url = v['url']

    try:
        # stream=True defers the body download so it can be read in chunks.
        # A streamed response keeps its connection open until it is fully
        # consumed or closed, so the context manager is what releases it on
        # every exit path (early break below, or an exception).
        with requests.get(url, timeout=10, stream=True) as response:
            html_content = b""

            # Stop downloading as soon as the closing title tag has arrived.
            # The check is case-insensitive so pages written with </TITLE>
            # also stop early instead of being downloaded completely.
            for chunk in response.iter_content(chunk_size=512):
                html_content += chunk
                if b"</title>" in html_content.lower():
                    break

    except requests.exceptions.Timeout:
        print(f"Timeout occurred for URL: {url}")
        continue  # Nothing was downloaded; move on to the next URL

    except requests.exceptions.RequestException as e:
        print(f"Request failed for URL: {url} with error: {e}")
        continue  # Nothing usable was downloaded; move on to the next URL

    finally:
        # 1-second pause after every attempt, successful or not, so that
        # failing requests do not hit the server back-to-back either.
        time.sleep(1)

    # Parse only the partial content that was downloaded
    soup = BeautifulSoup(html_content, "html.parser")
    title_tag = soup.find("title")

    # Pages without a <title> element have nothing to record
    if title_tag is None:
        continue

    title_text = title_tag.get_text()

    # Case-sensitive match on the two spellings of interest
    if "Chega" in title_text or "CHEGA" in title_text:
        titles[url] = title_text

# Persist the collected URL -> title mapping as indented JSON; non-ASCII
# characters are written as-is rather than as \u escapes.
with open('titles.json', mode='w', encoding='utf-8') as json_file:
    json.dump(titles, json_file, ensure_ascii=False, indent=4)

# Debug output only (to be removed later): one "url title" line per match
for entry in titles.items():
    print(*entry)

**TODO** (maybe tomorrow, or for the second project submission): explore how to use NLP/LLM techniques such as NER or GPT to understand the content of the newspaper articles.