##### Testing data retrieval through the CDX API

In [18]:
import requests
from bs4 import BeautifulSoup

# CDX API base URL
cdx_api_url = "http://web.archive.org/cdx/search/cdx"

# Set the parameters for the CDX API request
params = {
    "url": "publico.pt",  # News domain
    "from": "20190101",   # Start date (2019)
    "to": "20190110",     # End date (2019)
    "output": "json",     # Return output in JSON format
    "filter": "statuscode:200",  # Only return pages that successfully loaded
    "collapse": "urlkey",  # Collapse identical URLs to avoid duplicates
    "matchType": "prefix",  # Match URLs that start with the domain
    "limit": 10  # Limit the number of results (you can adjust this)
}

# Seconds to wait on each HTTP request; without a timeout a stalled
# connection would hang the cell indefinitely.
request_timeout = 30

# Keywords to look for in headings (defined once instead of once per page)
keywords = ["Chega", "André Ventura", "Mário Machado"]

# Heading tags whose text is extracted from each archived page
heading_tag_names = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Open a file to write the results. The encoding is explicit because the
# headings contain accented characters, which the platform default encoding
# may not be able to represent.
with open("results.txt", "w", encoding="utf-8") as file:
    # Send a request to the CDX API; a network failure is logged to the
    # results file rather than raised.
    try:
        response = requests.get(cdx_api_url, params=params, timeout=request_timeout)
    except requests.RequestException as exc:
        response = None
        file.write(f"Error retrieving data from CDX API: {exc}\n")

    if response is not None and response.status_code == 200:
        # Parse the response JSON (a list of rows, one per capture).
        # An empty body is treated as "no captures" instead of a parse error.
        data = response.json() if response.text.strip() else []

        # With output=json the first row is a header listing the field names,
        # not a capture; skip it so it is not turned into a bogus archive URL.
        if data and data[0][:3] == ["urlkey", "timestamp", "original"]:
            rows = data[1:]
        else:
            rows = data

        # Loop over each capture row
        for entry in rows:
            # Get the timestamp and original URL of the archived page
            timestamp = entry[1]
            original_url = entry[2]

            # Construct the archived URL for Wayback Machine
            archived_url = f"https://web.archive.org/web/{timestamp}/{original_url}"
            file.write(f"Fetching archived URL: {archived_url}\n")

            # Request the archived page; one failing page must not abort the
            # remaining ones, so the error is logged and the loop continues.
            try:
                archived_page_response = requests.get(archived_url, timeout=request_timeout)
            except requests.RequestException as exc:
                file.write(f"Error fetching archived page: {exc}\n")
                continue

            # Check if the archived page request was successful
            if archived_page_response.status_code != 200:
                file.write(f"Error fetching archived page: {archived_page_response.status_code}\n")
                continue

            # Parse the page content using BeautifulSoup
            soup = BeautifulSoup(archived_page_response.content, 'html.parser')

            # Find all heading tags (h1 through h6)
            heading_tags = soup.find_all(heading_tag_names)

            # Log every heading and flag the ones mentioning a keyword
            for heading in heading_tags:
                heading_text = heading.get_text()
                file.write(f"Heading found: {heading_text}\n")

                for keyword in keywords:
                    if keyword in heading_text:
                        file.write(f"Keyword '{keyword}' mentioned in heading\n")
    elif response is not None:
        file.write(f"Error retrieving data from CDX API: {response.status_code}\n")

In [16]:
import json

from urllib.parse import urlsplit

# Path to the JSON file
json_file_path = 'cdx_results.json'

# Open and load the JSON file (explicit encoding so the result does not
# depend on the platform default)
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Collect the 'url' value of every entry that has one.
# NOTE(review): entries are presumably dicts with a string 'url' value —
# confirm against whatever produces cdx_results.json. Non-dict entries are
# skipped rather than crashing on entry['url'].
urls = [
    entry['url']
    for entry in data
    if isinstance(entry, dict) and 'url' in entry
]

# Open a file to write the results
with open("titles.txt", "w", encoding="utf-8") as file:
    # Loop through each URL
    for url in urls:
        # Keep only the path component: this drops the scheme, host, query
        # string and fragment in one step, so a '/' inside the query can no
        # longer be mistaken for a path separator.
        path = urlsplit(url).path

        # The last non-empty path segment usually contains the title;
        # stripping trailing slashes keeps 'a/b/' from yielding an empty one.
        title = path.rstrip('/').rsplit('/', 1)[-1]

        # URLs with no path segment (e.g. the bare domain) carry no title
        if not title:
            continue

        # Replace hyphens with spaces to improve readability
        formatted_title = title.replace('-', ' ')

        # Write the formatted title to the file
        file.write(f"{formatted_title}\n")

print("Titles extracted and written to 'titles.txt'.")

Titles extracted and written to 'titles.txt'.
