<a href="https://colab.research.google.com/github/hxiufan/Capstone713/blob/main/Multi_page_Webscrapping_CorrectionsNZ_23Dec2024_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

In [None]:
from google.colab import drive

In [None]:
# Updated Code to Keep Wayback Machine Timestamp and Format It

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from datetime import datetime

# Function to query Wayback Machine API and get all snapshots within a date range
def get_wayback_snapshots(url, start_date, end_date):
    """
    Query the Wayback Machine CDX API for all snapshots of a URL within a date range.

    Parameters:
        url (str): The URL to query.
        start_date (str): Start date in YYYYMMDD format.
        end_date (str): End date in YYYYMMDD format.

    Returns:
        list: A list of dictionaries, each holding a "snapshot_url" and a
            "timestamp" key. The list is empty when the request fails, when
            the response body is not valid JSON, or when no snapshot matches.
    """
    cdx_endpoint = "https://web.archive.org/cdx/search/cdx"
    # Passing the query as `params` lets requests percent-encode the target
    # URL, so characters such as '&', '?' or '#' inside it cannot corrupt the
    # CDX query string.
    query = {
        "url": url,
        "output": "json",
        "collapse": "digest",  # collapse rows on the digest field
        "from": start_date,
        "to": end_date,
    }
    try:
        response = requests.get(cdx_endpoint, params=query, timeout=30)
        response.raise_for_status()
        rows = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Wayback Machine snapshots: {e}")
        return []
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) when the body is empty or not valid JSON.
        print(f"Error decoding Wayback Machine response: {e}")
        return []

    # The first row is the header; in every following row index 1 is the
    # capture timestamp and index 2 is the URL that was archived.
    return [
        {"snapshot_url": f"https://web.archive.org/web/{row[1]}/{row[2]}", "timestamp": row[1]}
        for row in rows[1:]
    ]

# Function to get SEO features and additional metadata from a webpage
def get_seo_features(url, session):
    """
    Extract SEO features and additional metadata from a webpage.

    Parameters:
        url (str): The URL of the webpage.
        session (requests.Session): The session for making requests.

    Returns:
        dict: A dictionary of SEO data with the keys 'title',
            'meta_description', 'h1_tags', 'word_count', 'keywords',
            'description' and 'DC.Description'. When the page cannot be
            fetched or parsed, the same keys are returned with 'Error'
            placeholders (and a word count of 0).
    """
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        title_tag = soup.find('title')

        # Extract metadata. The standard description and the Dublin Core
        # description are separate tags and are looked up separately.
        keywords = _meta_content(soup, 'dc.keywords')
        description = _meta_content(soup, 'description')
        dc_description = _meta_content(soup, 'dc.description')

        description_text = description if description is not None else 'No description found'

        return {
            'title': title_tag.text.strip() if title_tag else 'No title found',
            'meta_description': description_text,
            'h1_tags': [h1.text.strip() for h1 in soup.find_all('h1')],
            'word_count': len(soup.get_text(strip=True).split()),
            'keywords': keywords if keywords is not None else 'No keywords found',
            'description': description_text,
            'DC.Description': dc_description if dc_description is not None else 'No DC.Description found'
        }
    except requests.exceptions.RequestException as e:
        print(f"Network error while scraping {url}: {e}")
    except Exception as e:
        print(f"Parsing error for {url}: {e}")
    # Reached only after one of the handlers above: keep the row shape stable
    # so the resulting CSV still has every column.
    return {
        'title': 'Error',
        'meta_description': 'Error',
        'h1_tags': 'Error',
        'word_count': 0,
        'keywords': 'Error',
        'description': 'Error',
        'DC.Description': 'Error'
    }


def _meta_content(soup, name):
    """Return the stripped content of the first <meta> tag named `name`, or None.

    The name is compared case-insensitively, so 'dc.description' also matches
    a tag written as 'DC.Description'. None is returned when no such tag
    exists or when the tag has no `content` attribute.
    """
    wanted = name.lower()
    tag = soup.find(
        'meta',
        attrs={'name': lambda value: value is not None and value.lower() == wanted},
    )
    if tag is None:
        return None
    content = tag.get('content')
    return content.strip() if content is not None else None

# Function to save data to CSV
def save_to_csv(data, filename):
    """
    Write a list of row dictionaries to a CSV file and report where it went.

    Parameters:
        data (list): List of dictionaries, one per output row.
        filename (str): Path of the CSV file to write.
    """
    frame = pd.DataFrame(data)
    # index=False keeps the DataFrame's positional index out of the file.
    frame.to_csv(filename, index=False, encoding='utf-8')
    print(f"Data saved to {filename}")

# Main function to scrape all snapshots from Wayback Machine
def scrape_wayback_pages(original_url, start_date, end_date, output_filename="/content/drive/MyDrive/DA713/wayback_filtered_snapshots_corrections_data.csv"):
    """
    Scrape all snapshots from Wayback Machine for a given URL and save SEO data to a CSV.

    Rows collected so far are written out even if the loop is interrupted or
    fails part-way, so a long run does not lose its results.

    Parameters:
        original_url (str): The URL to scrape snapshots for.
        start_date (str): Start date in YYYYMMDD format.
        end_date (str): End date in YYYYMMDD format.
        output_filename (str): The name of the output CSV file.
    """
    snapshots = get_wayback_snapshots(original_url, start_date, end_date)
    print(f"Found {len(snapshots)} snapshots.")

    all_data = []
    try:
        # Reuse one session for efficiency; the context manager closes its
        # pooled connections when scraping ends.
        with requests.Session() as session:
            for snapshot in tqdm(snapshots, desc="Scraping snapshots"):
                snapshot_url = snapshot['snapshot_url']
                original_timestamp = snapshot['timestamp']

                # Convert Wayback Machine timestamp to human-readable format
                try:
                    formatted_timestamp = datetime.strptime(
                        original_timestamp, "%Y%m%d%H%M%S"
                    ).strftime("%Y-%m-%d %H:%M:%S")
                except ValueError:
                    # Keep the raw value rather than abort the whole run on
                    # one timestamp that is not in the expected format.
                    formatted_timestamp = original_timestamp

                seo_data = get_seo_features(snapshot_url, session)
                seo_data['snapshot_url'] = snapshot_url
                seo_data['Timestamp'] = formatted_timestamp  # Use formatted timestamp
                all_data.append(seo_data)
                time.sleep(1)  # Be polite to the server
    finally:
        # Runs on normal completion and on error/interrupt alike, so partial
        # results are always saved.
        save_to_csv(all_data, output_filename)

# Use "https://www.corrections.govt.nz/" as the target page
# Scrape configuration: the page to look up and the capture window (YYYYMMDD).
original_url = "https://www.corrections.govt.nz/"
start_date = "20150101"  # 1 January 2015
end_date = "20241130"  # 30 November 2024

# Run the scrape and write the results to the CSV path given below.
scrape_wayback_pages(
    original_url,
    start_date,
    end_date,
    output_filename="/content/drive/MyDrive/DA713/wayback_snapshots_with_timestamps_23Dec2024_corrections_v3.csv",
)


Found 926 snapshots.


Scraping snapshots:  76%|███████▌  | 706/926 [31:42<08:21,  2.28s/it]

Network error while scraping https://web.archive.org/web/20221006040815/https://corrections.govt.nz/: HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=30)


Scraping snapshots:  88%|████████▊ | 819/926 [39:03<05:41,  3.19s/it]

Network error while scraping https://web.archive.org/web/20240117071144/https://corrections.govt.nz/: HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=30)


Scraping snapshots:  89%|████████▊ | 820/926 [39:34<20:23, 11.54s/it]

Network error while scraping https://web.archive.org/web/20240117150616/http://www.corrections.govt.nz/: HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=30)


Scraping snapshots: 100%|██████████| 926/926 [44:54<00:00,  2.91s/it]

Data saved to /content/drive/MyDrive/DA713/wayback_snapshots_with_timestamps_23Dec2024_corrections_v3.csv



