<a href="https://colab.research.google.com/github/hxiufan/Capstone713/blob/main/Multi_page_Webscrapping_BusinessNZ_23Dec2024_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os

In [4]:
from google.colab import drive

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Updated Code to Keep Wayback Machine Timestamp and Format It

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from datetime import datetime

# Function to query Wayback Machine API and get all snapshots within a date range
def get_wayback_snapshots(url, start_date, end_date):
    """
    Query the Wayback Machine API for all snapshots of a URL within a date range.

    Parameters:
        url (str): The URL to query.
        start_date (str): Start date in YYYYMMDD format.
        end_date (str): End date in YYYYMMDD format.

    Returns:
        list: A list of dictionaries containing snapshot URLs and timestamps.
    """
    wayback_api_url = f"http://web.archive.org/cdx/search/cdx?url={url}&output=json&collapse=digest&from={start_date}&to={end_date}"
    try:
        response = requests.get(wayback_api_url, timeout=30)
        response.raise_for_status()
        snapshots = response.json()[1:]  # Skip the header row
        return [
            {"snapshot_url": f"https://web.archive.org/web/{snapshot[1]}/{snapshot[2]}", "timestamp": snapshot[1]}
            for snapshot in snapshots
        ]
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Wayback Machine snapshots: {e}")
        return []

# Function to get SEO features and additional metadata from a webpage
def get_seo_features(url, session):
    """
    Extract SEO features and additional metadata from a webpage.

    Parameters:
        url (str): The URL of the webpage.
        session (requests.Session): The session for making requests.

    Returns:
        dict: A dictionary of SEO data.
    """
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract metadata
        keywords = soup.find('meta', attrs={'name': 'keywords'})
        description = soup.find('meta', attrs={'name': 'description'})
        dc_description = soup.find('meta', attrs={'name': 'DC.Description'})

        return {
            'title': soup.find('title').text.strip() if soup.find('title') else 'No title found',
            'meta_description': description['content'].strip() if description else 'No description found',
            'h1_tags': [h1.text.strip() for h1 in soup.find_all('h1')],
            'word_count': len(soup.get_text(strip=True).split()),
            'keywords': keywords['content'].strip() if keywords else 'No keywords found',
            'description': description['content'].strip() if description else 'No description found',
            'DC.Description': dc_description['content'].strip() if dc_description else 'No DC.Description found'
        }
    except requests.exceptions.RequestException as e:
        print(f"Network error while scraping {url}: {e}")
    except Exception as e:
        print(f"Parsing error for {url}: {e}")
    return {
        'title': 'Error',
        'meta_description': 'Error',
        'h1_tags': 'Error',
        'word_count': 0,
        'keywords': 'Error',
        'description': 'Error',
        'DC.Description': 'Error'
    }

# Function to save data to CSV
def save_to_csv(data, filename):
    """
    Save data to a CSV file.

    Parameters:
        data (list): List of dictionaries containing data.
        filename (str): Output CSV filename.
    """
    pd.DataFrame(data).to_csv(filename, index=False, encoding='utf-8')
    print(f"Data saved to {filename}")

# Main function to scrape all snapshots from Wayback Machine
def scrape_wayback_pages(original_url, start_date, end_date, output_filename="/content/drive/MyDrive/DA713/wayback_filtered_snapshots_data.csv"):
    """
    Scrape all snapshots from Wayback Machine for a given URL and save SEO data to a CSV.

    Parameters:
        original_url (str): The URL to scrape snapshots for.
        start_date (str): Start date in YYYYMMDD format.
        end_date (str): End date in YYYYMMDD format.
        output_filename (str): The name of the output CSV file.
    """
    snapshots = get_wayback_snapshots(original_url, start_date, end_date)
    print(f"Found {len(snapshots)} snapshots.")

    all_data = []
    session = requests.Session()  # Reuse session for efficiency

    for snapshot in tqdm(snapshots, desc="Scraping snapshots"):
        snapshot_url = snapshot['snapshot_url']
        original_timestamp = snapshot['timestamp']

        # Convert Wayback Machine timestamp to human-readable format
        formatted_timestamp = datetime.strptime(original_timestamp, "%Y%m%d%H%M%S").strftime("%Y-%m-%d %H:%M:%S")

        seo_data = get_seo_features(snapshot_url, session)
        seo_data['snapshot_url'] = snapshot_url
        seo_data['Timestamp'] = formatted_timestamp  # Use formatted timestamp
        all_data.append(seo_data)
        time.sleep(1)  # Be polite to the server

    save_to_csv(all_data, output_filename)

# Example usage
original_url = "https://www.business.govt.nz/"
start_date = "20150101"  # Start date: 01 Jan 2015
end_date = "20241130"    # End date: 30 Nov 2024
scrape_wayback_pages(original_url, start_date, end_date, output_filename="/content/drive/MyDrive/DA713/wayback_snapshots_with_timestamps_23Dec2024_business.csv")


Found 2413 snapshots.


Scraping snapshots:   6%|▌         | 134/2413 [07:06<2:00:45,  3.18s/it]

Network error while scraping https://web.archive.org/web/20160824092744/http://www.business.govt.nz:80/?: 403 Client Error: Forbidden for url: https://web.archive.org/web/20160824092744/http://www.business.govt.nz:80/


Scraping snapshots:   6%|▌         | 135/2413 [07:09<1:55:49,  3.05s/it]

Network error while scraping https://web.archive.org/web/20160826000049/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20160824092744/http://www.business.govt.nz/


Scraping snapshots:  32%|███▏      | 764/2413 [38:40<1:12:48,  2.65s/it]

Network error while scraping https://web.archive.org/web/20211018040331/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20211018040331/https://www.business.govt.nz/


Scraping snapshots:  32%|███▏      | 765/2413 [38:43<1:09:46,  2.54s/it]

Network error while scraping https://web.archive.org/web/20211023020754/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20211018040331/https://www.business.govt.nz/


Scraping snapshots:  32%|███▏      | 768/2413 [38:51<1:12:15,  2.64s/it]

Network error while scraping https://web.archive.org/web/20211105111442/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20211105111442/https://www.business.govt.nz/


Scraping snapshots:  58%|█████▊    | 1410/2413 [1:11:00<43:28,  2.60s/it]

Network error while scraping https://web.archive.org/web/20220329093935/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220329093936/https://www.business.govt.nz/


Scraping snapshots:  58%|█████▊    | 1411/2413 [1:11:05<52:31,  3.15s/it]

Network error while scraping https://web.archive.org/web/20220329093936/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220329093936/https://www.business.govt.nz/


Scraping snapshots:  61%|██████    | 1473/2413 [1:14:19<51:45,  3.30s/it]

Network error while scraping https://web.archive.org/web/20220508154633/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220508154633/https://www.business.govt.nz/


Scraping snapshots:  61%|██████    | 1474/2413 [1:14:22<48:28,  3.10s/it]

Network error while scraping https://web.archive.org/web/20220509021258/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220508154633/https://www.business.govt.nz/


Scraping snapshots:  63%|██████▎   | 1512/2413 [1:16:37<48:25,  3.22s/it]

Network error while scraping https://web.archive.org/web/20220626031810/https://business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220626031810/https://business.govt.nz/


Scraping snapshots:  63%|██████▎   | 1513/2413 [1:16:39<45:22,  3.02s/it]

Network error while scraping https://web.archive.org/web/20220627022228/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220626031810/https://business.govt.nz/


Scraping snapshots:  63%|██████▎   | 1531/2413 [1:17:37<43:41,  2.97s/it]

Network error while scraping https://web.archive.org/web/20220709003737/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220709003737/http://www.business.govt.nz/


Scraping snapshots:  63%|██████▎   | 1532/2413 [1:17:39<38:33,  2.63s/it]

Network error while scraping https://web.archive.org/web/20220709020918/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220709003737/http://www.business.govt.nz/


Scraping snapshots:  64%|██████▎   | 1535/2413 [1:17:49<44:21,  3.03s/it]

Network error while scraping https://web.archive.org/web/20220710193756/https://business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220710193802/https://www.business.govt.nz/


Scraping snapshots:  64%|██████▎   | 1536/2413 [1:17:52<44:17,  3.03s/it]

Network error while scraping https://web.archive.org/web/20220710193802/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220710193802/https://www.business.govt.nz/


Scraping snapshots:  64%|██████▎   | 1537/2413 [1:17:54<40:24,  2.77s/it]

Network error while scraping https://web.archive.org/web/20220711022210/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220710193802/https://www.business.govt.nz/


Scraping snapshots:  65%|██████▍   | 1566/2413 [1:19:15<36:25,  2.58s/it]

Network error while scraping https://web.archive.org/web/20220823111423/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220823111423/https://www.business.govt.nz/


Scraping snapshots:  65%|██████▍   | 1567/2413 [1:19:17<36:22,  2.58s/it]

Network error while scraping https://web.archive.org/web/20220823141317/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20220823111423/https://www.business.govt.nz/


Scraping snapshots:  80%|████████  | 1938/2413 [1:38:40<20:17,  2.56s/it]

Network error while scraping https://web.archive.org/web/20230306220700/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230306220700/http://www.business.govt.nz/


Scraping snapshots:  80%|████████  | 1939/2413 [1:38:43<20:50,  2.64s/it]

Network error while scraping https://web.archive.org/web/20230306220700/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230306220700/https://www.business.govt.nz/


Scraping snapshots:  80%|████████  | 1940/2413 [1:38:45<20:10,  2.56s/it]

Network error while scraping https://web.archive.org/web/20230306230842/http://business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230306220700/https://www.business.govt.nz/


Scraping snapshots:  81%|████████  | 1943/2413 [1:38:56<22:59,  2.94s/it]

Network error while scraping https://web.archive.org/web/20230307181403/https://business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230307181405/https://www.business.govt.nz/


Scraping snapshots:  81%|████████  | 1944/2413 [1:38:59<24:05,  3.08s/it]

Network error while scraping https://web.archive.org/web/20230307181405/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230307181405/https://www.business.govt.nz/


Scraping snapshots:  82%|████████▏ | 1986/2413 [1:41:27<25:39,  3.61s/it]

Network error while scraping https://web.archive.org/web/20230316121743/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230316121743/https://www.business.govt.nz/


Scraping snapshots:  84%|████████▍ | 2027/2413 [1:43:32<19:00,  2.95s/it]

Network error while scraping https://web.archive.org/web/20230326160621/http://business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230326160636/http://www.business.govt.nz/


Scraping snapshots:  84%|████████▍ | 2028/2413 [1:43:35<19:23,  3.02s/it]

Network error while scraping https://web.archive.org/web/20230326160636/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230326160636/http://www.business.govt.nz/


Scraping snapshots:  86%|████████▌ | 2065/2413 [1:45:52<18:44,  3.23s/it]

Network error while scraping https://web.archive.org/web/20230403230337/http://business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230403230337/http://business.govt.nz/


Scraping snapshots:  86%|████████▌ | 2066/2413 [1:45:55<18:28,  3.19s/it]

Network error while scraping https://web.archive.org/web/20230403230337/http://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230403230337/http://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2093/2413 [1:47:21<18:22,  3.44s/it]

Network error while scraping https://web.archive.org/web/20230414112004/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230414112004/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2094/2413 [1:47:22<14:32,  2.73s/it]

Network error while scraping https://web.archive.org/web/20230415095735/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230415095735/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2095/2413 [1:47:23<11:51,  2.24s/it]

Network error while scraping https://web.archive.org/web/20230415215700/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230415215700/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2096/2413 [1:47:24<09:59,  1.89s/it]

Network error while scraping https://web.archive.org/web/20230416100152/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230416100152/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2097/2413 [1:47:25<08:40,  1.65s/it]

Network error while scraping https://web.archive.org/web/20230417090709/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230417090709/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2098/2413 [1:47:26<07:44,  1.48s/it]

Network error while scraping https://web.archive.org/web/20230417095901/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230417095901/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2099/2413 [1:47:27<07:05,  1.36s/it]

Network error while scraping https://web.archive.org/web/20230417215642/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230417215642/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2100/2413 [1:47:29<06:38,  1.27s/it]

Network error while scraping https://web.archive.org/web/20230418103813/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230418103813/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2101/2413 [1:47:30<06:18,  1.21s/it]

Network error while scraping https://web.archive.org/web/20230418220330/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230418220330/https://www.business.govt.nz/


Scraping snapshots:  87%|████████▋ | 2102/2413 [1:47:31<06:05,  1.18s/it]

Network error while scraping https://web.archive.org/web/20230419200702/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20230419200702/https://www.business.govt.nz/


Scraping snapshots:  89%|████████▉ | 2159/2413 [1:50:26<13:04,  3.09s/it]

Network error while scraping https://web.archive.org/web/20230514005811/https://www.business.govt.nz/: 403 Client Error: Forbidden for url: https://web.archive.org/web/20230514005811/https://www.business.govt.nz/


Scraping snapshots:  95%|█████████▍| 2284/2413 [1:56:54<06:26,  3.00s/it]

Network error while scraping https://web.archive.org/web/20240413065546/https://www.business.govt.nz/: 429 Client Error: Too Many Requests for url: https://web.archive.org/web/20240413065546/https://www.business.govt.nz/


Scraping snapshots: 100%|██████████| 2413/2413 [2:03:39<00:00,  3.07s/it]

Data saved to /content/drive/MyDrive/DA713/wayback_snapshots_with_timestamps_23Dec2024_business.csv



