<a href="https://colab.research.google.com/github/hxiufan/Capstone713/blob/main/Multi_page_Web_Scrapping_05Dec2024_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm

In [None]:
import os


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Function to query Wayback Machine API and get all snapshots within a date range
def get_wayback_snapshots(url, start_date, end_date):
    """
    Query the Wayback Machine CDX API for all snapshots of a URL within a date range.

    Parameters:
        url (str): The URL to query.
        start_date (str): Start date in YYYYMMDD format.
        end_date (str): End date in YYYYMMDD format.

    Returns:
        list: A list of Wayback Machine URLs, one per snapshot row returned by
        the API. An empty list is returned when the request fails or the
        response body is not valid JSON.
    """
    wayback_api_url = "http://web.archive.org/cdx/search/cdx"
    # Let requests build the query string so that a target URL containing
    # '&', '?' or '#' is percent-encoded instead of corrupting the query.
    query = {
        "url": url,
        "output": "json",
        "collapse": "digest",
        "from": start_date,
        "to": end_date,
    }
    try:
        response = requests.get(wayback_api_url, params=query, timeout=30)
        response.raise_for_status()
        rows = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"Error fetching Wayback Machine snapshots: {e}")
        return []

    # The first row is the header; row[1] and row[2] are used as the
    # timestamp and original-URL parts of the replay URL. Rows that are too
    # short to index are skipped rather than raising IndexError.
    return [
        f"https://web.archive.org/web/{row[1]}/{row[2]}"
        for row in rows[1:]
        if len(row) >= 3
    ]

# Function to get SEO features and additional meta-data from a webpage
def _meta_content(soup, name, fallback):
    """Return the stripped ``content`` of ``<meta name=...>``, or *fallback* if absent."""
    tag = soup.find('meta', attrs={'name': name})
    if tag is None:
        return fallback
    # A <meta> tag without a content attribute is treated like a missing tag
    # instead of raising KeyError and discarding the whole page's data.
    content = tag.get('content')
    if content is None:
        return fallback
    return content.strip()


def get_seo_features(url, session):
    """
    Extract SEO features and additional metadata from a webpage.

    Parameters:
        url (str): The URL of the webpage.
        session (requests.Session): The session for making requests.

    Returns:
        dict: A dictionary of SEO data. On a network or parsing failure the
        error is printed and a placeholder row is returned in which every
        text field is 'Error' and 'word_count' is 0.
    """
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        title_tag = soup.find('title')
        description = _meta_content(soup, 'description', 'No description found')

        return {
            'page_url': url,
            'title': title_tag.text.strip() if title_tag is not None else 'No title found',
            # 'meta_description' and 'description' carry the same value; both
            # keys are kept so the CSV columns stay unchanged.
            'meta_description': description,
            'h1_tags': [h1.text.strip() for h1 in soup.find_all('h1')],
            # Join text nodes with a space: with an empty separator, words in
            # adjacent tags are glued together and the count is too low.
            'word_count': len(soup.get_text(separator=' ', strip=True).split()),
            'keywords': _meta_content(soup, 'keywords', 'No keywords found'),
            'description': description,
            'DC.Description': _meta_content(soup, 'DC.Description', 'No DC.Description found'),
            'DC.Language': _meta_content(soup, 'DC.Language', 'No DC.Language found'),
            'DC.Date.created': _meta_content(soup, 'DC.Date.created', 'No DC.Date.created found'),
            'DC.Date.modified': _meta_content(soup, 'DC.Date.modified', 'No DC.Date.modified found'),
            'DC.Date.valid': _meta_content(soup, 'DC.Date.valid', 'No DC.Date.valid found')
        }
    except requests.exceptions.RequestException as e:
        print(f"Network error while scraping {url}: {e}")
    except Exception as e:
        # Deliberate best-effort: one unparsable snapshot must not abort a
        # long scraping run, so the error is reported and a placeholder row
        # is returned below.
        print(f"Parsing error for {url}: {e}")
    return {
        'page_url': url,
        'title': 'Error',
        'meta_description': 'Error',
        'h1_tags': 'Error',
        'word_count': 0,
        'keywords': 'Error',
        'description': 'Error',
        'DC.Description': 'Error',
        'DC.Language': 'Error',
        'DC.Date.created': 'Error',
        'DC.Date.modified': 'Error',
        'DC.Date.valid': 'Error'
    }

# Function to save data to CSV
def save_to_csv(data, filename):
    """
    Write a collection of records to a UTF-8 encoded CSV file.

    Parameters:
        data (list): List of dictionaries; each dictionary becomes one row.
        filename (str): Path of the CSV file to write.
    """
    frame = pd.DataFrame(data)
    # index=False keeps the DataFrame's row index out of the file.
    frame.to_csv(filename, encoding='utf-8', index=False)
    print(f"Data saved to {filename}")

# Main function to scrape all snapshots from Wayback Machine
def scrape_wayback_pages(original_url, start_date, end_date, output_filename="/content/drive/MyDrive/DA713/wayback_filtered_snapshots_data.csv"):
    """
    Scrape all snapshots from Wayback Machine for a given URL and save SEO data to a CSV.

    Parameters:
        original_url (str): The URL to scrape snapshots for.
        start_date (str): Start date in YYYYMMDD format.
        end_date (str): End date in YYYYMMDD format.
        output_filename (str): The name of the output CSV file.
    """
    snapshot_urls = get_wayback_snapshots(original_url, start_date, end_date)
    print(f"Found {len(snapshot_urls)} snapshots.")

    all_data = []
    # One session is reused for every snapshot; the with-block closes it and
    # releases its pooled connections once the loop finishes or an error
    # propagates.
    with requests.Session() as session:
        for snapshot_url in tqdm(snapshot_urls, desc="Scraping snapshots"):
            all_data.append(get_seo_features(snapshot_url, session))
            time.sleep(1)  # Be polite to the server

    save_to_csv(all_data, output_filename)

# Example usage: scrape every archived snapshot of the StudyLink home page in
# the date range below and write the extracted metadata to Google Drive.
original_url = "https://www.studylink.govt.nz/"
start_date = "20020524"  # Start date: 24 May 2002 (YYYYMMDD)
end_date = "20241130"    # End date: 30 Nov 2024 (YYYYMMDD)
# NOTE(review): the output filename says "2019_2023" but the range requested
# above is 2002-2024 — confirm which one is intended.
scrape_wayback_pages(original_url, start_date, end_date, output_filename="/content/drive/MyDrive/DA713/wayback_snapshots_2019_2023_with_meta.csv")


Found 634 snapshots.


Scraping snapshots:   4%|▍         | 28/634 [01:11<26:23,  2.61s/it]

Network error while scraping https://web.archive.org/web/20050816210600/http://www.studylink.govt.nz:80/: 400 Client Error: Bad request for url: https://web.archive.org/web/20050816210600/http://www.studylink.govt.nz:80/


Scraping snapshots:  53%|█████▎    | 338/634 [14:46<11:12,  2.27s/it]

Network error while scraping https://web.archive.org/web/20160707213313/http://www.studylink.govt.nz:80/: 400 Client Error: Bad Request for url: https://web.archive.org/web/20160707213313/http://www.studylink.govt.nz:80/


Scraping snapshots:  56%|█████▌    | 353/634 [15:23<11:33,  2.47s/it]

Network error while scraping https://web.archive.org/web/20161002040608/http://studylink.govt.nz/: HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=30)


Scraping snapshots: 100%|██████████| 634/634 [28:18<00:00,  2.68s/it]


Data saved to /content/drive/MyDrive/DA713/wayback_snapshots_2019_2023_with_meta.csv
