In [2]:
import os
import zipfile
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

- country (ctry), 
- lower layer super output area (lsoa), 
- lower tier local authority (ltla), 
- middle layer super output area (msoa), 
- output area (oa), 
- region (rgn), 
- upper tier local authority (utla)

In [7]:
def find_download_links(url):
    """
    Scrape a web page and return direct download links for zip files.

    The page at ``url`` is fetched and every ``<a>`` tag whose ``href`` contains
    ``.zip`` is collected. Each href is resolved to an absolute URL and only URLs
    under ``https://www.nomisweb.co.uk/output/census/2021/`` are returned.

    :param url: Address of the page to scrape.
    :return: List of absolute zip-file URLs, in the order they appear on the page.
    :raises requests.HTTPError: If the page request returns an HTTP error status.
    """
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only links below this prefix are treated as census bulk downloads
    expected_prefix = 'https://www.nomisweb.co.uk/output/census/2021/'

    # Find all <a> tags with href attribute containing the zip file path
    download_links = soup.find_all('a', href=lambda href: href and '.zip' in href)

    # Resolve each href against the page that was actually fetched. urljoin
    # leaves absolute hrefs untouched and resolves relative ones, whereas plain
    # string concatenation produced a malformed URL for absolute hrefs.
    full_urls = [urljoin(response.url, link['href']) for link in download_links]

    # Filter and return URLs that match the expected format
    return [full_url for full_url in full_urls if full_url.startswith(expected_prefix)]


def download_and_extract_zip(url, extract_to_folder):
    """
    Download a zip file and extract only its output-area level CSV files.

    The archive is saved into ``extract_to_folder`` under the last path segment
    of ``url``. Members whose name ends in ``.csv`` and contains ``-oa`` or
    ``-lsoa`` (both checks case-insensitive) are extracted into the same folder.
    The downloaded zip file is deleted afterwards, whether or not extraction
    succeeded.

    :param url: Direct URL of the zip file.
    :param extract_to_folder: Existing folder to download into and extract to.
    :return: None
    :raises requests.HTTPError: If the download returns an HTTP error status.
    :raises ValueError: If the archive contains no members.
    :raises zipfile.BadZipFile: If the downloaded content is not a zip archive.
    """
    response = requests.get(url)
    # Fail fast on HTTP error statuses so that an error page is never written to
    # disk and later mis-reported as "File is not a zip file".
    response.raise_for_status()

    zip_filename = os.path.join(extract_to_folder, url.split('/')[-1])

    try:
        # Writing happens inside the try so a partially written file is cleaned up too
        with open(zip_filename, 'wb') as file:
            file.write(response.content)

        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            # List all file names in the zip file
            all_files = zip_ref.namelist()

            # Check if the zip file is empty
            if not all_files:
                raise ValueError(f"Zip file {zip_filename} is empty and can't be extracted.")

            # Keep only CSV members at output-area ('-oa') or LSOA ('-lsoa') level
            filtered_files = []
            for member in all_files:
                lowered = member.lower()
                if lowered.endswith('.csv') and ('-oa' in lowered or '-lsoa' in lowered):
                    filtered_files.append(member)

            # Extract only the filtered files
            for member in filtered_files:
                zip_ref.extract(member, extract_to_folder)
    finally:
        # Delete the zip file after extraction or if it's empty; the existence
        # check keeps a failed open() from being masked by FileNotFoundError.
        if os.path.exists(zip_filename):
            os.remove(zip_filename)

def download_and_extract_shapefiles(url, extract_to_folder):
    """
    Download a zip file from the URL and extract its contents to the specified folder.

    This is a best-effort helper: any exception raised while downloading or
    extracting is caught and printed rather than propagated. The downloaded zip
    file is removed afterwards if it exists.

    :param url: Direct URL of the zip file.
    :param extract_to_folder: Folder to download into and extract to.
    :return: None
    """
    # Bound before the try so the cleanup in `finally` can always refer to it,
    # even when building the file name itself fails (e.g. url is None).
    zip_filename = None
    try:
        # Get the file name from the URL
        zip_filename = os.path.join(extract_to_folder, url.split('/')[-1])

        # Download the zip file
        response = requests.get(url)
        response.raise_for_status()  # will raise an exception for HTTP error codes

        # Write the downloaded content to a file
        with open(zip_filename, 'wb') as file:
            file.write(response.content)

        # Extract the zip file
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(extract_to_folder)

        print(f"Extracted {zip_filename} to {extract_to_folder}")

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        # Delete the zip file after extraction
        if zip_filename is not None and os.path.exists(zip_filename):
            os.remove(zip_filename)
            print(f"Deleted {zip_filename}")

In [8]:
# Landing page that lists the census 2021 bulk zip downloads
scrape_url = 'https://www.nomisweb.co.uk/sources/census_2021_bulk'

# Everything is downloaded into and unpacked under ./data (created if missing)
extract_to_folder = os.path.join(os.getcwd(), "data")
os.makedirs(extract_to_folder, exist_ok=True)

# Step 1: collect the zip links; step 2: fetch each archive in turn.
# A failure on one archive is reported and does not stop the remaining ones.
try:
    zip_file_urls = find_download_links(scrape_url)
except Exception as scrape_error:
    print(f"Error scraping {scrape_url}: {scrape_error}")
else:
    for url in zip_file_urls:
        try:
            download_and_extract_zip(url, extract_to_folder)
        except Exception as download_error:
            print(f"Error downloading or extracting {url}: {download_error}")

print("Download and extraction completed.")

Error downloading or extracting https://www.nomisweb.co.uk/output/census/2021/census2021-ts079.zip: File is not a zip file
Error downloading or extracting https://www.nomisweb.co.uk/output/census/2021/census2021-ts079-extra.zip: File is not a zip file
Error downloading or extracting https://www.nomisweb.co.uk/output/census/2021/census2021-ts070-extra.zip: File is not a zip file
Error downloading or extracting https://www.nomisweb.co.uk/output/census/2021/census2021-ts077-extra.zip: File is not a zip file
Error downloading or extracting https://www.nomisweb.co.uk/output/census/2021/census2021-ts078-extra.zip: File is not a zip file
Error downloading or extracting https://www.nomisweb.co.uk/output/census/2021/census2021-ts037asp-extra.zip: File is not a zip file
Error downloading or extracting https://www.nomisweb.co.uk/output/census/2021/census2021-ts038asp-extra.zip: File is not a zip file
Error downloading or extracting https://www.nomisweb.co.uk/output/census/2021/census2021-ts039asp

In [9]:
# Zip archive of London statistical GIS boundary files
shapes_url = 'https://data.london.gov.uk/download/statistical-gis-boundary-files-london/9ba8c833-6370-4b11-abdc-314aa020d5e0/statistical-gis-boundaries-london.zip'

# Unpack the boundaries into the same data folder as the census CSVs
try:
    download_and_extract_shapefiles(shapes_url, extract_to_folder)
except Exception as err:
    print(f"Error downloading or extracting {shapes_url}: {err}")

Extracted c:\Users\Ivy\IvyProjects\census_data_scraping_analysis\data\statistical-gis-boundaries-london.zip to c:\Users\Ivy\IvyProjects\census_data_scraping_analysis\data
Deleted c:\Users\Ivy\IvyProjects\census_data_scraping_analysis\data\statistical-gis-boundaries-london.zip


In [10]:
def check_spatial_resolution_consistency(folder_path, spatial_column):
    """
    Check spatial resolution consistency for all CSV files in the given folder.

    Every file directly inside ``folder_path`` whose name ends in ``.csv`` is
    read with pandas and the number of distinct values in ``spatial_column`` is
    recorded. The first file (in alphabetical order) that has the column is the
    reference; any file whose count differs from it is reported. Files that
    cannot be read, or that lack the column, are reported as well. The findings
    are printed.

    Note that only the *number* of distinct values is compared, not the values
    themselves.

    :param folder_path: Path to the folder containing CSV files.
    :param spatial_column: Name of the column containing spatial resolution data.
    :return: None
    """
    inconsistencies = []
    resolutions = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the choice
    # of reference file (and the order of the report) reproducible across runs.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file)
        try:
            # Read the CSV file
            df = pd.read_csv(file_path)

            # Check if the spatial column exists
            if spatial_column in df.columns:
                # Store the count of unique values for each file
                resolutions[file] = len(df[spatial_column].unique())
            else:
                inconsistencies.append(f"{file}: Spatial column '{spatial_column}' not found.")

        except Exception as e:
            # Unreadable files (e.g. empty ones) are reported, not fatal
            inconsistencies.append(f"{file}: Error reading file - {e}")

    # Compare resolutions across files
    if len(resolutions) > 1:
        ref_resolution = next(iter(resolutions.values()))  # Reference resolution from the first file
        for file, res in resolutions.items():
            if res != ref_resolution:
                inconsistencies.append(f"{file}: Spatial resolution inconsistency (Expected: {ref_resolution}, Found: {res})")

    if inconsistencies:
        print("Inconsistencies found:")
        for issue in inconsistencies:
            print(issue)
    else:
        print("All files are consistent in spatial resolution.")

# Example usage: run the check over every CSV in the extraction folder,
# comparing the number of distinct values in the 'geography code' column.
check_spatial_resolution_consistency(folder_path=extract_to_folder, spatial_column='geography code')

Inconsistencies found:
census2021-ts010-lsoa.csv: Error reading file - No columns to parse from file
census2021-ts010-oa.csv: Error reading file - No columns to parse from file
census2021-ts001-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
census2021-ts002-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
census2021-ts003-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
census2021-ts004-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
census2021-ts005-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
census2021-ts006-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
census2021-ts007a-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
census2021-ts008-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
census2021-ts011-oa.csv: Spatial resolution inconsistency (Expected: 35672, Found: 188880)
cen