In [25]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from collections import deque
import csv

def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_internal_links(url, soup):
    internal_links = set()
    domain_name = urlparse(url).scheme + "://" + urlparse(url).netloc
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            continue
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            continue
        if href in internal_links:
            continue
        if domain_name not in href:
            continue
        internal_links.add(href)
    return internal_links

def download_page(url, folder):
    if '/cy' in url:
        return None
    parsed_url = urlparse(url)
    page_name = parsed_url.path

    # Get the file extension
    file_extension = os.path.splitext(page_name)[1]

    # Skip .zip and .doc files
    if file_extension in ['.zip', '.doc', '.docx', '.xlsx']:
        return None
    response = requests.get(url)
    if response.status_code == 200:
        page_name = urlparse(url).path
        if page_name == "":
            page_name = "index.txt"
        else:
            # Extract the file name from the URL, and if it doesn't have an extension, append .html
            page_name = page_name.strip("/").replace("/", "_")
            if '.' not in page_name:
                page_name += ".txt"
        
        page_path = os.path.join(folder, page_name)
        
        # If the file is an HTML file, parse it with BeautifulSoup and save it as a formatted string.
        if page_name.endswith(".txt"):
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(separator=' ', strip=True)
            print(f"Downloaded and stripped {page_name}")
            try:
                with open(page_path, "w", encoding="utf-8") as f:
                    f.write(text)
            except OSError:
                # If the filename is too long, truncate it and save again
                short_page_path = os.path.join(folder, page_name[:255])
                with open(short_page_path, "w", encoding="utf-8") as f:
                    f.write(text)
            return soup
        else:
            # For non-HTML files, write the response content as bytes.
            with open(page_path, "wb") as f:
                f.write(response.content)
    return None

def scrape_website(start_url, folder):
    os.makedirs(folder, exist_ok=True)
    visited_links = set()
    try:
        with open('downloaded_pages.csv', 'r', newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                visited_links.add(row[0])
    except FileNotFoundError:
        pass
    links_to_visit = deque()
    links_to_visit.append(start_url)
    while links_to_visit:
        url = links_to_visit.popleft()
        if url in visited_links:
            continue
        soup = download_page(url, folder)
        if soup is not None:
            visited_links.add(url)
            internal_links = get_internal_links(url, soup)
            for link in internal_links:
                if link not in visited_links:
                    links_to_visit.append(link)
        if len(visited_links) % 50 == 0:
            with open('downloaded_pages.csv', 'w', newline='') as f:
                writer = csv.writer(f)
                for link in visited_links:
                    writer.writerow([link])
    print("Scraping completed!")

# Replace with your URL and folder name
scrape_website("https://www.electoralcommission.org.uk/sites/default/files/2023-10/Accredited%20Observers%202023-10-02", "downloaded_pages")



Scraping completed!


In [None]:
how-maintain-your-registered-political-party/removing-your-party-register

In [12]:
# Compare old and new sets of docs for discrepancies
import os
import csv
from itertools import zip_longest

# Define the directories
new_dir = "/Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages"
old_dir = "/Users/cardigan/Documents/GitHub/old_electoral_commision_chatbot/docs"

# Get the list of file names in each directory, without extensions
new_files = {os.path.splitext(f)[0] for f in os.listdir(new_dir)}
old_files = {os.path.splitext(f)[0] for f in os.listdir(old_dir)}

# Find the files that are in new_dir but not in old_dir, and vice versa
in_new_not_old = new_files - old_files
in_old_not_new = old_files - new_files

# Write the results to a CSV file
with open('file_comparison.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['in new but not old', 'in old but not new'])
    for new_file, old_file in zip_longest(in_new_not_old, in_old_not_new):
        writer.writerow([new_file, old_file])

In [30]:
# Strip header and footer, store as txt
import os
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

def get_files(folder):
    return [f for f in os.listdir(folder) if f.endswith('.html') or f.endswith('.txt')]

def extract_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

def remove_similar_structure(texts):
    if not texts:
        return []
    lines = [text.splitlines() for text in texts]
    common_start = 0
    common_end = 0

    for i in range(min(len(lines[0]), min(len(text) for text in lines))):
        if all(lines[0][i] == text[i] for text in lines):
            common_start = i + 1
        else:
            break
    
    for i in range(min(len(lines[0]), min(len(text) for text in lines))):
        if all(lines[0][-(i + 1)] == text[-(i + 1)] for text in lines):
            common_end = i + 1
        else:
            break

    return ['\n'.join(text[common_start: len(text) - common_end]) for text in lines]

def process_folder(folder):
    # Define the strings to search for
    start_string = "Part of our role includes conducting research and gathering data, and publishing reports and making recommendations about elections that take place across the UK. Our reports and data on past elections and referendums Electoral registration research Public attitudes Modernising voting: flexible voting feasibility studies Pilot evaluations Electoral fraud data Referendum question research Political registration and regulation Political registration and regulation We are responsible for registering political parties, publishing political finance data, and taking action if we have reason to suspect political finance law has been broken. Political party registration Financial reporting Our enforcement work Imprints News and views News and views Get the latest news from us, and read media handbooks for elections. We also hold consultations, and publish our responses to consultations from others. Media centre Elections Act Our priorities for reforming elections Parliamentary briefings Key correspondence Our consultations Our responses to consultations Resources Resources Use our resources to share information about voting and democracy. Take a look at our resources for young people. Download lesson plans and activities for use in the classroom or youth groups. Share our assets and get the message out. Resources for young people Resources for Educators Democratic engagement resources Back"
    end_string1 = "Footer Content"
    end_string2 = "Was this article helpful?"
    
    files = get_files(folder)

    # Process each file
    for file in files:
        file_path = os.path.join(folder, file)
        text = extract_text(file_path)
        
        # Find the start and end indices of the text to keep
        start_index = text.find(start_string)
        end_index1 = text.find(end_string1)
        end_index2 = text.find(end_string2)
        
        # If the start string was found, remove everything before it
        if start_index != -1:
            text = text[start_index + len(start_string):]
        
        # If the end strings were found, remove everything after them
        if end_index1 != -1:
            text = text[:end_index1]
        if end_index2 != -1:
            text = text[:end_index2]
        
        # Save the processed text as a .txt file
        txt_file = os.path.splitext(file)[0] + '.txt'
        txt_path = os.path.join(folder, txt_file)
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(text.strip())

# Replace 'downloaded_pages' with the folder where your HTML pages are stored
folder = 'downloaded_pages'
process_folder(folder)

In [13]:
import os
import pandas as pd
import io

def get_csv_files(folder):
    return [f for f in os.listdir(folder) if f.endswith('.csv')]

folder = 'downloaded_pages'
files = get_csv_files(folder)

# Process each file
for file in files:
    file_path = os.path.join(folder, file)
    
    # Get the file size in bytes
    file_size_bytes = os.path.getsize(file_path)
    
    # Convert the file size to MB
    file_size_mb = file_size_bytes / (1024 * 1024)
    
    # Only process the file if it's under 3MB
    if file_size_mb <= 3:
        try:
            # Read the file in binary mode, decode each line and handle errors
            with open(file_path, 'rb') as f:
                lines = f.readlines()
            decoded_lines = [line.decode('utf-8', errors='ignore') for line in lines]
            cleaned_data = io.StringIO('\n'.join(decoded_lines))

            # Read the cleaned data using pandas
            df = pd.read_csv(cleaned_data, on_bad_lines = 'warn')

            # Convert the DataFrame to Markdown format
            markdown_text = df.to_markdown()

            # Write to a text file
            txt_file = os.path.splitext(file)[0] + '.txt'
            txt_path = os.path.join(folder, txt_file)
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(markdown_text.strip())
        except pd.errors.ParserError as e:
            print(f"Failed to parse {file}: {e}")

Skipping line 33: expected 50 fields, saw 51
Skipping line 35: expected 50 fields, saw 51
Skipping line 57: expected 50 fields, saw 51
Skipping line 59: expected 50 fields, saw 51
Skipping line 107: expected 50 fields, saw 51
Skipping line 171: expected 50 fields, saw 51
Skipping line 177: expected 50 fields, saw 51
Skipping line 179: expected 50 fields, saw 51
Skipping line 189: expected 50 fields, saw 51
Skipping line 237: expected 50 fields, saw 51
Skipping line 249: expected 50 fields, saw 51
Skipping line 251: expected 50 fields, saw 51
Skipping line 257: expected 50 fields, saw 51
Skipping line 377: expected 50 fields, saw 51
Skipping line 387: expected 50 fields, saw 51
Skipping line 389: expected 50 fields, saw 51
Skipping line 681: expected 50 fields, saw 51
Skipping line 683: expected 50 fields, saw 51
Skipping line 713: expected 50 fields, saw 51
Skipping line 717: expected 50 fields, saw 51
Skipping line 719: expected 50 fields, saw 51
Skipping line 747: expected 50 fields,

In [None]:
# import textwrap

# def save_text_as_pdf(file_path, text):
#     c = canvas.Canvas(file_path, pagesize=letter)
#     width, height = letter
#     margin = 50  # Margin from each side
#     font_size = 12
#     line_spacing = 14
#     c.setFont("Helvetica", font_size)
    
#     # Ensure width is an integer
#     wrapper = textwrap.TextWrapper(width=int((width - 2 * margin) // 6))  
#     lines = []
#     for line in text.splitlines():
#         lines.extend(wrapper.wrap(line))
    
#     y_position = height - margin  
#     for line in lines:
#         if y_position < margin:  
#             c.showPage()
#             y_position = height - margin
#         c.drawString(margin, y_position, line)
#         y_position -= line_spacing
    
#     c.save()

# def txt_to_pdf():
#     # Save processed text as PDF
#     for file, text in zip(html_files, processed_texts):
#         # Creating the file name
#         file_name = f"{os.path.splitext(file)[0]}_processed.pdf"
        
#         # Ensure the file name is not too long
#         max_length = 255  # typical maximum filename length
#         if len(file_name) > max_length:
#             # Truncate the file name to the maximum length
#             file_name = file_name[:max_length]
        
#         pdf_file = os.path.join(stripped_folder, file_name)
#         save_text_as_pdf(pdf_file, text)


In [16]:
###Delete pages with specific substring in filename

import os
import glob

directory = "/Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages"  # replace with the path to your directory
substring = "20W"  # replace with the substring to search for

# Use glob to list all files in the directory that contain the substring
for file_path in glob.glob(os.path.join(directory, f'*{substring}*')):
    try:
        os.remove(file_path)
        print(f"File {file_path} has been deleted")
    except Exception as e:
        print(f"Error occurred: {e}")

File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2023-05_PCC%20Part%202b%20-%20Standing%20as%20a%20party%20candidate%20W.pdf has been deleted
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-09_UKPGE%20Part%202a%20Standing%20as%20an%20independent%20candidate%20W.pdf has been deleted
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2022-02_Part%201%20Can%20you%20stand%20for%20election%20Community%20LGW%20W_1.pdf has been deleted
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-02_Register%20request%20form%20generic%20W.doc has been deleted
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2023-06_Templed%20Datganiad%20i%27r%20Wasg%20i%20Gymru%20-%20Canfasiad%20blynyddol%202023.docx has been deleted
File /Use

In [19]:
###Delete posters using filesize to num pages ratio

import os
import glob
import fitz  # PyMuPDF

directory = "/Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages"  # replace with the path to your directory
max_size_per_page_kb = 500  # in KB

# Use glob to list all PDF files in the directory
for file_path in glob.glob(os.path.join(directory, '*.pdf')):
    try:
        # Get the file size in KB
        file_size_kb = os.path.getsize(file_path) / 1024.0  # convert bytes to KB
        
        # Open the PDF file and get the number of pages
        doc = fitz.open(file_path)
        num_pages = doc.page_count
        
        # Calculate the size per page
        size_per_page_kb = file_size_kb / num_pages
        
        # If the size per page is more than the allowed maximum, delete the file
        if size_per_page_kb > max_size_per_page_kb:
            os.remove(file_path)
            print(f"File {file_path} has been deleted")
        
        # Close the document
        doc.close()

    except Exception as e:
        print(f"Error occurred while processing {file_path}: {e}")


File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2019-09_A4-polling-station-poster-Urdu.pdf has been deleted
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_pdf_file_Making-Your-Mark-Example-Postal-Voting-Quick-Start-Guide-First-Past-The-Post-Election-GB-English-A4.pdf has been deleted
Error occurred while processing /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/__data_assets_pdf_file_0015_107511_form-co1-ri.pdf: cannot open broken document
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2019-09_Your-vote-is-yours-alone-leaflet.pdf has been deleted
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2019-09_A4-postal-voting-poster-Bengali.pdf has been deleted
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pa

Error occurred while processing /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/__data_assets_pdf_file_0019_214228_Parliamentary-By-Election-Part-1-Can-you-stand-for-election.pdf: cannot open broken document
Error occurred while processing /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/__data_assets_pdf_file_0013_214231_Parliamentary-By-Election-Part-4-The-campaign.pdf: cannot open broken document
File /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_pdf_file_Making-Your-Mark-Example-Postal-Voting-Statement-GB-Bilingual-A4.pdf has been deleted


In [20]:
###Delete broken files

import os
import glob
import fitz  # PyMuPDF

directory = "/Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages"  # replace with the path to your directory

# Use glob to list all files in the directory
for file_path in glob.glob(os.path.join(directory, '*')):
    try:
        # Try to open the document, if it is a PDF
        if file_path.endswith('.pdf'):
            doc = fitz.open(file_path)
            doc.close()
    except Exception as e:
        print(f"Error occurred while processing {file_path}: {e}. Deleting the file.")
        os.remove(file_path)

Error occurred while processing /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_pdf_file_Making-Your-Mark-Example-Booth-Notice-Bilingual-Colour-A2.pdf: cannot open empty document. Deleting the file.
Error occurred while processing /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_pdf_file_Notional-Spending-Factsheet-2019.pdf: cannot open broken document. Deleting the file.


In [42]:
import os
import subprocess
import glob
import shutil

def convert_to_pdf(input_directory):
    # List all .doc and .docx files in the input directory
    doc_files = glob.glob(os.path.join(input_directory, '*.doc*'))
    
    # Save current working directory
    current_working_directory = os.getcwd()
    
    # Change working directory to input_directory
    os.chdir(input_directory)
    
    for doc_file in doc_files:
        # Construct the PDF file path in the input directory
        base_name = os.path.basename(os.path.splitext(doc_file)[0])
        pdf_file = f"{base_name}.pdf"
        
        try:
            # Run unoconv to convert the doc/docx file to PDF
            # Without specifying output directory, the file should be created in the current working directory
            subprocess.run(['unoconv', '-f', 'pdf', os.path.basename(doc_file)], check=True)
            print(f"Converted {doc_file} to {pdf_file}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to convert {doc_file} to PDF: {str(e)}")
        except Exception as e:
            print(f"An error occurred: {str(e)}")
    
    # Change back to the original working directory
    os.chdir(current_working_directory)

# Use your directory path
input_directory = '/Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages'
convert_to_pdf(input_directory)


Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2023-02_LGE%20election%20timetable%204%20May%202023_0.docx to sites_default_files_2023-02_LGE%20election%20timetable%204%20May%202023_0.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2023-07_Voter%20ID%20Resource%20-%20Staff%20Guide%20Plain%20Text%20English%20-%20Anonymous%20Voters.docx to sites_default_files_2023-07_Voter%20ID%20Resource%20-%20Staff%20Guide%20Plain%20Text%20English%20-%20Anonymous%20Voters.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2022-01_List%20of%20Commonwealth%20countries%20generic.doc to sites_default_files_2022-01_List%20of%20Commonwealth%20countries%20generic.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2023-07_Voter%20ID%20Resource%20-%20Voter%20Booklet

Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2023-07_LGW%20Nominations%20pack%20updated.docx to sites_default_files_2023-07_LGW%20Nominations%20pack%20updated.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2019-10_UKPGE-Election-timetable-%2012%20December_4.doc to sites_default_files_2019-10_UKPGE-Election-timetable-%2012%20December_4.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-02_Senedd%20PCC%20Secrecy%20-%20the%20count.doc to sites_default_files_2021-02_Senedd%20PCC%20Secrecy%20-%20the%20count.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-02_Counting%20agent%20appointment%20form%20generic.doc to sites_default_files_2021-02_Counting%20agent%20appointment%20form%20generic.pdf
Converted /Users/cardigan/Documents/Git

Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2022-01_Polling%20station%20scenarios%20LGW.doc to sites_default_files_2022-01_Polling%20station%20scenarios%20LGW.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-03_Senedd%20PCC%20FAQs%20for%20frontline%20staff.doc to sites_default_files_2021-03_Senedd%20PCC%20FAQs%20for%20frontline%20staff.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-07_Planning%20considerations%20for%202021%20polls.docx to sites_default_files_2021-07_Planning%20considerations%20for%202021%20polls.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-11_Social_Media_Template_Posts.docx to sites_default_files_2021-11_Social_Media_Template_Posts.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commiss

Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_word_doc_UKP-Nomination-pack-incl-election-agent-notification-form.doc to sites_default_files_word_doc_UKP-Nomination-pack-incl-election-agent-notification-form.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_word_doc_Secrecy-requirements-the-poll-MREF.doc to sites_default_files_word_doc_Secrecy-requirements-the-poll-MREF.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-08_SP%20Nomination%20checklist%20-%20constituency.doc to sites_default_files_2021-08_SP%20Nomination%20checklist%20-%20constituency.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2022-01_Election%20timetable%20LGE%20Generic.docx to sites_default_files_2022-01_Election%20timetable%20LGE%20Generic.pdf
Converted /Users/

Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-02_SP-Notice-of-election-constituency_0.doc to sites_default_files_2021-02_SP-Notice-of-election-constituency_0.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2022-02_LGW%20election%20timetable%20Generic.docx to sites_default_files_2022-02_LGW%20election%20timetable%20Generic.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-03_Senedd%20Notice%20of%20election%20-%20region.doc to sites_default_files_2021-03_Senedd%20Notice%20of%20election%20-%20region.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/downloaded_pages/sites_default_files_2021-11_Got5_SocialMediaTemplatePosts.docx to sites_default_files_2021-11_Got5_SocialMediaTemplatePosts.pdf
Converted /Users/cardigan/Documents/GitHub/electoral_commission_chatbot/down