# Process all the sample files

## Find the page content.

## Import modules

In [4]:
#!pip install beautifulsoup4

# Install the parsers
#!pip install html5lib
#!pip install lxml

from bs4 import BeautifulSoup
from bs4 import Comment, Tag, NavigableString

In [5]:
# import required modules for processing files
import os
from os import path

## Process the contents of the HTML

### Turn a file into a bs4 object

In [6]:
def process_markup(filemane: str):
    """Parse an HTML file into a BeautifulSoup tree.

    Args:
        filemane: Path of the HTML file to read. The file is opened in text
            mode with the platform's default encoding.

    Returns:
        The parsed document, built with Python's built-in ``html.parser``.
    """
    with open(filemane) as markup_file:
        return BeautifulSoup(markup_file, 'html.parser')

## Utility functions

### Update an attribute

In [7]:
def update_id(content: Tag, attribute: str, new_value: str):
    """Set one attribute on a tag, creating it if it does not exist yet.

    Despite the name, this works for any attribute, not only ``id``. The tag
    is modified in place and nothing is returned.

    Args:
        content: Tag whose ``attrs`` mapping is updated.
        attribute: Name of the attribute to set.
        new_value: Value to store under that attribute.
    """
    content.attrs[attribute] = new_value

### Keep only whitelisted CSS on elements

This will be expanded to take in a JSON file or similar

In [8]:
def whitelist_css(content: Tag, whitelist: dict):
    """Strip every CSS class that is not whitelisted for its element type.

    Only elements matched by ``content.select('[class]')`` are visited.

    Args:
        content: Root of the markup to clean; modified in place.
        whitelist: Maps a tag name to the class names that tag may keep.
            Tags whose name is not a key lose all of their classes.

    Returns:
        The same ``content`` object, for convenient chaining.
    """
    for element in content.select('[class]'):
        permitted = whitelist.get(element.name, ())
        kept = [name for name in element.attrs['class'] if name in permitted]
        if kept:
            element.attrs['class'] = kept
        else:
            # Nothing survived: drop the attribute rather than leave it empty.
            del element.attrs['class']
    return content

### Transform the accordion markup
Flatten the old accordion markup

In [9]:
def transform_accordian(old_accordian: Tag):
    """Flatten an old accordion container in place.

    A marker comment is inserted, every ``<h3>`` has its attributes replaced
    by a single placeholder class, and every nested ``<div>`` is unwrapped so
    that its children take its place.

    Args:
        old_accordian: The accordion element to rewrite; modified in place.
    """
    # Leave a marker in the output so reformatted accordions are easy to spot.
    old_accordian.insert(1, Comment('Reformatted accordian'))

    # TODO: some accordions contain a ``button.accordion__trigger``; it is not
    # handled yet.

    for heading in old_accordian.find_all('h3'):
        # Replaces ALL attributes of the heading, not just ``class``. A fresh
        # dict and list are built per heading so no state is shared.
        heading.attrs = {'class': ['some_new_class']}

    for wrapper in old_accordian.find_all('div'):
        wrapper.unwrap()

### Define a function to find the page content from the soup

This function takes a BeautifulSoup object and finds the page content `div` by its `id` attribute. For some reason this seems to need to be a two-step process: first locate the first `<h1>`, then search that heading's parent for the content `div`.

In [10]:
def find_page_content(soup: BeautifulSoup):
    """Locate the main content ``div`` of a page and give it a new id.

    The search is anchored on the first ``<h1>``: the content ``div`` is
    looked up by its id inside the heading's parent element.

    Args:
        soup: Parsed page to search.

    Returns:
        The content ``Tag`` with its ``id`` rewritten to ``'new_id'``, or the
        string ``"Missing"`` when the page has no ``<h1>`` or the content
        ``div`` cannot be found.
    """
    # Check if there is an H1 - if so grab the first one and then find the
    # content div.
    heading = soup.h1
    if not isinstance(heading, Tag):
        return "Missing"

    content = heading.parent.find(
        'div',
        {'id': 'ctl00_PlaceHolderMain_ctl03__ControlWrapper_RichHtmlField'},
    )
    if content is None:
        # Without this guard update_id() would fail on ``None.attrs``; report
        # the page the same way as one that has no heading at all.
        return "Missing"

    update_id(content, 'id', 'new_id')
    return content

## Loop through the directory and process the samples

### Master function to call smaller functions

Whitelist of CSS

In [11]:
# CSS classes that survive the clean-up, keyed by tag name. Any class not
# listed for its tag is removed by whitelist_css().
classes_to_keep = {
    'div': ['text-block', 'text'],
    'ol': ['list', 'list-step'],
}

## Clean up function to process the old markup

In [12]:
def get_new_html(soup):
    """Run the full clean-up pipeline on a parsed page.

    The page content is located, every ``div.accordion`` inside it is
    flattened, and finally all non-whitelisted CSS classes are removed.

    Args:
        soup: Parsed page (a BeautifulSoup object).

    Returns:
        The cleaned content ``Tag``.

    Raises:
        ValueError: If the page content could not be located.
    """
    result = find_page_content(soup)
    if not isinstance(result, Tag):
        # find_page_content() signals failure with a plain string; fail with a
        # clear message instead of an AttributeError on ``str.find_all``.
        raise ValueError(f'page content not found: {result!r}')

    for acc in result.find_all('div', class_='accordion'):
        transform_accordian(acc)

    return whitelist_css(result, classes_to_keep)

## Process the sample(s)

Process the sample files and save a copy in the cleaned_pages folder

In [13]:
# Minimal HTML5 page skeleton used for generating sample pages: clean_it()
# parses this string and inserts the cleaned content into its <body>.

new_file = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    
</body>
</html>
'''

## Functions to process each of the files in a list of filenames

### Clean and save the files

In [101]:
from bs4 import UnicodeDammit
from pathlib import Path
 
# iterate over files and process them through the clean up functions
def clean_it(ugly_file: Path, ugly_file_directory: str, pretty_file_directory: str):
    """Clean one HTML file and save the result under the destination root.

    The file is decoded, run through get_new_html(), wrapped in the
    ``new_file`` page skeleton and written to the same relative location
    below ``pretty_file_directory``. Existing destination files are never
    overwritten. Progress and failures are reported with ``print``; a failure
    on one file does not raise, so a batch run can continue.

    Args:
        ugly_file: HTML file to clean.
        ugly_file_directory: Root directory of the source files; ``ugly_file``
            must live below it.
        pretty_file_directory: Root directory that receives the cleaned copy.
    """
    source_path = ugly_file.resolve()

    print('--------------------------------')
    print('source:', source_path)

    try:
        # Detect the encoding from the first KiB of the raw bytes.
        with open(source_path, 'rb') as file:
            content = file.read(1024)
        # NOTE(review): original_encoding is presumably None when detection
        # fails, in which case read_text/write_text use the platform default.
        encoding = UnicodeDammit(content).original_encoding

        # load file into a string
        ugly_soup_text = ugly_file.read_text(encoding=encoding)

        # convert to a beautiful soup object
        ugly_soup = BeautifulSoup(ugly_soup_text, 'html.parser')

        # feed through the clean up process
        pretty_soup = get_new_html(ugly_soup)

        # for logging load the HTML wrapper
        wrapper_soup = BeautifulSoup(new_file, 'html.parser')
        wrapper_soup.body.insert(1, pretty_soup)

        # Mirror the file's location relative to the source root. Replacing
        # the substring 'test' anywhere in the path would also rewrite
        # unrelated directory or file names that happen to contain it.
        relative_path = source_path.relative_to(Path(ugly_file_directory).resolve())
        pretty_file = Path(pretty_file_directory) / relative_path
        print(pretty_file.resolve())

        # create the file's parents if they don't exist
        pretty_file.parent.mkdir(parents=True, exist_ok=True)

        # save the file
        if not pretty_file.exists():
            print('------ Saving file ---------')
            pretty_file.write_text(wrapper_soup.prettify(), encoding=encoding)
        else:
            print('--------- File already exists -------------')

    except Exception as ex:
        # Report against the source path: the destination may not have been
        # computed yet when the failure happened.
        print(f'something went wrong processing file {source_path}', ex)
        print('--------------')


### Iterate over the folder and process the files in it.

In [None]:
# Folder holding the HTML source files, relative to the working directory.
html_source_root_dir = Path('..', 'test')

# Folder that receives the cleaned copies, relative to the working directory.
html_destination_root_dir = Path('..', 'cleaned_pages')

# Walk the source tree recursively and clean every HTML file found.
for src_html_file in html_source_root_dir.glob("**/*.html"):
    clean_it(
        ugly_file=src_html_file,
        ugly_file_directory=str(html_source_root_dir.resolve()),
        pretty_file_directory=str(html_destination_root_dir.resolve()),
    )
    