# Process all the sample files

## Find the page content.

## Import modules

In [1]:
#!pip install beautifulsoup4

# Install the parsers
#!pip install html5lib
#!pip install lxml

from bs4 import BeautifulSoup
from bs4 import Comment, Tag, NavigableString

In [2]:
# import required modules for processing files
import os
from os import path

## Process the contents of the HTML

### Turn a file into a bs4 object

In [3]:
def process_markup(filemane: str) :
    """Parse an HTML file into a BeautifulSoup object.

    The file is opened in binary mode so that BeautifulSoup can work out the
    document's encoding itself (byte-order mark, declared ``<meta>`` charset,
    ...) instead of decoding with the platform's default text encoding,
    which may not match the file.

    Args:
        filemane: Path of the HTML file to read.

    Returns:
        The parsed document, built with the ``html.parser`` parser.
    """
    with open(filemane, 'rb') as fp:
        return BeautifulSoup(fp, 'html.parser')

## Utility functions

### Update an attribute

In [4]:
def update_id(content: Tag, attribute: str, new_value: str):
    """Set one attribute on a tag, overwriting any existing value.

    Despite the name, this works for any attribute, not only ``id``.

    Args:
        content: Tag whose attribute dictionary is modified in place.
        attribute: Name of the attribute to set.
        new_value: Value to store for that attribute.
    """
    content.attrs[attribute] = new_value

### Keep only whitelisted CSS on elements

This will be expanded so that the whitelist can be read from a JSON file or similar.

In [5]:
def whitelist_css(content: Tag, whitelist: dict) :
    """Strip every CSS class that is not whitelisted for its tag name.

    Args:
        content: Element whose descendants carrying a ``class`` attribute
            are cleaned in place.
        whitelist: Maps a tag name to the class names allowed on that tag.
            Tags that are not listed lose all their classes.

    Returns:
        The same ``content`` object, after modification.
    """
    for element in content.select('[class]'):
        allowed = whitelist.get(element.name, ())
        kept = [name for name in element.attrs['class'] if name in allowed]
        if kept:
            element.attrs['class'] = kept
        else:
            # Nothing survived: drop the attribute rather than leave it empty.
            del element.attrs['class']
    return content

### Transform the accordion markup

Flatten the old accordion markup.

In [6]:
def transform_accordian(old_accordian: Tag) :
    """Flatten an old accordion element in place.

    A marker comment is inserted into the accordion, every ``h3`` inside it
    has its whole attribute set replaced by a single placeholder class, and
    every nested ``div`` is unwrapped so that its children move up a level.

    Args:
        old_accordian: The accordion element to rewrite.
    """
    old_accordian.insert(1, Comment('Reformatted accordian'))

    # Some accordions may have a trigger button. It is looked up here but not
    # used yet (ToDo).
    button = old_accordian.find('button', class_="accordion__trigger")

    # Replace the attributes of each heading with one class.
    for heading in old_accordian.find_all('h3'):
        heading.attrs = {'class': ['some_new_class']}

    # Remove the wrapper divs but keep their contents.
    for wrapper in old_accordian.find_all('div'):
        wrapper.unwrap()

### Define a function to find the page content from the soup

This function takes a BeautifulSoup object and finds the page content based on its `id` attribute. For some reason this seems to need to be a two-step process: first find the parent of the first `<h1>`, then find the content `div` inside that parent.

In [7]:
def find_page_content(soup: BeautifulSoup) :
    """Locate the main content ``div`` of a page and give it a new id.

    The lookup is done in two steps: take the parent of the first ``<h1>``,
    then search inside that parent for the ``div`` with the known id.

    Args:
        soup: Parsed page to search.

    Returns:
        The content ``div`` with its ``id`` changed to ``'new_id'``, or the
        string ``"Missing"`` when the page has no ``<h1>`` or the content
        ``div`` is not found inside the heading's parent.
    """
    # Check if there is an H1 - if so grab the first one.
    heading = soup.h1
    if not isinstance(heading, Tag):
        return "Missing"

    content = heading.parent.find(
        'div',
        {'id': 'ctl00_PlaceHolderMain_ctl03__ControlWrapper_RichHtmlField'},
    )
    if content is None:
        # find() returns None when nothing matches; report that with the same
        # sentinel as a missing heading instead of failing on None.attrs.
        return "Missing"

    update_id(content, 'id', 'new_id')
    return content

## Loop through the directory and process the samples

### Master function to call smaller functions

Whitelist of CSS classes to keep, keyed by tag name.

In [8]:
# CSS classes that survive cleaning, keyed by the tag name they belong to.
classes_to_keep = {
    'div': ['text-block', 'text'],
    'ol': ['list', 'list-step'],
}

In [9]:
def get_new_html(filename: str) :
    """Build the cleaned-up page content for one HTML file.

    The file is parsed, its content element is located, any accordions inside
    it are flattened, and finally all CSS classes that are not listed in
    ``classes_to_keep`` are removed.

    Args:
        filename: Path of the HTML file to process.

    Returns:
        The cleaned content element. When the content could not be located,
        the non-tag value returned by ``find_page_content`` (the string
        ``"Missing"``) is returned unchanged.
    """
    soup = process_markup(filename)
    result = find_page_content(soup)

    # find_page_content() signals "no content" with a plain string, which has
    # no find_all(); pass it through rather than crashing on it.
    if not isinstance(result, Tag):
        return result

    for acc in result.find_all('div', class_='accordion'):
        transform_accordian(acc)
    return whitelist_css(result, classes_to_keep)

## Process the sample(s)

Process the sample files and save a copy in the cleaned_pages folder

In [10]:
# Empty HTML page used as the skeleton for each generated sample page.
# It is parsed afresh for every file and the cleaned content is inserted
# into its <body>.

new_file = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    
</body>
</html>
'''

In [11]:
from os import path
directory = path.join('..','sourcefiles','test')
cleaned_directory = path.join('..', 'cleaned_pages','test')

# Make sure the output folder exists: open(..., "w") does not create missing
# parent directories and would fail with FileNotFoundError otherwise.
os.makedirs(cleaned_directory, exist_ok=True)

# Iterate over the entries of the source directory.
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    cleaned_f = os.path.join(cleaned_directory, filename)
    # Only regular files are processed; sub-directories are skipped.
    if os.path.isfile(f):
        clean_page = get_new_html(f)
        # Start from a fresh copy of the empty page template for every file.
        new_soup = BeautifulSoup(new_file, 'html.parser')
        new_soup.body.insert(1, clean_page)
        with open(cleaned_f, "w", encoding='utf-8') as file:
            file.write(new_soup.prettify())