In [1]:
# Liturgický kalendár:
# 365 dní ako "one page" zobrazenie.

In [2]:
import os
import re
import copy
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup, Comment

In [3]:
# Directory that holds the per-day HTML source files.
data_dir = './data_html/'

In [4]:
# Collect the per-day HTML files from the data directory. A per-day file name
# is exactly 15 characters long, e.g. '2023-01-01.html'.
#
# Note: the liturgical year starts with Advent, so the archive also contains
# days from the previous year (2022); only the 2023 files are kept here.
#
# os.listdir() returns names in arbitrary order, while the table-building loop
# groups rows into weeks and therefore needs the days in chronological order.
# ISO dates (YYYY-MM-DD) sort chronologically as plain strings, so sorted()
# is sufficient.
html_files = sorted(
    file for file in os.listdir(data_dir)
    if file.startswith('2023') and file.endswith(".html") and len(file) == 15
)

len(html_files)

365

In [5]:
# CSS for the generated calendar table, kept as a string so it can be parsed
# and attached to the output document:
#   - table / td / th: thin gray, collapsed borders
#   - tbody: 5px coral "ridge" border around each group of rows
#   - first cell of every row: centered text
# The rules inside /* ... */ are disabled alternatives left in place.
my_table_style="""
<style>
    table {
        /* width: 75%; 
        margin-left: auto; 
        margin-right: auto;
        font-family: Arial, Helvetica, sans-serif; 
        line-height: 1.5; */
        border: 1px solid gray; 
        border-collapse: collapse; 
        /* padding: 1rem;     */
    }

    td {
        /* white-space: pre-wrap; */
        border: 1px solid gray;
        border-collapse: collapse;
        /* padding: 1rem;         */
    }

    th {
        /* font-size: 1.1rem; */
        border: 1px solid gray;
        border-collapse: collapse;
        /* padding: 1rem;           */
    }

    tbody {
        border-style: ridge;
        border-width: 5px;
        border-color: coral
    }

    tr td:first-child {
        text-align: center;
    }    
</style>"""

In [6]:
# Parse the CSS snippet into its own small tree so it can be appended to
# another document later.
soup_style = BeautifulSoup(my_table_style, features='html.parser')

In [7]:
# Build soup1, the output document that the calendar content is inserted into.
# Any per-day file can serve as the template, so the first one is used.
template_name = html_files[0]
with open(data_dir + template_name, encoding='utf-8') as template_file:
    soup1 = BeautifulSoup(template_file.read(), 'lxml')

# Attach the table stylesheet to <head> and replace the page title.
soup1.head.append(soup_style)
soup1.title.string = 'kalendár 2023'

# Empty <body>, keeping only a copy of its first <script> element.
init_script = copy.copy(soup1.body.script)
soup1.body.clear()
soup1.body.append(init_script)

# Add the (still empty) <table>; its rows are appended afterwards.
my_table = soup1.new_tag('table')
soup1.body.append(my_table)



In [8]:
# Build one table row per day. Weeks are separated by thicker borders, which is
# done by wrapping the rows of each week in their own <tbody> element.
tbody_here = None  # <tbody> of the week currently being filled

# sorted(): the week grouping needs the days in chronological order, which
# os.listdir() does not guarantee. ISO-dated file names sort chronologically
# as plain strings.
for fname in sorted(html_files):

    with open(data_dir+fname, encoding='utf-8') as f_in:
        s = f_in.read()

    date_string = fname.removesuffix('.html')

    # Open a new <tbody> for the very first day and on every Monday.
    # weekday() == 0 means Monday; unlike comparing day_name('sk_SK') with
    # 'Pondelok', it does not require the Slovak locale to be installed.
    current_date = pd.to_datetime(date_string)
    if tbody_here is None or current_date.weekday() == 0:
        tbody_here = soup1.new_tag('tbody')
        soup1.table.append(tbody_here)

    new_table_row = soup1.new_tag('tr')
    tbody_here.append(new_table_row)

    # soup2: the parsed source page of this day.
    soup2 = BeautifulSoup(s,'lxml')
    # First <div> in <body>; expected to be the
    # <div class="lc hvr-scl-chldrn-a"> wrapper of the day's content.
    main = soup2.body.div

    # Remove the "coordinates" info blocks (a day can have several).
    for elem in main.find_all('div', class_='lcHEADinfo'):
        elem.extract()

    # Alternative celebrations of the day: strip special styling, drop vigils.
    for elem in main.find_all('div', class_='lcDENalt'):
        # 'psv' marks a holy day of obligation (styled with a red border);
        # the marker class is removed so the styling does not apply here.
        if 'psv' in elem['class']:
            elem['class'].remove('psv')

        # An entry whose heading starts with 'Vigília' is a vigil: remove it.
        h3 = elem.find('h3')
        if h3 and h3.get_text().startswith('Vigília'):
            elem.extract()

    # Remove the readings (a day can have several).
    for elem in main.find_all('div', class_='lcBODY'):
        elem.extract()

    # Remove the navigation buttons: the first <div> inside <section>,
    # expected to be <div class="lcNAVIG">.
    main.section.div.extract()

    # Heading: point the day's link at an anchor inside the one-page document.
    # NOTE(review): the link targets 'onepage_tabulka.html', while the result
    # is written to 'onepage.html' at the end of the notebook - confirm that
    # the output file is renamed before publishing.
    novy_link = 'onepage_tabulka.html#'+main.h1.a['href'].removesuffix('.html')
    main.h1.a['href'] = novy_link
    # Drop the last five characters of the link text (assumed to be the
    # ' 2023' year suffix).
    main.h1.a.string = main.h1.a.string[:-5]
    # Line breaks after the weekday and after the date.
    main.h1.span.find('span', class_='lcWD').append(soup2.new_tag('br'))
    main.h1.span.find('span', class_='lcDMY').append(soup2.new_tag('br'))
    # Headings are not wanted inside table cells: turn <h1>/<h2> into <div>.
    for elem in main.find_all('h1'):
        elem.name = 'div'
    for elem in main.find_all('h2'):
        elem.name = 'div'

    # td1 holds the date (lcDMY) and the day of the week (lcWD). Its id, e.g.
    # '2023-01-01', is the anchor used by the "naDnes" (go to today) navigation.
    # Appending a node that already lives in soup2 moves it out of soup2.
    td1 = soup1.new_tag('td', id=date_string)
    td1.append( main.find('span', class_='lcDMY') )
    td1.append( main.find('span', class_='lcWD') )
    new_table_row.append(td1)

    # td2 holds the name day (lcND) followed by everything that is left in
    # <section> under <div class="lc hvr-scl-chldrn-a">.
    td2 = soup1.new_tag('td')
    td2.append( main.find('span', class_='lcND') )
    td2.append( main.section )
    new_table_row.append(td2)

In [9]:
# Final clean-up to reduce the size of the output document.

# Drop every HTML comment node (<!-- ... -->).
for comment_node in soup1.find_all(string=lambda node: isinstance(node, Comment)):
    comment_node.extract()

# Drop tags that contain no text, except tags that are meaningful without any
# text content.
# https://stackoverflow.com/questions/33500888/how-to-remove-tags-that-have-no-content
keep_when_empty = ('br', 'img', 'meta', 'link', 'script')
for tag in soup1.find_all():
    if tag.get_text(strip=True) or tag.name in keep_when_empty:
        continue
    tag.extract()

# Serialize the document and remove blank lines to reduce the file size.
# A single-pass replacement of '\n\n' leaves a blank line behind whenever three
# or more newlines are adjacent, so a whole run of empty or whitespace-only
# lines is collapsed into one newline in a single substitution instead.
out_string = re.sub(r'\n(?:[ \t]*\n)+', '\n', str(soup1))

In [10]:
# Save the generated page as UTF-8 text.
with open('onepage.html', 'w', encoding='utf-8') as output_file:
    output_file.write(out_string)

In [45]:
# done