In [8]:
import os
from bs4 import BeautifulSoup
import re

def extract_titles_from_htm(directory_path):
    titles = {}

    for filename in os.listdir(directory_path):
        if filename.endswith('.htm'):
            file_path = os.path.join(directory_path, filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'html.parser')
                body_text = soup.get_text()

                # Find the position of the GPO link
                gpo_link_index = body_text.find('www.gpo.gov')

                if gpo_link_index != -1:
                    # Extract the text after GPO link
                    text_after_gpo_link = body_text[gpo_link_index + len('www.gpo.gov'):]

                    # Find the first all-uppercase line after the GPO link
                    for line in text_after_gpo_link.splitlines():
                        if line.isupper():
                            titles[filename] = line.strip()
                            break
                    else:
                        titles[filename] = 'No Title Found'
                else:
                    titles[filename] = 'GPO Link Not Found'

    return titles

def print_titles_in_columns(titles):
    print(f"{'Filename':<50} | {'Title'}")
    print('-' * 100)  # Print a separator line

    for filename, title in titles.items():
        print(f"{filename:<50} | {title}")


# Usage example
directory_path = '../Episodes/CREC-2023-05-22/html'
titles = extract_titles_from_htm(directory_path)
print_titles_in_columns(titles)


Filename                                           | Title
----------------------------------------------------------------------------------------------------
CREC-2023-05-22-pt1-PgE472-3.htm                   | CELEBRATING THE 98TH BIRTHDAY OF TRINI GOMEZ
CREC-2023-05-22-pt1-PgH2479.htm                    | VETERANS' COMPENSATION COST-OF-LIVING ADJUSTMENT ACT OF 2023
CREC-2023-05-22-pt1-PgH2486.htm                    | COMMEMORATING THE LIFE AND LEGACY OF DR. NICHOLAS BALABKINS
CREC-2023-05-22-pt1-PgH2502-9.htm                  | H.R. 3585.
CREC-2023-05-22-pt1-PgE474-5.htm                   | RECOGNIZING CLINT JOHNSTON
CREC-2023-05-22-pt1-PgE474-4.htm                   | PERSONAL EXPLANATION
CREC-2023-05-22-pt1-PgH2502-8.htm                  | H.R. 3584.
CREC-2023-05-22-pt1-PgE472-2.htm                   | RECOGNIZING CHIEF GARY SPARKS' 40 YEARS OF SERVICE AS A FIREFIGHTER FOR
CREC-2023-05-22-pt1-PgH2502-20.htm                 | ADDITIONAL SPONSORS
CREC-2023-05-22-pt1-PgH2469-5.htm  

In [14]:
from bs4 import BeautifulSoup
import os

def extract_titles_and_content(directory, char_limit=1000):
    extracted_data = []

    for filename in os.listdir(directory):
        if filename.endswith('.htm'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'html.parser')
                
                # Find the link to www.gpo.gov and its next sibling
                gpo_link = soup.find('a', href='https://www.gpo.gov')
                if gpo_link:
                    title_tag = gpo_link.find_next_sibling(text=True)
                    if title_tag:
                        title = title_tag.strip().upper()  # Extract and clean the title

                        # Start capturing content after the title
                        content = ""
                        content_start = title_tag.next_element
                        while len(content) < char_limit and content_start:
                            # Skip over non-text elements like tags
                            if not content_start.name:
                                content += str(content_start)
                            content_start = content_start.next_element

                        content = content[:char_limit]  # Limit the content length
                        extracted_data.append({'file': filename, 'title': title, 'content': content})
                    else:
                        extracted_data.append({'file': filename, 'title': 'No title found', 'content': ''})
                else:
                    extracted_data.append({'file': filename, 'title': 'No title found', 'content': ''})

    return extracted_data

# Example usage
directory_path = '../Episodes/CREC-2023-05-22/html'
data = extract_titles_and_content(directory_path)
for item in data:
#    print(f"File: {item['file']:<50} Title: {item['title']}")
    print(f"Content: {item['content']}\n")

Content: 


Content: 


Content: 


Content: 


Content: 


Content: 


Content: 


Content: 


Content: 


Content: 


Content: 


Content: 


Content: 2. Much work needs to be 
done to keep crops on the table, to keep the electricity and the wires, 
keep the minerals coming to produce all the things we need to keep 
energy, all of that. The ESA needs to be revisited and actually focused 
on truly recovering species but not used as a weapon to stop things 
people need, our economies need, rural economies especially. That is 
some of the work we do in the Western Caucus.
  Mr. Speaker, I appreciate, again, Chairman Newhouse's leadership on 
this and the opportunity to speak on it here tonight. There is much 
more that I could say; you know me.
  Mr. NEWHOUSE. Mr. Speaker, Mr. LaMalfa is correct, there are a lot of 
things that we should talk about. We have a limited amount of time, but 
he is very good at expressing the needs of the people that he 
represents and making sure that peopl

  title_tag = gpo_link.find_next_sibling(text=True)
