In [4]:
from bs4 import BeautifulSoup
import re
import os
from docx import Document
from docx.shared import Pt
from datetime import datetime

# === Load inputs: HTML exports and the Word template ===
html_files = ['full.html', '360.html']

# Whitespace-normalised text of each export that exists on disk, in list order
page_texts = []

for filename in html_files:
    if not os.path.exists(filename):
        # A missing export is reported and skipped; the run continues
        print(f"File not found: {filename}")
        continue
    with open(filename, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
    # get_text() drops the markup; split()/join collapses every whitespace run
    # (including newlines) to a single space
    page_texts.append(' '.join(soup.get_text().split()))

# Every page's text is prefixed with one space, so a non-empty result starts
# with a space; with no readable files this is the empty string
combined_text = ''.join(' ' + page_text for page_text in page_texts)

# Open the Word template; its first table is filled in further down
doc = Document('comp_template.docx')

# === Field extraction helpers ===
def _extract_field(pattern, text, flags=0, default="N/A"):
    """Return the stripped first capture group of ``pattern`` found in ``text``.

    Args:
        pattern: Regular expression with at least one capture group.
        text: Text to search.
        flags: Optional ``re`` flags such as ``re.IGNORECASE``.
        default: Value returned when the pattern does not match.

    Returns:
        Group 1 of the first match with surrounding whitespace removed, or
        ``default`` when there is no match.
    """
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else default


def _parse_listing_date(value):
    """Parse ``value`` as month/day/4-digit-year, then month/day/2-digit-year.

    Returns:
        A ``datetime`` on success, or ``None`` when neither format fits
        (which includes the "N/A" placeholder of a field that was not found).
    """
    for date_format in ("%m/%d/%Y", "%m/%d/%y"):
        try:
            return datetime.strptime(value, date_format)
        except ValueError:
            continue
    return None


# === Extract My Current Location ===
# Everything between the label and the first comma
address = _extract_field(r'My Current Location\s*(.*?),', combined_text)

# === Extract Full Location ===
# Second comma-separated part plus a two-capital-letter code after it
city_state_match = re.search(r'My Current Location\s+[^,]+,\s*([^,]+),\s*([A-Z]{2})', combined_text)
city_state = (
    f"{city_state_match.group(1).strip()}, {city_state_match.group(2).strip()}"
    if city_state_match else "N/A"
)

# === Extract Tax ID (next 10 characters after "Tax ID: ") ===
raw_tax_id = _extract_field(r'Tax ID:\s*(.{10})', combined_text)
# The first two captured characters are dropped and a dash is inserted after
# the next two. NOTE(review): presumably the dropped part is a prefix that is
# not wanted in the report - confirm against a sample export.
tax_id = f"{raw_tax_id[2:4]}-{raw_tax_id[4:]}" if raw_tax_id != "N/A" else "N/A"

# === Extract Tax Map ===
tax_map = _extract_field(r'Tax Map:\s*(.{4})', combined_text)

# === Extract Tax Grid ===
tax_grid = _extract_field(r'Grid:\s*(\S+)', combined_text)

# === Extract Parcel ===
parcel = _extract_field(r'Parcel Number:\s*(.{4})', combined_text)

map_grid_parcel = f"{tax_map}/ {tax_grid}/ {parcel}"

# === Extract Legal Description ===
raw_legal_desc = _extract_field(r'County Legal Desc:\s*(.*?)\s+Use', combined_text)
# Capitalise each word of a real description, but leave the "N/A" placeholder
# alone (word-wise capitalisation would turn it into "N/a")
legal_desc = (
    ' '.join(word.capitalize() for word in raw_legal_desc.split())
    if raw_legal_desc != "N/A" else "N/A"
)

# === Extract Owner ===
owner = _extract_field(r'Owner:\s*(.*?)\s+Owner Address', combined_text)

# === Extract Closed Date ===
closed = _extract_field(r'COM Closed\s*(.{8})', combined_text)

# === Extract Liber ===
liber = _extract_field(r'Book:\s*(.*?)\s+Page', combined_text)

# === Extract Folio ===
folio = _extract_field(r'Page:\s*(.*?)\s+Tax', combined_text)

deed_reference = f"{liber}/{folio}"

# === Extract Ownership Interest ===
interest = _extract_field(
    r'Ownership Interest:\s*(Fee Simple|Condominium|Leased Fee)', combined_text, re.IGNORECASE
)

# === Extract Listing Term Begins ===
# None when the field is missing or is not a recognisable date
listing_begin = _parse_listing_date(
    _extract_field(r'Listing Term Begins:\s*(.*?)\s+Listing Entry', combined_text, re.IGNORECASE)
)

# === Extract Off Market Date ===
# Exactly eight characters are captured after the label, so a date written
# with a four-digit year and zero-padded month/day would be cut short.
# NOTE(review): confirm the export always uses an 8-character date here.
off_market = _parse_listing_date(
    _extract_field(r'Off Market Date:\s*(.{8})', combined_text, re.IGNORECASE)
)

# Days on market needs both dates; fall back to the same "N/A" placeholder the
# other fields use instead of raising when either one is unavailable
if listing_begin is not None and off_market is not None:
    days_on_market = (off_market - listing_begin).days
else:
    days_on_market = "N/A"

# === Extract Close Price ===
close_price = _extract_field(r'Close Price:\s*(.*?)\s+Buyer', combined_text, re.IGNORECASE)

# === Extract Price Per Square Foot ===
price_sf = _extract_field(r'Price\s*[/\.]\s*Sq\s*Ft:\s*([\d,]+\.\d+)', combined_text, re.IGNORECASE)

# === Extract Lot Size ===
lot_size = _extract_field(r'Lot Acres / SQFT:\s*([\d.]+a\s*/\s*\d+sf)', combined_text, re.IGNORECASE)

# === Extract building sf ===
building_size = _extract_field(r"Available SqFt:\s*([\d,]+\.\d+)", combined_text, re.IGNORECASE)

# === Extract Year Built ===
year_built = _extract_field(r'Year Built:\s*(\d{4})', combined_text)

# === Extract Parking ===
parking = _extract_field(r"Total Parking Spaces\s*(\d+)", combined_text)

# === Extract Zoning ===
zoning = _extract_field(r"Zoning:\s*([A-Z]{1,2})\b", combined_text)

# === Extract Utilities ===
# Depends only on the sewer match (previously the zoning match was tested here)
utilities = _extract_field(r"Sewer:\s*(Public|Private)", combined_text)

# === Add Data to Word Document ===
def _write_cell(table, row, text, column=1):
    """Replace the content of one table cell with ``text`` in the report style.

    The cell is emptied, then a single run holding ``text`` is added to its
    first paragraph in 10 pt Times New Roman with no space before or after
    the paragraph.

    Args:
        table: python-docx table to write into.
        row: Zero-based row index of the target cell.
        text: Value to write; converted with ``str()``.
        column: Zero-based column index of the target cell (default 1).
    """
    cell = table.cell(row, column)
    cell.text = ""
    paragraph = cell.paragraphs[0]
    run = paragraph.add_run(str(text))
    run.font.size = Pt(10)
    run.font.name = 'Times New Roman'
    paragraph.paragraph_format.space_before = Pt(0)
    paragraph.paragraph_format.space_after = Pt(0)


# (row index in the template's first table, text for column 1).
# Rows that do not appear here are left exactly as the template has them.
cell_values = [
    (4, address),
    (5, city_state),
    (6, tax_id),
    (7, map_grid_parcel),
    (8, legal_desc),
    (11, owner),
    (12, closed),
    (13, deed_reference),
    (14, interest),
    (16, f"{days_on_market} days"),
    (17, close_price),
    (18, f"${price_sf}"),
    (20, lot_size),
    (21, building_size),
    (22, year_built),
    (23, f"{parking} vehicle spaces"),
    (24, zoning),
    (25, utilities),
]

comp_table = doc.tables[0]
for row_index, cell_text in cell_values:
    _write_cell(comp_table, row_index, cell_text)

# Save the document
doc.save("improved_comp.docx")

In [106]:
from bs4 import BeautifulSoup
import os

# HTML exports to inspect
html_files = ['full.html', '360.html']

# Raw HTML keyed by filename; only files that exist on disk are included
html_contents = {}

# Phase 1: read every available file (missing ones are reported and skipped)
for filename in html_files:
    if not os.path.exists(filename):
        print(f"File not found: {filename}")
        continue
    with open(filename, 'r', encoding='utf-8') as file:
        html_contents[filename] = file.read()

# Phase 2: print the tag-free, whitespace-collapsed text of each loaded file
separator = "=" * 50
for filename, html_content in html_contents.items():
    soup = BeautifulSoup(html_content, 'html.parser')  # or use 'lxml'
    page_text = soup.get_text()

    # split()/join turns every run of whitespace into a single space
    cleaned_text = ' '.join(page_text.split())

    print(f"Cleaned text from {filename}:")
    print(cleaned_text)
    print(separator)


Cleaned text from full.html:
Matrix michael pugh ID: 3062508Account & SettingsAct As... Brokerage TeamsTransfer Office BillingPassword & SecurityUpload HeadshotLogoutPRODUCTS Premium EnhancementsNestfully SocialAutomated social media marketing system AdBuilder™Automated ads for listingsRentSpreeOnline rental application and screeningCloud CMACMAs, buyer tours and listing presentations CubiCasaScan floor plans from your smart phoneTeamsGive your team the recognition it deservesIDX Data FeedListings data transferred to your websiteAuthentisignSecure, electronic signature solutionSmartCharts ProLocal market data with charts and reportsRealtyTracAccess to foreclosures and REO propertiesNEWSRESEARCHBright Research HomeNews & InsightsResearch & StudiesWeekly StatisticsMarket ReportsHome Demand IndexSUPPORTSearch Help TopicsGetting StartedTrainingAccuracy & PolicyBright MLS RulesChat With Us NowCustomer Success CoachesSubmit A CaseContact UsEDUCATIONWebinarsAssociation TrainingCustomer Succes