In [None]:
import os
import json
from pathlib import Path
from collections import Counter, defaultdict
from bs4 import BeautifulSoup
from tabulate import tabulate
import difflib

HTML_DIR = Path(r"C:\Users\hp\Desktop\leadsup\v0\scraping-approach-1\old-logs\html-codes-for-5")

raw_field_counter = Counter()
grouped_counter = Counter()
grouped_variants = defaultdict(set)
unrequired_counts = defaultdict(int)

FIELD_GROUPS = {
    "Email": ["email"],
    "Name": ["name", "full_name", "your_name"],
    "Phone": ["phone", "tel", "mobile"],
    "Message": ["message", "comments", "text", "inquiry", "enquiry"],
    "Company": ["company", "organization"],
    "Address",
    "Subject": ["subject", "reason"],
    "Website": ["website", "url"]
}
# for i in forms:
    # IF form filled using our field groups:
        # form_filled_count += 1
    # IF NOT:
        # create a json file for that form to store the following info:
        # form url
        # list of non-filled required fields
        # list of non-filled unrequired fields
        # save the json file


ROUND_2_SIMILARITY_THRESHOLD = 0.85

def normalize(text):
    return text.strip().lower().replace(" ", "_")

def assign_group(label):
    norm = normalize(label)
    for group, keywords in FIELD_GROUPS.items():
        if any(kw in norm for kw in keywords):
            grouped_variants[group].add(label)
            return group
    return label  # no match → retain raw

def extract_label(field, soup):
    field_id = field.get("id")
    if field_id:
        label_tag = soup.find("label", attrs={"for": field_id})
        if label_tag and label_tag.text.strip():
            return label_tag.text.strip()

    parent = field.find_parent()
    while parent:
        visible_text = parent.get_text(separator=" ", strip=True)
        if visible_text:
            return visible_text
        parent = parent.find_parent()

    return field.get("name") or "unknown"

for file in HTML_DIR.glob("*.html"):
    try:
        with open(file, "r", encoding="utf-8") as f:
            html = f.read()
        soup = BeautifulSoup(html, "html.parser")

        fields = soup.find_all(["input", "select", "textarea"])
        for field in fields:
            if (
                field.get("type") == "hidden"
                or "display:none" in field.get("style", "").replace(" ", "").lower()
                or any("invisible" in cls.lower() for cls in field.get("class", []))
            ):
                continue

            label = extract_label(field, soup)
            required = field.has_attr("required")

            raw_field_counter[(label, required)] += 1
            group = assign_group(label)
            grouped_counter[(group, required)] += 1
            if not required:
                unrequired_counts[group] += 1

    except Exception as e:
        print(f"[ERROR] {file.name}: {e}")

# === FINAL OUTPUT (Original Detailed Print) ===
print("\n=== Grouped Form Fields (excluding rare + unrequired) ===")
for (label, required), count in grouped_counter.most_common():
    if count <= 1 and not required:
        continue
    print(f"{label}: {count} {'(required)' if required else ''}")
    if label in grouped_variants:
        print(f"  ↳ Includes: {', '.join(sorted(grouped_variants[label]))}")

# === ROUND 1 FILTERING: Remove (count < 2 AND unrequired) ===
filtered_groups = Counter()
unrequired_summary = {}
for (label, required), count in grouped_counter.items():
    total_count = sum(c for (l, r), c in grouped_counter.items() if l == label)
    total_unrequired = unrequired_counts[label]
    if total_count > 1 or required:
        filtered_groups[label] = total_count
        unrequired_summary[label] = total_unrequired

# === ROUND 2: Further Grouping Based on Similarity ===
final_grouped = Counter()
final_unrequired = defaultdict(int)
seen_labels = []
label_map = {}

for label in filtered_groups:
    matched = False
    for existing in seen_labels:
        if difflib.SequenceMatcher(None, label.lower(), existing.lower()).ratio() > ROUND_2_SIMILARITY_THRESHOLD:
            label_map[label] = existing
            final_grouped[existing] += filtered_groups[label]
            final_unrequired[existing] += unrequired_summary.get(label, 0)
            matched = True
            break
    if not matched:
        label_map[label] = label
        seen_labels.append(label)
        final_grouped[label] = filtered_groups[label]
        final_unrequired[label] = unrequired_summary.get(label, 0)



=== Grouped Form Fields (excluding rare + unrequired) ===
Email: 62 
  ↳ Includes: Announce your new employees, promotions, board positions, community notes and leaders in your organization to The Daily Record’s influential audience. Your accolade will appear in print and online, as well as in search, so the business and legal community will stay in the know and your leader will be recognized. Opportunities include: Movers and Shakers posting: $300. Includes a standard photo, up to 300 characters of text in print for one issue (limit of 8 per page) on the Movers and Shakers page, plus visibility in a prime location on the homepage at TheDailyRecord.com. Featured Mover: $850. Includes a large photo, up to 1,500 characters of text, company logo, preferred placement in the center of the Movers and Shakers page (one per issue) plus visibility in a prime location on the homepage at TheDailyRecord.com. Note: The price is per person. A new form is needed for each submission. Professional ann

In [11]:
# === FINAL SUMMARY TABLE ===
table_data = []
for label, count in final_grouped.most_common():
    unreq_count = final_unrequired[label]
    table_data.append([label, count, unreq_count])

print("\n=== Summary Table (Aggregated) ===")
print(tabulate(table_data, headers=["Field Name", "Total Count", "Unrequired Count"], tablefmt="github"))


=== Summary Table (Aggregated) ===
| Field Name                                                                                                                                                                                                                                                                                                                                         |   Total Count |   Unrequired Count |
|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|--------------------|
| Email                                                                                                                                                                                                     

-----

**Using GPT**

In [12]:
import os
import json
from pathlib import Path
from collections import Counter, defaultdict
from bs4 import BeautifulSoup
from openai import OpenAI
from tabulate import tabulate

HTML_DIR = Path(r"C:\Users\hp\Desktop\leadsup\v0\scraping-approach-1\old-logs\html-codes-for-5")

# Setup OpenAI client
key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=key)

def ask_gpt_to_group(fields_to_check):
    try:
        prompt = f"""
You are a smart assistant helping to group similar form field labels together.
Here is a list of field labels collected from web forms:

{json.dumps(fields_to_check, indent=2)}

Your task is to return a JSON object where each key is a grouped field label name (like 'Email', 'Name', etc.), and its value is a list of field labels from the input that belong to this group.
Only group truly similar/semantically identical fields. Do not group different concepts. Keep groups clear and minimal.
Example format:
{{
  "Email": ["Email", "Email Address", "E-mail"],
  "Message": ["Message", "Explain your issue", "Your message"]
}}
"""
        token_est = len(prompt.split())
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )
        output = response.choices[0].message.content.strip()
        return json.loads(output)
    except Exception as e:
        print(f"[API ERROR] GPT failed during grouping: {e}")
        return {}

raw_field_counter = Counter()
grouped_counter = Counter()
grouped_variants = defaultdict(set)
FIELD_GROUPS = {
    "Email": ["email"],
    "Name": ["name", "full_name", "your_name"],
    "Phone": ["phone", "tel", "mobile"],
    "Message": ["message", "comments", "text", "inquiry", "enquiry"],
    "Company": ["company", "organization"],
    "Subject": ["subject", "reason"],
    "Website": ["website", "url"],
    "File Upload": ["file", "upload", "attachment"]
}

def normalize(text):
    return text.strip().lower().replace(" ", "_")

def assign_group(label):
    norm = normalize(label)
    for group, keywords in FIELD_GROUPS.items():
        if any(kw in norm for kw in keywords):
            grouped_variants[group].add(label)
            return group
    return label

def extract_label(field, soup):
    field_id = field.get("id")
    if field_id:
        label_tag = soup.find("label", attrs={"for": field_id})
        if label_tag and label_tag.text.strip():
            return label_tag.text.strip()
    parent = field.find_parent()
    while parent:
        visible_text = parent.get_text(separator=" ", strip=True)
        if visible_text:
            return visible_text
        parent = parent.find_parent()
    return field.get("name") or "unknown"

field_details = defaultdict(lambda: {"count": 0, "unrequired": 0})

for file in HTML_DIR.glob("*.html"):
    try:
        with open(file, "r", encoding="utf-8") as f:
            html = f.read()
        soup = BeautifulSoup(html, "html.parser")
        fields = soup.find_all(["input", "select", "textarea"])
        for field in fields:
            if (
                field.get("type") == "hidden"
                or "display:none" in field.get("style", "").replace(" ", "").lower()
                or any("invisible" in cls.lower() for cls in field.get("class", []))
            ):
                continue
            label = extract_label(field, soup)
            required = field.has_attr("required")
            raw_field_counter[(label, required)] += 1
            group = assign_group(label)
            grouped_counter[(group, required)] += 1
            field_details[label]["count"] += 1
            if not required:
                field_details[label]["unrequired"] += 1
    except Exception as e:
        print(f"[ERROR] {file.name}: {e}")

# === FINAL OUTPUT (Original Detailed Print) ===
print("\n=== Grouped Form Fields (excluding rare + unrequired) ===")
for (label, required), count in grouped_counter.most_common():
    if count <= 1 and not required:
        continue
    print(f"{label}: {count} {'(required)' if required else ''}")
    if label in grouped_variants:
        print(f"  ↳ Includes: {', '.join(sorted(grouped_variants[label]))}")

# === ROUND 2: GPT-BASED GROUPING ===
# Step 1: exclude count<2 and unrequired
filtered_fields = {
    label: info for label, info in field_details.items()
    if not (info["count"] < 2 and info["unrequired"] == info["count"])
}
gpt_grouped = ask_gpt_to_group(list(filtered_fields.keys()))

# Step 2: summarize new counts
final_summary = []
for group, labels in gpt_grouped.items():
    total = sum(field_details[l]["count"] for l in labels if l in field_details)
    unrequired = sum(field_details[l]["unrequired"] for l in labels if l in field_details)
    final_summary.append((group, total, unrequired))



=== Grouped Form Fields (excluding rare + unrequired) ===
Email: 62 
  ↳ Includes: Announce your new employees, promotions, board positions, community notes and leaders in your organization to The Daily Record’s influential audience. Your accolade will appear in print and online, as well as in search, so the business and legal community will stay in the know and your leader will be recognized. Opportunities include: Movers and Shakers posting: $300. Includes a standard photo, up to 300 characters of text in print for one issue (limit of 8 per page) on the Movers and Shakers page, plus visibility in a prime location on the homepage at TheDailyRecord.com. Featured Mover: $850. Includes a large photo, up to 1,500 characters of text, company logo, preferred placement in the center of the Movers and Shakers page (one per issue) plus visibility in a prime location on the homepage at TheDailyRecord.com. Note: The price is per person. A new form is needed for each submission. Professional ann

In [13]:
# === SUMMARY TABLE ===
print("\n=== Summary Table (GPT Grouped) ===")
print(tabulate(
    sorted(final_summary, key=lambda x: x[1], reverse=True),
    headers=["Field Name", "Total Count", "Unrequired Count"],
    tablefmt="github"
))


=== Summary Table (GPT Grouped) ===
| Field Name   | Total Count   | Unrequired Count   |
|--------------|---------------|--------------------|
