```
Member-1
    Name: Hariom Nabira
    Roll No: 34
Member-2
    Name: Harsh Tiwari
    Roll No: 37 
```
```
Batch: B2
Semester: 6th
Branch: CSE
```

# Problem Statement:
### Build an HTML compiler in Python or Java that converts HTML code into the appropriate output based on its tags, and that also performs error correction.

In [14]:
import webbrowser

In [15]:
def read_file(file_path):
    """Return the full text of the UTF-8 file at *file_path*.

    If the file does not exist, an error message is printed and ``None`` is
    returned instead of raising.
    """
    contents = None
    try:
        with open(file_path, "r", encoding="utf-8") as source:
            contents = source.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    return contents

In [16]:
def tokenize_html(html):
    """Split an HTML string into a flat list of tag and text tokens.

    Each tag (from ``<`` to the next ``>``) becomes one token. Each run of
    text between tags becomes one token after surrounding whitespace is
    stripped; whitespace-only runs are discarded.

    A comment is scanned up to its ``-->`` terminator rather than the first
    ``>``, so a comment such as ``<!-- a > b -->`` stays a single token
    instead of leaking its tail into the output as visible text.

    Args:
        html: The raw HTML source text.

    Returns:
        The list of tokens in document order. If a ``<`` has no closing
        ``>`` anywhere after it, scanning stops there and the remainder of
        the input is dropped.
    """
    tokens = []
    pos = 0
    length = len(html)
    while pos < length:
        if html[pos] == "<":
            end_pos = -1
            if html.startswith("<!--", pos):
                # Searching from pos + 2 also accepts the abruptly closed
                # forms "<!-->" and "<!--->".
                comment_end = html.find("-->", pos + 2)
                if comment_end != -1:
                    end_pos = comment_end + 2
            if end_pos == -1:
                # Ordinary tag, or an unterminated comment: end at next ">".
                end_pos = html.find(">", pos)
            if end_pos == -1:
                break
            tokens.append(html[pos:end_pos + 1])
            pos = end_pos + 1
        else:
            end_pos = html.find("<", pos)
            if end_pos == -1:
                end_pos = length
            text = html[pos:end_pos].strip()
            if text:
                tokens.append(text)
            pos = end_pos
    return tokens

In [17]:
def is_opening_tag(token):
    """Return True if *token* is an opening tag such as ``<div>``.

    Closing tags (``</div>``) and explicitly self-closed tags (``<br/>``)
    are not opening tags. Neither are markup declarations: comments
    (``<!-- ... -->``), doctypes (``<!DOCTYPE html>``) and processing
    instructions (``<?xml ... ?>``). None of these takes a matching closing
    tag, so treating them as openers would report them as unclosed.
    """
    if not token.startswith("<"):
        return False
    # "</" closes an element; "<!" and "<?" introduce declarations.
    if token.startswith(("</", "<!", "<?")):
        return False
    return not token.endswith("/>")

def is_closing_tag(token):
    """Return True when *token* begins with the closing-tag prefix ``</``."""
    return token[:2] == "</"

def get_tag_name(token):
    """Return the lowercase element name of a tag token.

    Angle brackets and slashes are stripped from both ends and the first
    whitespace-separated word is taken, so attributes are ignored. The name
    is lowercased so that ``<DIV>`` and ``</div>`` compare equal.

    Args:
        token: A tag token such as ``<div class="x">``, ``</p>`` or ``<br/>``.

    Returns:
        The lowercase tag name, or an empty string when the token holds no
        name at all (for example ``<>`` or ``</>``).
    """
    parts = token.strip("<>/").split()
    if not parts:
        # Empty tag: nothing to name, and indexing would raise IndexError.
        return ""
    return parts[0].lower()

In [33]:
def parse_html(tokens):
    """Check a token stream for nesting and tag-balance errors.

    Opening tags are pushed on a stack and closing tags pop it. Void
    elements (``<br>``, ``<img>``, ...) are skipped because they never take
    a closing tag. Text tokens are ignored.

    When a closing tag does not match the innermost open element but does
    match one further out, the elements in between are reported as unclosed
    and removed from the stack. This way a single missing closing tag is
    not also reported against its already-closed ancestors.

    Args:
        tokens: Tag and text tokens, e.g. the output of ``tokenize_html``.

    Returns:
        A list of human-readable error messages; empty if none were found.
    """
    # Children each parent may directly contain. Parents that are absent
    # from this table are not checked for nesting.
    valid_parent_child = {
        "html": ["head", "body"],
        "head": ["title", "meta", "link", "style", "script"],
        "body": ["h1", "h2", "h3", "p", "div", "span", "table", "ul", "ol", "img", "br", "a", "footer", "header", "nav", "section", "article", "aside", "main"],
        "table": ["tr"],
        "tr": ["td", "th"],
        "ul": ["li"],
        "ol": ["li"],
        "p": ["b", "i", "strong", "em", "span", "a", "img", "br"],
        "div": ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table", "span", "a", "img", "footer", "section", "article", "header"],
        "section": ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table", "div"],
        "article": ["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "footer", "header", "section"],
        "nav": ["a", "ul", "ol"],
        "footer": ["p", "a", "ul", "ol"],
        "header": ["h1", "h2", "h3", "h4", "h5", "h6", "nav", "p"],
        "form": ["input", "select", "textarea", "button", "label", "fieldset", "legend"],
        "input": [],  # void element: no children
        "textarea": [],  # text content only, no child elements
        "select": ["option"],
        "option": [],  # text content only, no child elements
        "label": ["input", "select", "textarea"],
        "button": ["input", "select", "textarea"],
    }

    # Void elements: they have no closing tag and are never pushed.
    self_closing_tags = {
        "br", "img", "meta", "link", "input", "hr", "base", "area", "col",
        "embed", "source", "track", "wbr",
    }

    stack = []
    errors = []

    for token in tokens:
        if is_opening_tag(token):
            tag_name = get_tag_name(token)
            if tag_name in self_closing_tags:
                continue
            if stack:
                parent_tag = get_tag_name(stack[-1])
                if parent_tag in valid_parent_child and tag_name not in valid_parent_child[parent_tag]:
                    errors.append(f"Invalid nesting: <{tag_name}> inside <{parent_tag}>")
            stack.append(token)
        elif is_closing_tag(token):
            tag_name = get_tag_name(token)
            if not stack:
                errors.append(f"Unmatched closing tag: {token}")
                continue
            if get_tag_name(stack[-1]) == tag_name:
                stack.pop()
                continue
            if any(get_tag_name(open_tag) == tag_name for open_tag in stack):
                # The tag closes an outer element: everything opened inside
                # it was left unclosed. Report those and resynchronise.
                while get_tag_name(stack[-1]) != tag_name:
                    errors.append(f"Unclosed tag: {stack.pop()}")
                stack.pop()
            else:
                # Stray closing tag: nothing open matches, keep the stack.
                errors.append(f"Mismatched tag: Expected </{get_tag_name(stack[-1])}>, found {token}")

    # Anything still open at end of input was never closed.
    while stack:
        errors.append(f"Unclosed tag: {stack.pop()}")

    return errors

In [24]:
def correct_html(tokens):
    """Return a copy of *tokens* with tag-balance errors repaired.

    Repairs applied:
      * An element left open when an enclosing element is closed gets its
        closing tag inserted just before the enclosing one.
      * A closing tag that matches no open element is dropped. It must not
        unwind the stack, or every open element would be closed early.
      * Elements still open at end of input are closed, innermost first.

    Void elements (``<br>``, ``<hr>``, ...) are emitted as-is and never
    receive a closing tag. Text tokens pass through unchanged.

    Args:
        tokens: Tag and text tokens, e.g. the output of ``tokenize_html``.

    Returns:
        A new list of tokens forming balanced markup.
    """
    stack = []
    corrected_tokens = []
    # Void elements: no closing tag, so they are never tracked on the stack.
    self_closing_tags = {
        "br", "img", "meta", "link", "input", "hr", "base", "area", "col",
        "embed", "source", "track", "wbr",
    }

    for token in tokens:
        if is_opening_tag(token):
            if get_tag_name(token) not in self_closing_tags:
                stack.append(token)
            corrected_tokens.append(token)
        elif is_closing_tag(token):
            tag_name = get_tag_name(token)
            if not any(get_tag_name(open_tag) == tag_name for open_tag in stack):
                # Stray closing tag (never opened, or closes a void
                # element): discard it and leave open elements untouched.
                continue
            # Close any elements opened inside the one being closed.
            while get_tag_name(stack[-1]) != tag_name:
                corrected_tokens.append(f"</{get_tag_name(stack.pop())}>")
            stack.pop()
            corrected_tokens.append(token)
        else:
            corrected_tokens.append(token)

    # Close whatever is still open at end of input, innermost first.
    while stack:
        corrected_tokens.append(f"</{get_tag_name(stack.pop())}>")

    return corrected_tokens

In [25]:
def save_and_open_html(corrected_tokens):
    """Write the corrected tokens to ``corrected_output.html`` and open it.

    The tokens are joined one per line. A ``<!DOCTYPE html>`` line is
    prepended only when none of the tokens is already a doctype, so an
    input that declares its own doctype is not given a second one. The
    resulting document is printed, written to the current working directory
    as UTF-8, and opened in the default web browser.

    Args:
        corrected_tokens: Balanced tag/text tokens, e.g. from ``correct_html``.
    """
    # Local import keeps this function self-contained.
    from pathlib import Path

    body = "\n".join(corrected_tokens)
    has_doctype = any(
        token.lstrip().lower().startswith("<!doctype")
        for token in corrected_tokens
    )
    corrected_html = body if has_doctype else "<!DOCTYPE html>\n" + body

    print("\nCorrected HTML Output:\n")
    print(corrected_html)

    output_file = "corrected_output.html"
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(corrected_html)

    # webbrowser.open expects a URL; a bare relative filename is not
    # portable, so pass an absolute file:// URI instead.
    webbrowser.open(Path(output_file).resolve().as_uri())


In [26]:
def main():
    """Prompt for an HTML source file, report its errors and emit a fix.

    Reads a path from standard input, tokenizes the file's contents, prints
    any errors found by ``parse_html``, then hands the corrected token list
    to ``save_and_open_html``. Does nothing further if the file cannot be
    read.
    """
    source_path = input("Enter the path to your .txt file: ").strip()

    markup = read_file(source_path)
    if markup is None:
        return

    token_list = tokenize_html(markup)
    problems = parse_html(token_list)

    if problems:
        print("\n Errors Detected:")
        for problem in problems:
            print("  -", problem)

    save_and_open_html(correct_html(token_list))

In [None]:
# Example input path (from the author's machine): C:\Users\harsh\testHtml.txt
# Run the tool: prompts for a file path, prints any errors found, then
# writes the corrected HTML and opens it in the browser.
main()