In [22]:
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.document import Document as _Document
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
import difflib
import html
import pandas as pd

def compare_texts(text1, text2):
    # Preprocess the text (remove spaces and special characters)
    def preprocess_text(text):
        return ''.join(c for c in text if c.isalnum())

    text1_clean = preprocess_text(text1)
    text2_clean = preprocess_text(text2)

    # Calculate similarity percentage
    seq = difflib.SequenceMatcher(None, text1_clean, text2_clean)
    similarity_percentage = seq.ratio() * 100

    # Get the differences
    diff = difflib.unified_diff(text1.splitlines(), text2.splitlines(), lineterm='')

    # Create a pretty HTML representation of the differences
    html_diff = difflib.HtmlDiff()
    html_diff_output = html_diff.make_file(text1.splitlines(), text2.splitlines())

    return similarity_percentage, diff, html_diff_output

def fetch_webpage_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for request success
        soup = BeautifulSoup(response.content, 'html.parser')
        main_section = soup.find('div', class_='flex flex-wrap flex-col')
        return main_section.get_text() if main_section else soup.get_text()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"Other error occurred: {err}")
    return None

def read_docx_content(docx_path):
    doc = Document(docx_path)
    full_text = []

    # Extract text from paragraphs and tables in order
    for element in doc.element.body:
        if isinstance(element, CT_Tbl):
            table = Table(element, doc)
            for row in table.rows:
                row_text = '\n'.join(cell.text for cell in row.cells)
                full_text.append(row_text)
        else:
            # Handle NoneType text
            text = element.text if element.text is not None else ''
            full_text.append(text)

    return '\n'.join(full_text)

def tables(parent):
    if isinstance(parent, _Document):
        element = parent.element.body
    elif isinstance(parent, _Cell):
        element = parent._tc

    for child in element.iterchildren():
        if isinstance(child, CT_Tbl):
            table = Table(child, parent)
            data = [[cell.text for cell in row.cells] for row in table.rows]
            yield pd.DataFrame(data, columns=data[0])

#enter route IATA here
route_name = "BOM-JDH"

# Webpage URL and doc path in local system
webpage_url = 'https://www.airindia.com/en/book-flights/mumbai-to-Jodhpur-flights'
docx_path = fr'C:\Users\KamlendraSingh\OneDrive - AIR INDIA LIMITED\Documents\EM Long Form Content\EM-Content-Docs\{route_name}.docx'

print(docx_path)

text2 = fetch_webpage_content(webpage_url)
text1 = read_docx_content(docx_path)

if text2 and text1:
    similarity, differences, html_differences = compare_texts(text1, text2)
    print(f"Similarity Percentage: {similarity:.2f}%")
    # print("\nDifferences:")
    # print('\n'.join(differences))

    # Save the HTML differences to a file
    with open(fr"C:\Users\KamlendraSingh\OneDrive - AIR INDIA LIMITED\Documents\EM Long Form Content\EM-Content-Docs\Content-Differences\{route_name}.html", "w", encoding="utf-8") as html_file:
        html_file.write(html_differences)
    print(f"HTML differences saved to {route_name}.html")
else:
    print("Failed to fetch webpage or document content.")


C:\Users\KamlendraSingh\OneDrive - AIR INDIA LIMITED\Documents\EM Long Form Content\EM-Content-Docs\BOM-JDH.docx
Similarity Percentage: 92.67%
HTML differences saved to BOM-JDH.html
