<a href="https://colab.research.google.com/github/kirtivardhan1222-crypto/Python-Work/blob/main/Python_Code_2nd_Iteration_QA_2nd_Pass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- INSTALL DEPENDENCY ---
!pip install python-docx

from docx import Document
from google.colab import files
import re, textwrap, html
from datetime import datetime
from IPython.display import HTML, display

# --- UPLOAD FILE ---
# Ask the user for a DOCX file; `uploaded` maps each uploaded file name to its content.
print("Upload your DOCX file:")
uploaded = files.upload()
if not uploaded:
    # Nothing came back (e.g. the upload dialog was cancelled); fail with a
    # clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded. Re-run this cell and choose a DOCX file.")
# Only the first uploaded file is audited; any others are ignored.
FILENAME = next(iter(uploaded))
print(f"File uploaded: {FILENAME}")

# --- CONFIGURATION SECTION ---
# Section titles the audited document is expected to contain. The checks
# lower-case these before comparing, so the capitalization used here only
# affects how the names are displayed in the report.
REQUIRED_SECTIONS = [
    "Executive Summary",
    "Scope",
    "Findings",
    "Recommendations",
    "Conclusion",
]

# Discouraged word -> suggested replacement. Keys are searched for as whole
# words, ignoring case; the value is shown as the suggestion in the report
# summary and in the tooltip of each highlighted occurrence.
BANNED_WORDS = {
    "utilize": "use",
    "synergy": "collaboration",
    "impactful": "effective",
}

# --- LOGIC STARTS HERE ---
def read_doc(path: str) -> tuple[str, list[str], set[str]]:
    """
    Read a DOCX file and collect its text plus the headings it contains.

    Args:
      path: location of the .docx file to open.

    Returns:
      - full_text (str): all paragraph text joined by newlines
      - paragraphs (list[str]): raw paragraph strings
      - headings_found (set[str]): normalized (trimmed, lower-cased,
        whitespace-collapsed) text of every paragraph that uses a Word
        "Heading ..." style, starts with Markdown-style '#' markers, or
        equals one of REQUIRED_SECTIONS
    """
    doc = Document(path)
    paras = []
    headings_found = set()
    # Normalized once up front so the per-paragraph test is a set lookup.
    required = {normalize(s) for s in REQUIRED_SECTIONS}

    for p in doc.paragraphs:
        txt = p.text or ""
        paras.append(txt)

        style_name = (p.style.name or "").lower()
        hash_prefix = re.match(r"^\s*#[#\s]*", txt)

        # Drop leading '#' markers so "## Scope" is recorded as "scope" and
        # can actually match a required section name.
        heading_text = txt[hash_prefix.end():] if hash_prefix else txt
        candidate = normalize(heading_text)
        if not candidate:
            continue  # blank paragraph (or bare '#'): nothing to record

        is_heading = "heading" in style_name or hash_prefix is not None
        # A paragraph whose whole text equals a required section counts as a
        # heading even when it carries no heading style.
        if is_heading or candidate in required:
            headings_found.add(candidate)

    full_text = "\n".join(paras)
    return full_text, paras, headings_found

def normalize(s: str) -> str:
    """Return *s* trimmed, lower-cased, with each whitespace run collapsed to one space."""
    trimmed = s.strip()
    lowered = trimmed.lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return collapsed

def check_required_sections(full_text: str, headings_found: set,
                            required_sections: "list[str] | None" = None) -> dict:
    """
    Report which required sections appear in the document.

    A section counts as present when its name occurs anywhere in the text
    (case-insensitive substring match) OR its lower-cased name is one of
    `headings_found`.

    Args:
      full_text: the whole document text.
      headings_found: lower-cased heading strings detected in the document.
      required_sections: section names to look for; defaults to the
        module-level REQUIRED_SECTIONS when omitted.

    Returns:
      dict mapping each section name (as given) to True/False.
    """
    sections = REQUIRED_SECTIONS if required_sections is None else required_sections
    haystack = full_text.lower()
    results = {}
    for sec in sections:
        needle = sec.lower()  # lower-cased once per section
        results[sec] = (needle in haystack) or (needle in headings_found)
    return results

def check_banned_words(full_text: str,
                       banned_words: "dict[str, str] | None" = None) -> dict:
    """
    Locate every whole-word, case-insensitive occurrence of each banned word.

    Args:
      full_text: the text to scan.
      banned_words: mapping whose keys are the words to look for; defaults
        to the module-level BANNED_WORDS when omitted.

    Returns:
      dict: bad_word -> list of (start, end) indices into `full_text`.
      Words with no occurrence are left out.
    """
    words = BANNED_WORDS if banned_words is None else banned_words
    hits = {}
    for bad in words:
        # Match against the original text with IGNORECASE rather than a
        # lower-cased copy: str.lower() can change the string's length for
        # some characters, which would shift every index after them.
        pattern = re.compile(rf"\b{re.escape(bad)}\b", re.IGNORECASE)
        spans = [m.span() for m in pattern.finditer(full_text)]
        if spans:
            hits[bad] = spans
    return hits

def check_duplicates(paragraphs: list[str]) -> list[str]:
    """
    Collect repeated paragraphs.

    Paragraphs are compared after stripping surrounding whitespace; blank
    ones are ignored. Every occurrence after the first is reported, so a
    line that appears three times shows up twice in the result, in the
    order the repeats were encountered.
    """
    already_seen = set()
    repeats = []
    for raw in paragraphs:
        line = raw.strip()
        if not line:
            continue
        if line not in already_seen:
            already_seen.add(line)
            continue
        repeats.append(line)
    return repeats

def quality_score(section_ok: dict, banned_hits: dict, dup_lines: list[str]) -> int:
    """
    Compute a 0..100 quality score for the audited document.

    Starting from 100, subtract:
      - 10 per required section that is missing
      - 2 per banned-word occurrence
      - 3 per duplicate line, with this deduction capped at 30
    The result is clamped to the range 0..100.
    """
    missing_sections = sum(not ok for ok in section_ok.values())
    banned_occurrences = sum(map(len, banned_hits.values()))
    duplicate_penalty = min(30, 3 * len(dup_lines))

    total = 100 - 10 * missing_sections - 2 * banned_occurrences - duplicate_penalty
    if total < 0:
        return 0
    if total > 100:
        return 100
    return total

def highlight_banned_in_html(text: str, banned_hits: dict,
                             banned_words: "dict[str, str] | None" = None) -> str:
    """
    Produce HTML-escaped text with banned words wrapped in <mark>.

    Args:
      text: the raw (unescaped) document text.
      banned_hits: bad_word -> list of (start, end) indices into `text`.
      banned_words: bad_word -> suggested replacement, shown in each mark's
        tooltip; defaults to the module-level BANNED_WORDS when omitted.

    Returns:
      An HTML fragment: every slice of `text` is escaped, and each hit is
      wrapped in <mark title='Use “…” instead'>. A hit that starts inside an
      earlier hit is skipped so that marks never overlap.
    """
    replacements = BANNED_WORDS if banned_words is None else banned_words

    marks = [
        (start, end, bad)
        for bad, ranges in banned_hits.items()
        for (start, end) in ranges
    ]
    marks.sort(key=lambda mark: mark[0])

    out = []
    cursor = 0
    # Walk the raw text left to right, escaping each slice as it is emitted,
    # so the hit indices (which refer to the raw text) stay valid.
    for start, end, bad in marks:
        if start < cursor:
            continue  # skip overlaps
        out.append(html.escape(text[cursor:start]))
        # The suggestion sits inside a single-quoted attribute; html.escape
        # also escapes quotes, so it cannot break out of the attribute.
        suggestion = html.escape(replacements[bad])
        marked = html.escape(text[start:end])
        out.append(f"<mark title='Use “{suggestion}” instead'>{marked}</mark>")
        cursor = end
    out.append(html.escape(text[cursor:]))
    return "".join(out)

def build_html_report(filename: str,
                      section_ok: dict,
                      banned_hits: dict,
                      dup_lines: list[str],
                      highlighted_text_html: str,
                      score: int) -> str:
    """
    Assemble the complete HTML audit report as a single string.

    Args:
      filename: name of the audited file (escaped before insertion).
      section_ok: section name -> whether it was found.
      banned_hits: bad_word -> list of (start, end) occurrences.
      dup_lines: duplicate lines; each is shown truncated to 120 characters.
      highlighted_text_html: ready-made HTML for the document body; it is
        inserted as-is, without further escaping.
      score: quality score shown as "<score>/100".

    Returns:
      A full HTML document including inline CSS and a generation timestamp.
    """
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    css = """
    <style>
      body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; line-height: 1.5; padding: 24px; }
      h1 { margin-bottom: 0; }
      .meta { color: #555; margin-bottom: 20px; }
      .chip { display: inline-block; padding: 4px 10px; border-radius: 16px; border: 1px solid #ddd; margin: 2px 6px 2px 0; }
      .ok { background: #e9f7ef; border-color: #b2dfbd; }
      .bad { background: #fdecea; border-color: #f5c6cb; }
      mark { padding: 0 3px; border-radius: 3px; }
      .score { font-size: 28px; font-weight: 700; }
      .sec { margin-top: 24px; }
      pre { white-space: pre-wrap; word-wrap: break-word; border: 1px solid #eee; padding: 12px; border-radius: 6px; background: #fafafa; }
    </style>
    """

    # One coloured chip per required section.
    chips = []
    for sec, ok in section_ok.items():
        chip_class = "ok" if ok else "bad"
        status = "Present" if ok else "Missing"
        chips.append(f"<span class='chip {chip_class}'>{html.escape(sec)}: {status}</span>")
    sec_html = "".join(chips)

    # One line per banned word, with its suggested replacement and hit count.
    if banned_hits:
        banned_rows = []
        for bad, ranges in banned_hits.items():
            banned_rows.append(
                f"⚠️ <b>{html.escape(bad)}</b> → “{html.escape(BANNED_WORDS[bad])}” ({len(ranges)}×)"
            )
        banned_html = "<br>".join(banned_rows)
    else:
        banned_html = "✅ No banned words found."

    # Bullet list of duplicate lines; long ones are cut at 120 characters.
    if dup_lines:
        items = []
        for line in dup_lines:
            ellipsis = "…" if len(line) > 120 else ""
            items.append(f"<li>{html.escape(line[:120])}{ellipsis}</li>")
        dups_html = "<ul>" + "".join(items) + "</ul>"
    else:
        dups_html = "✅ No duplicate lines found."

    return f"""<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>Report Audit — {html.escape(filename)}</title>
{css}
</head>
<body>
  <h1>Report Audit</h1>
  <div class="meta">{html.escape(filename)} • Generated {html.escape(generated_at)}</div>

  <div class="sec">
    <div class="score">Quality Score: {score}/100</div>
  </div>

  <div class="sec">
    <h2>Required Sections</h2>
    {sec_html}
  </div>

  <div class="sec">
    <h2>Banned Words</h2>
    <div>{banned_html}</div>
  </div>

  <div class="sec">
    <h2>Duplicate Lines</h2>
    <div>{dups_html}</div>
  </div>

  <div class="sec">
    <h2>Document (with highlights)</h2>
    <pre>{highlighted_text_html}</pre>
  </div>
</body>
</html>"""

# --- RUN ---
# Pipeline: parse the uploaded DOCX, run each check, score the result,
# then render everything into one HTML report.
full_text, paragraphs, headings_found = read_doc(FILENAME)
section_ok = check_required_sections(full_text, headings_found)
banned_hits = check_banned_words(full_text)
dup_lines   = check_duplicates(paragraphs)
score       = quality_score(section_ok, banned_hits, dup_lines)

highlighted_html = highlight_banned_in_html(full_text, banned_hits)
report_html = build_html_report(
    filename=FILENAME,
    section_ok=section_ok,
    banned_hits=banned_hits,
    dup_lines=dup_lines,
    highlighted_text_html=highlighted_html,
    score=score
)

# Save + display + download
out_path = "report_audit.html"
with open(out_path, "w", encoding="utf-8") as f:
    f.write(report_html)

display(HTML(f"<p><b>Done.</b> Report saved as <code>{out_path}</code>; a browser download has been started.</p>"))
display(HTML(report_html))

# A relative 'files/...' link in cell output is not served from the Colab
# runtime's disk, so hand the saved file to the browser explicitly.
files.download(out_path)
