In [44]:
# SINGLE-CELL: Full ClauseWise app (flashy UI + multi-feature backend)
# Copy-paste into one Colab python cell and run.

# 1) Install dependencies
!pip install -q flask flask-cors pyngrok pdfplumber python-docx pillow pytesseract requests pdf2image

# 2) Full app code (writes templates/static then runs server)
import os, io, time, uuid, tempfile
from flask import Flask, request, jsonify, render_template, send_file
from flask_cors import CORS
from pyngrok import ngrok
from PIL import Image
import pytesseract
import pdfplumber
import docx
import requests
from pdf2image import convert_from_path

# ---------------- CONFIG ----------------
# SECURITY NOTE: credentials must not live in source control. Each token is
# read from the environment first; the literal fallbacks below only keep the
# existing notebook runnable. They have been exposed by being committed here,
# so they should be rotated and the fallbacks removed.

# Hugging Face API token (override with the HUGGING_FACE_TOKEN env variable).
HUGGING_FACE_TOKEN = os.environ.get(
    "HUGGING_FACE_TOKEN", "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"
)

# Ngrok auth token (override with the NGROK_TOKEN env variable).
NGROK_TOKEN = os.environ.get(
    "NGROK_TOKEN", "31zncZCWYbr5fHKJ1wXdn4A0191_37bhg5ZoKURDu4eEcumWJ"
)

# Model to use for generative tasks (prompt-based)
HF_MODEL = "google/flan-t5-large"   # good general-purpose instruction model
HF_API_URL = f"https://api-inference.huggingface.co/models/{HF_MODEL}"
HF_HEADERS = {"Authorization": f"Bearer {HUGGING_FACE_TOKEN}"}

# Working directories: Jinja templates, static assets, and uploaded files.
for _dir in ("templates", "static", "uploads"):
    os.makedirs(_dir, exist_ok=True)

# ---------------- Flask ----------------
# The app serves templates/ and static/ from the directories created above;
# CORS is enabled for every route.
app = Flask(__name__, static_folder="static", template_folder="templates")
CORS(app)

# ---------------- Helper: Text extraction ----------------
def ocr_image(pil_img):
    """Run Tesseract OCR on *pil_img* and return the recognised text.

    Best-effort helper: any failure while running OCR is swallowed and an
    empty string is returned instead of raising.
    """
    try:
        recognised = pytesseract.image_to_string(pil_img)
    except Exception:
        recognised = ""
    return recognised

def extract_text_from_pdf(path):
    """Extract the text of the PDF at *path*, falling back to OCR where needed.

    Each page is first read through pdfplumber's text layer. A page with no
    usable text is rendered to an image and OCR'd, via pdf2image first and
    pdfplumber's own renderer if that fails. If pdfplumber fails on the
    document as a whole, every page is rendered with pdf2image and OCR'd.

    Returns:
        The non-empty page texts joined by newlines, or "" when nothing
        could be extracted.
    """
    text_parts = []
    try:
        with pdfplumber.open(path) as pdf:
            for i, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                if page_text and page_text.strip():
                    text_parts.append(page_text)
                    continue

                # No text layer on this page: render it and OCR the image.
                try:
                    images = convert_from_path(path, first_page=i, last_page=i, dpi=300)
                    if images:
                        text_parts.append(ocr_image(images[0]))
                except Exception:
                    try:
                        # PageImage.original is already a PIL image, so it can
                        # be handed to the OCR helper directly.
                        rendered = page.to_image(resolution=300).original
                        text_parts.append(ocr_image(rendered))
                    except Exception:
                        # Best effort: skip pages that cannot be rendered.
                        continue
    except Exception:
        # pdfplumber failed on the document. Start over with a full OCR pass,
        # discarding pages collected so far so they are not duplicated.
        text_parts = []
        try:
            for img in convert_from_path(path, dpi=300):
                text_parts.append(ocr_image(img))
        except Exception:
            return ""
    return "\n".join(p for p in text_parts if p and p.strip())

def extract_text_generic(path):
    """Extract text from the file at *path*, dispatching on its extension.

    Supported (case-insensitive) extensions:
        .pdf                             -> extract_text_from_pdf
        .docx                            -> paragraph text via python-docx
        .png/.jpg/.jpeg/.bmp/.tiff       -> OCR
        .txt/.md/.csv                    -> read as UTF-8 (bad bytes ignored)

    Returns:
        The extracted text, or "" for unsupported extensions and for any
        read/parse failure (best-effort: errors are not raised).
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        return extract_text_from_pdf(path)

    if ext == ".docx":
        try:
            doc = docx.Document(path)
            return "\n".join(p.text for p in doc.paragraphs if p.text)
        except Exception:
            return ""

    if ext in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
        try:
            # Context manager closes the underlying file handle once OCR is done.
            with Image.open(path) as img:
                return ocr_image(img)
        except Exception:
            return ""

    if ext in (".txt", ".md", ".csv"):
        try:
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        except Exception:
            return ""

    return ""

# ---------------- Helper: call HF model (instruction prompt) ----------------
def call_hf_instruction(prompt, max_tokens=512):
    """POST *prompt* to the configured Hugging Face inference endpoint.

    Returns a dict with exactly one of two keys:
        {"text": ...}   the generated text (or a stringified payload when the
                        response has an unrecognised shape)
        {"error": ...}  a non-200 status, an API-reported error, or any
                        exception raised while making/parsing the request

    NOTE(review): ``max_tokens`` is accepted but not sent to the API.
    """
    body = {"inputs": prompt, "options": {"wait_for_model": True}}
    try:
        response = requests.post(HF_API_URL, headers=HF_HEADERS, json=body, timeout=120)
        if response.status_code != 200:
            return {"error": f"Hugging Face API returned {response.status_code}: {response.text}"}
        data = response.json()

        # Dict payloads: either an API error or a single result object.
        if isinstance(data, dict):
            if "error" in data:
                return {"error": data["error"]}
            found = next(
                (key for key in ("generated_text", "summary_text", "text") if key in data),
                None,
            )
            if found is not None:
                return {"text": data[found]}
            return {"text": str(data)}

        # Generative responses usually arrive as a non-empty list of results.
        if isinstance(data, list) and data:
            head = data[0]
            if isinstance(head, dict) and "generated_text" in head:
                return {"text": head["generated_text"]}
            return {"text": str(head)}

        # Anything else (including an empty list) is stringified as-is.
        return {"text": str(data)}
    except Exception as exc:
        return {"error": f"Request failed: {exc}"}

# ---------------- Chunking helper ----------------
def chunk_text(text, max_chars=3000):
    """Split *text* into whitespace-stripped chunks of at most *max_chars* chars.

    Each chunk boundary is moved back to the last newline inside the window,
    or failing that the last space, so chunks break between lines/words when
    possible. A window with no such break point is cut at *max_chars*.

    Args:
        text: the string to split.
        max_chars: maximum window size; must be positive.

    Returns:
        A list of non-empty chunks (empty list for empty/blank text).

    Raises:
        ValueError: if *max_chars* is not positive (the window could never
            advance, which previously resulted in an infinite loop).
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chars, n)
        if end < n:
            # Prefer a line break; fall back to a space. A break located at
            # `start` itself is ignored because it would make no progress.
            cut = text.rfind("\n", start, end)
            if cut <= start:
                cut = text.rfind(" ", start, end)
            if cut > start:
                end = cut
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks

# ---------------- High-level feature implementations (prompt based) ----------------
def feature_simplify(text):
    """Ask the model to rewrite the clauses in *text* in plain language.

    Returns whatever call_hf_instruction returns for the assembled prompt.
    """
    instructions = (
        "You are ClauseWise — simplify the legal clauses below into short, clear, layman-friendly language. "
        "Keep the meaning exactly the same. For each clause, output a heading 'Clause X:' and then the simplified text."
    )
    return call_hf_instruction(f"{instructions}\n\nDocument excerpt:\n{text}")

def feature_ner(text):
    """Ask the model to extract named entities from *text* as JSON.

    Returns whatever call_hf_instruction returns for the assembled prompt;
    the model output is not parsed or validated here.
    """
    instructions = (
        "You are ClauseWise. Extract named entities from the text and return JSON with these top-level keys: "
        "'parties' (list), 'dates' (list), 'money' (list), 'obligations' (list of short obligations), 'terms' (list of legal terms)."
    )
    return call_hf_instruction(f"{instructions}\n\nText:\n{text}\n\nReturn only valid JSON.")

def feature_clause_extract(text):
    """Ask the model to split *text* into numbered clauses, as a JSON array.

    Returns whatever call_hf_instruction returns for the assembled prompt;
    the model output is not parsed or validated here.
    """
    instructions = (
        "You are ClauseWise. Split the following legal document into numbered clauses. Output a JSON array of objects "
        "each with keys: 'clause_number' and 'clause_text'. Keep clauses concise."
    )
    return call_hf_instruction(f"{instructions}\n\nDocument:\n{text}\n\nReturn only JSON.")

def feature_classify(text):
    """Ask the model to classify the document in *text* into a contract type.

    Returns whatever call_hf_instruction returns for the assembled prompt;
    the model output is not parsed or validated here.
    """
    instructions = (
        "You are ClauseWise. Classify the uploaded legal document into one of: NDA, Lease, Employment Contract, Service Agreement, Other. "
        'Give a one-line explanation for the classification. Return JSON like: {"type":"NDA","confidence":0.9,"explanation":"..."}.'
    )
    return call_hf_instruction(f"{instructions}\n\nDocument excerpt:\n{text}")

# ---------------- ROUTES ----------------
@app.route("/")
def index():
    """Serve the single-page UI rendered from templates/index.html."""
    page = render_template("index.html")
    return page

@app.route("/analyze", methods=["POST"])
def analyze_route():
    """Extract text from an uploaded document and run the requested analysis.

    Multipart form fields:
        file:    the document to analyse (required).
        feature: "simplify", "ner", "clauses" or "classify"; any other value
                 (default "all") runs every feature.

    Responses:
        400 {"error": ...}   no file supplied, or no text could be extracted.
        502 {"error": ...}   the model call failed for a single-feature request.
        200 {"result": ...}            for "simplify"
        200 {"result_raw": ...}        for "ner" / "clauses" / "classify"
        200 {"analysis": {...}, "meta": {"chars": N}}   for "all"; per-feature
            failures are reported inside "analysis" instead of failing the call.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided"}), 400

    f = request.files["file"]
    orig_name = f.filename or f"upload_{uuid.uuid4().hex}"
    # Strip path separators so the client-supplied name cannot leave uploads/.
    safe_name = orig_name.replace("/", "_").replace("\\", "_")
    tmp_path = os.path.join("uploads", f"{uuid.uuid4().hex}_{safe_name}")
    f.save(tmp_path)

    # The upload is only needed for text extraction; remove it afterwards so
    # user documents do not accumulate on disk.
    try:
        text = extract_text_generic(tmp_path)
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    if not text or not text.strip():
        return jsonify({"error": "Could not extract any readable text from the uploaded file."}), 400

    # feature selection from request.form (default to all)
    feature = request.form.get("feature", "all")

    if feature == "simplify":
        # Large documents are simplified chunk by chunk.
        out_parts = []
        for c in chunk_text(text, 2500):
            res = feature_simplify(c)
            if "error" in res:
                return jsonify({"error": res["error"]}), 502
            out_parts.append(res.get("text", ""))
        return jsonify({"result": "\n\n---\n\n".join(out_parts)})

    # Single-call features share the same request/response handling.
    single_call_features = {
        "ner": feature_ner,
        "clauses": feature_clause_extract,
        "classify": feature_classify,
    }
    handler = single_call_features.get(feature)
    if handler is not None:
        res = handler(text)
        if "error" in res:
            return jsonify({"error": res["error"]}), 502
        return jsonify({"result_raw": res.get("text", "")})

    # "all" - run every feature (may take longer).
    results = {}
    simp_parts = []
    for c in chunk_text(text, 2500):
        r = feature_simplify(c)
        if "error" in r:
            # Keep whatever was simplified so far and record the failure.
            results["simplify_error"] = r["error"]
            break
        simp_parts.append(r.get("text", ""))
    results["simplified"] = "\n\n---\n\n".join(simp_parts)

    for key, run in (
        ("ner_raw", feature_ner),
        ("clauses_raw", feature_clause_extract),
        ("classification_raw", feature_classify),
    ):
        res = run(text)
        # On failure the error message takes the place of the result text.
        results[key] = res.get("text", "") if "text" in res else res.get("error", "")

    return jsonify({"analysis": results, "meta": {"chars": len(text)}})

# ---------------- Download report (simple txt) ----------------
@app.route("/download", methods=["POST"])
def download_report():
    """Return the posted analysis as a downloadable plain-text report.

    Expects a JSON object with:
        analysis: a dict (rendered as one "=== key ===" section per entry)
                  or any other value (rendered with str()).
        filename: optional download name, default "clausewise_report.txt".

    Responds with 400 {"error": ...} when the body is not a JSON object or
    has no "analysis" key. The report is streamed from memory, so no file is
    left behind on disk.
    """
    # silent=True: a missing/invalid JSON body yields None and therefore the
    # same JSON 400 as a missing "analysis" key.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or "analysis" not in payload:
        return jsonify({"error":"No analysis provided"}), 400

    # An in-memory stream has no path to fall back on, so a name is required.
    fname = str(payload.get("filename") or "clausewise_report.txt")

    a = payload["analysis"]
    if isinstance(a, dict):
        body = [f"=== {k} ===\n{v}\n\n" for k, v in a.items()]
    else:
        body = [str(a)]

    report = io.BytesIO("\n".join(body).encode("utf-8"))
    return send_file(report, as_attachment=True, download_name=fname)

# ------------------ Frontend: Big flashy UI (templates/index.html + static/script.js + static/styles.css) ----------------
# Markup for the single-page UI. It is written to templates/index.html further
# down and rendered by the "/" route. The element ids used here (feature,
# dropzone, fileInput, analyzeBtn, clearBtn, downloadBtn, bar, ptext, results,
# simplified, ner, clauses, classification) are the ones script_js looks up.
index_html = r'''
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>ClauseWise — Legal Document Analyzer</title>
  <meta name="viewport" content="width=device-width,initial-scale=1" />
  <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
  <div class="app">
    <aside class="sidebar">
      <div class="brand"><div class="logo">⚖️</div><div><h2>ClauseWise</h2><div class="tag">Legal Document Analyzer</div></div></div>
      <div class="desc">AI-powered clause simplification, NER, clause extraction, classification & more.</div>
      <div class="controls">
        <label for="feature">Choose feature</label>
        <select id="feature">
          <option value="all" selected>All (full analysis)</option>
          <option value="simplify">Clause Simplification</option>
          <option value="ner">Named Entity Recognition (NER)</option>
          <option value="clauses">Clause Extraction</option>
          <option value="classify">Document Classification</option>
        </select>
      </div>
      <div class="footer">Powered by Granite (placeholder) • IBM Watson (placeholder) • HF</div>
    </aside>

    <main class="main">
      <header><h1>ClauseWise</h1><p>Upload PDF, DOCX, TXT or Image — we extract, analyze & summarize.</p></header>

      <section class="uploader">
        <div id="dropzone" class="dropzone">
          <div class="dz-text">Drop files here or click to browse</div>
          <input id="fileInput" type="file" accept=".pdf,.docx,.txt,.md,.png,.jpg,.jpeg"/>
        </div>
        <div class="buttons">
          <button id="analyzeBtn" class="primary">Analyze Document</button>
          <button id="clearBtn" class="ghost">Clear</button>
          <button id="downloadBtn" class="small">Download Report</button>
        </div>
        <div id="progress" class="progress"><div id="bar" class="bar"></div><div id="ptext" class="ptext">Waiting</div></div>
      </section>

      <section id="results" class="results hidden">
        <div class="result-card">
          <div class="result-header"><h3>Analysis Results</h3></div>
          <div class="grid">
            <div class="panel"><h4>General Summary / Simplified</h4><pre id="simplified"></pre></div>
            <div class="panel"><h4>Named Entities (NER)</h4><pre id="ner"></pre></div>
            <div class="panel full"><h4>Clauses</h4><pre id="clauses"></pre></div>
            <div class="panel"><h4>Classification</h4><pre id="classification"></pre></div>
          </div>
        </div>
      </section>
    </main>
  </div>
  <script src="/static/script.js"></script>
</body>
</html>
'''

# Stylesheet for the UI. It is written to static/styles.css further down, which
# is the path the page links to. The `.hidden` rule is what script_js toggles
# on the results section to show or hide it.
styles_css = r'''
:root{
  --bg:#071124;--card:rgba(255,255,255,0.03);--accent:#00d4ff;--muted:#9fb4c8;
}
*{box-sizing:border-box}
html,body{height:100%;margin:0;font-family:Inter,Segoe UI,Roboto;background:linear-gradient(135deg,var(--bg),#0d1b2a);color:#e8f6ff}
.app{display:flex;gap:20px;height:100vh;padding:28px}
.sidebar{width:300px;background:linear-gradient(180deg,rgba(255,255,255,0.02),rgba(255,255,255,0.01));padding:20px;border-radius:14px;display:flex;flex-direction:column;gap:12px}
.brand{display:flex;gap:12px;align-items:center}
.logo{width:56px;height:56px;border-radius:12px;background:linear-gradient(90deg,var(--accent),#6b7bff);display:flex;align-items:center;justify-content:center;font-size:22px}
.tag{font-size:12px;color:var(--muted)}
.desc{font-size:13px;color:var(--muted)}
.controls{margin-top:8px}
select{width:100%;padding:10px;border-radius:8px;border:none;background:rgba(255,255,255,0.02);color:#e8f6ff}
.footer{font-size:12px;color:var(--muted);margin-top:auto}

/* main */
.main{flex:1;display:flex;flex-direction:column;gap:14px}
header{background:var(--card);padding:18px;border-radius:12px}
.uploader{background:var(--card);padding:18px;border-radius:12px;display:flex;flex-direction:column;gap:12px}
.dropzone{border:2px dashed rgba(255,255,255,0.04);padding:18px;border-radius:10px;display:flex;align-items:center;justify-content:center;cursor:pointer}
.dropzone:hover{background:rgba(255,255,255,0.01)}
.dropzone input{display:none}
.buttons{display:flex;gap:8px}
button{padding:10px 14px;border-radius:10px;border:none;cursor:pointer}
.primary{background:linear-gradient(90deg,var(--accent),#6b7bff);color:#012;font-weight:700}
.ghost{background:transparent;border:1px solid rgba(255,255,255,0.04);color:var(--muted)}
.small{background:rgba(255,255,255,0.02);color:var(--muted)}
.progress{height:10px;background:rgba(255,255,255,0.03);border-radius:8px;overflow:hidden;position:relative}
.bar{height:100%;width:0%;background:linear-gradient(90deg,var(--accent),#6b7bff);transition:width 400ms}
.ptext{position:absolute;right:10px;top:-22px;color:var(--muted);font-size:12px}

/* results */
.results{margin-top:6px}
.result-card{background:linear-gradient(180deg,rgba(255,255,255,0.02),rgba(255,255,255,0.01));padding:12px;border-radius:12px}
.grid{display:grid;grid-template-columns:1fr 1fr;gap:12px}
.panel{background:rgba(0,0,0,0.35);padding:12px;border-radius:8px;min-height:120px;overflow:auto}
.panel.full{grid-column:1 / -1}
.hidden{display:none}
pre{white-space:pre-wrap;color:#e6f7ff;font-size:13px}
'''

# Client-side logic for the UI. It is written to static/script.js further down.
# It wires the dropzone/file input, posts the selected file and feature to
# /analyze as multipart form data, fills the result panels from the JSON
# response ("result", "result_raw" or "analysis", depending on the feature),
# and posts the panel contents to /download to fetch a text report.
script_js = r'''
const dropzone = document.getElementById("dropzone");
const fileInput = document.getElementById("fileInput");
const analyzeBtn = document.getElementById("analyzeBtn");
const clearBtn = document.getElementById("clearBtn");
const downloadBtn = document.getElementById("downloadBtn");
const bar = document.getElementById("bar");
const ptext = document.getElementById("ptext");
const results = document.getElementById("results");
const simplified = document.getElementById("simplified");
const ner = document.getElementById("ner");
const clauses = document.getElementById("clauses");
const classification = document.getElementById("classification");
const featureSelect = document.getElementById("feature");

// dropzone UX
dropzone.addEventListener("click", ()=> fileInput.click());
dropzone.addEventListener("dragover", (e)=>{ e.preventDefault(); dropzone.style.borderColor = "#6ee6ff"; });
dropzone.addEventListener("dragleave", (e)=>{ e.preventDefault(); dropzone.style.borderColor = ""; });
dropzone.addEventListener("drop", (e)=>{ e.preventDefault(); fileInput.files = e.dataTransfer.files; });

// clear
clearBtn.addEventListener("click", ()=>{
  fileInput.value = "";
  results.classList.add("hidden");
  bar.style.width = "0%";
  ptext.textContent = "Waiting";
  simplified.textContent = ""; ner.textContent=""; clauses.textContent=""; classification.textContent="";
});

// analyze
analyzeBtn.addEventListener("click", async ()=>{
  if(!fileInput.files.length){ alert("Please select a file."); return; }
  const f = fileInput.files[0];
  const feature = featureSelect.value;
  const fd = new FormData(); fd.append("file", f); fd.append("feature", feature);
  bar.style.width = "12%"; ptext.textContent = "Uploading...";
  try{
    const resp = await fetch("/analyze", { method: "POST", body: fd });
    bar.style.width = "50%"; ptext.textContent = "Analyzing...";
    if(!resp.ok){
      const err = await resp.json().catch(()=>({error:"Server error"}));
      alert(err.error || "Server error");
      bar.style.width = "0%"; ptext.textContent="Error";
      return;
    }
    const data = await resp.json();
    bar.style.width = "100%"; ptext.textContent = "Done";

    // adapt to feature
    if(feature === "simplify"){
      simplified.textContent = data.result || data.result_raw || data.analysis || JSON.stringify(data);
      results.classList.remove("hidden");
      ner.textContent=""; clauses.textContent=""; classification.textContent="";
    } else if(feature === "ner"){
      ner.textContent = data.result_raw || data.analysis || JSON.stringify(data);
      results.classList.remove("hidden");
      simplified.textContent=""; clauses.textContent=""; classification.textContent="";
    } else if(feature === "clauses"){
      clauses.textContent = data.result_raw || data.analysis || JSON.stringify(data);
      results.classList.remove("hidden");
      simplified.textContent=""; ner.textContent=""; classification.textContent="";
    } else if(feature === "classify"){
      classification.textContent = data.result_raw || data.analysis || JSON.stringify(data);
      results.classList.remove("hidden");
      simplified.textContent=""; ner.textContent=""; clauses.textContent="";
    } else {
      // all
      const analysis = data.analysis || data;
      simplified.textContent = analysis.simplified || JSON.stringify(analysis.simplified || "");
      ner.textContent = analysis.ner_raw || JSON.stringify(analysis.ner_raw || "");
      clauses.textContent = analysis.clauses_raw || JSON.stringify(analysis.clauses_raw || "");
      classification.textContent = analysis.classification_raw || JSON.stringify(analysis.classification_raw || "");
      results.classList.remove("hidden");
    }
  }catch(e){
    alert("Request failed: "+e.message);
    bar.style.width="0%"; ptext.textContent="Failed";
  }
});

// download report (collect displayed panels)
downloadBtn.addEventListener("click", async ()=>{
  const analysis = {
    "Simplified": simplified.textContent,
    "NER": ner.textContent,
    "Clauses": clauses.textContent,
    "Classification": classification.textContent
  };
  try{
    const res = await fetch("/download", { method:"POST", headers:{"Content-Type":"application/json"}, body: JSON.stringify({analysis, filename:"ClauseWise_Report.txt"})});
    if(res.ok){
      const blob = await res.blob(); const url = URL.createObjectURL(blob); const a=document.createElement("a"); a.href=url; a.download="ClauseWise_Report.txt"; document.body.appendChild(a); a.click(); a.remove(); URL.revokeObjectURL(url);
    } else alert("Download failed");
  }catch(e){ alert("Download failed: "+e.message); }
});
'''

# Write frontend files
# Write each frontend asset to the location Flask serves it from.
for _target, _content in (
    ("templates/index.html", index_html),
    ("static/styles.css", styles_css),
    ("static/script.js", script_js),
):
    with open(_target, "w", encoding="utf-8") as _out:
        _out.write(_content)

# ---------------- Start server (ngrok)
# Open the public tunnel first, then start Flask on the same port.
ngrok.set_auth_token(NGROK_TOKEN)
_tunnel = ngrok.connect(5000)
public_url = _tunnel.public_url
print("🚀 ClauseWise running at:", public_url)
# app.run blocks until the server is stopped.
app.run(host="0.0.0.0", port=5000)


🚀 ClauseWise running at: https://0bb51266ffde.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [30/Aug/2025 07:16:42] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Aug/2025 07:16:43] "GET /static/styles.css HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Aug/2025 07:16:43] "GET /static/script.js HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Aug/2025 07:16:44] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [30/Aug/2025 07:17:53] "POST /analyze HTTP/1.1" 200 -
