In [5]:
# ⇣ run once at the top of the notebook
# The Groq API key is requested through a hidden prompt instead of being pasted
# into the cell: a literal key ends up in the saved notebook AND in the cell
# output (%env echoes the value it sets).
# NOTE(review): the key that used to be written here has been exposed and
# should be revoked and replaced.
import os
from getpass import getpass

if not os.environ.get("GROQ_TOKEN"):
    os.environ["GROQ_TOKEN"] = getpass("Groq API key: ")


env: GROQ_TOKEN=<redacted>


In [1]:
# ==============================================================
# 📊  Title‑Aware Due‑Diligence Question Generator  (Colab + Groq)
# ==============================================================

# ①  INSTALL  ─ lightweight libs + openai≥1
!pip install -q PyMuPDF tqdm regex python-dotenv "openai>=1.3.7"

# ②  Google Drive
from google.colab import drive
drive.mount("/content/drive")

PDF_DIR = "/content/drive/MyDrive/DueDilDocuments"          # folder scanned for *.pdf by run_dir()
# Markdown report produced by write_md() at the end of the cell
OUT_MD  = "/content/drive/MyDrive/due_diligence_questions.md"

# ③  Groq Cloud key & endpoint  (OpenAI v1-style API)
import os, json, fitz, regex as re2, tqdm
from statistics import quantiles
from pathlib import Path
from openai import OpenAI

# The key is read from the GROQ_TOKEN environment variable (hidden prompt as a
# fallback) rather than written in the notebook, where it would be saved and
# shared together with the code.
from getpass import getpass   # only needed for the prompt fallback below

_groq_key = os.environ.get("GROQ_TOKEN") or getpass("Groq API key: ")
os.environ["GROQ_TOKEN"]      = _groq_key   # remembered so re-running the cell does not prompt again
os.environ["OPENAI_API_KEY"]  = _groq_key
os.environ["OPENAI_BASE_URL"] = "https://api.groq.com/openai/v1"

client = OpenAI()   # picks up the key and base_url from the environment variables

# ④  PDF ➜ sections (heading detection)
# HEAD_RE_CAPS: a whole line of 6 or more characters made up only of uppercase
# letters (\p{Lu}, a Unicode property class handled by the `regex` module
# imported as re2), digits, spaces, "&", "/", en dashes or hyphens.
HEAD_RE_CAPS = re2.compile(r"^[\p{Lu}0-9 &/–-]{6,}$")
# HEAD_RE_NUM: a line starting with a dotted section number ("3", "2.1",
# "4.2.1") followed by whitespace and at least one non-space character.
HEAD_RE_NUM  = re2.compile(r"^\d+(\.\d+)*\s+\S+")

def collect_big_fonts(page):
    """Return the set of font sizes on ``page`` that stand out as "large".

    A size is kept when it is at or above the 90th percentile of all span
    sizes on the page AND strictly larger than the page's median size.

    Args:
        page: object whose ``get_text("dict")`` returns a mapping with a
            ``"blocks"`` list; each block may carry ``"lines"``, each line
            ``"spans"``, and each span a numeric ``"size"``.

    Returns:
        set: the large sizes; empty when the page has fewer than two spans or
        when no size rises above the median.
    """
    from statistics import median

    sizes = [s["size"]
             for blk in page.get_text("dict")["blocks"]
             for ln  in blk.get("lines", [])
             for s   in ln.get("spans", [])]
    # quantiles() raises StatisticsError on a single data point before
    # Python 3.13, and one span cannot be "larger than the rest" anyway.
    if len(sizes) < 2:
        return set()
    thresh = quantiles(sizes, n=10)[-1]
    # When headings are under ~10% of the spans, the 90th percentile is the
    # body size itself; without the median guard every size on the page would
    # be reported as large.
    body_size = median(sizes)
    return {s for s in sizes if s >= thresh and s > body_size}

def lines_with_fonts(page):
    """Yield ``(text, size)`` for every non-blank text line of ``page``.

    ``text`` is the concatenation of the line's span texts with surrounding
    whitespace removed; ``size`` is the largest font size among those spans.
    Blocks without a ``"lines"`` entry and lines whose text is blank are
    skipped.
    """
    layout = page.get_text("dict")
    for block in layout["blocks"]:
        for line in block.get("lines", []):
            spans = line["spans"]
            text = "".join(span["text"] for span in spans).strip()
            if not text:
                continue
            largest = max(span["size"] for span in spans)
            yield text, largest

def split_by_titles(pdf_path):
    """Split a PDF into ``(heading, body)`` sections using heading heuristics.

    A line counts as a heading when it has at most 15 words and either uses
    one of the page's large font sizes (``collect_big_fonts``), ends with ":",
    or matches ``HEAD_RE_CAPS`` / ``HEAD_RE_NUM``.  Text found before the first
    heading is filed under "INTRODUCTION".  A heading directly followed by
    another heading (no body text in between) is replaced by the later one.

    Args:
        pdf_path: path of the PDF, handed to ``fitz.open``.

    Returns:
        list of ``(heading, body)`` tuples in extraction order; ``body`` is the
        section's lines joined with newlines.  Sections with no body lines are
        not emitted.
    """
    sections, buf, head = [], [], "INTRODUCTION"
    # Open through a context manager so the document is closed even if
    # extraction fails; previously one open handle was left behind per PDF.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            big = collect_big_fonts(page)
            for txt, sz in lines_with_fonts(page):
                looks_head = (
                    sz in big or txt.endswith(":") or
                    HEAD_RE_CAPS.match(txt) or HEAD_RE_NUM.match(txt)
                ) and len(txt.split()) <= 15
                if looks_head:
                    # Close the section collected so far before starting a new one.
                    if buf:
                        sections.append((head, "\n".join(buf).strip()))
                        buf = []
                    head = txt.rstrip(":")
                else:
                    buf.append(txt)
    # Flush the text that follows the last heading.
    if buf:
        sections.append((head, "\n".join(buf).strip()))
    return sections

# ⑤  LLM helpers (Groq; the calls below request model "gemma2-9b-it")
# Categories each generated question is sorted into by bucket().  Order matters:
# bucket() tries the labels in this order and falls back to the first one.
LABELS = ["AML / KYC","Fund Regulation","Market Manipulation",
          "Stablecoins","Custody & Wallet Security",
          "Liquidity","Tokenomics","Tax & Reporting"]

def gen_questions(passage:str, k:int=5):
    """Ask the model for up to ``k`` due-diligence questions about ``passage``.

    Sends one chat-completion request through the module-level ``client`` and
    parses the reply line by line: leading list markers (dashes, digits, dots,
    closing parentheses, whitespace) are stripped and a "?" is appended to any
    line that does not already end with one.

    Args:
        passage: section text inserted verbatim into the prompt.
        k: number of questions requested, and the maximum number returned.

    Returns:
        list[str]: at most ``k`` questions in the order the model gave them;
        empty when the reply contains no usable line.

    Errors raised by the API call are not caught here and reach the caller.
    """
    prompt = ("Act like a senior compliance officer, understand deeply this document "
              "and generate the questions that I need to ask to generate a due diligence report:\n\n"
              f"{passage}\n\nPlease provide {k} critical questions, one per line.")

    rsp = client.chat.completions.create(
        model="gemma2-9b-it",
        messages=[{"role":"user","content":prompt}],
        max_tokens=300, temperature=0.7
    )
    # The content field can be None; treat that as an empty reply.
    raw = rsp.choices[0].message.content or ""

    questions = []
    for line in raw.splitlines():
        q = re2.sub(r"^[\-\d\.\)\s]*", "", line).strip()
        if not q:
            # Blank lines and marker-only lines ("1.", "---") used to come out as "?".
            continue
        if q.endswith(":"):
            # Preamble such as "Here are 5 critical questions:" is not a question.
            continue
        questions.append(q if q.endswith("?") else q.rstrip(".") + "?")
    return questions[:k]

def _match_label(answer:str):
    """Map a free-text model answer onto one entry of LABELS.

    Tries, in order: an answer containing a full label, an answer containing
    one of a label's words, and finally the first label as a catch-all.
    Returns None only when LABELS is empty.
    """
    answer_lc = answer.lower()

    # 1) the answer contains a complete label
    for label in LABELS:
        if label.lower() in answer_lc:
            return label

    # 2) the answer contains one of the label's words.  Separator tokens such
    #    as "/" and "&" are skipped: treating them as words sent every answer
    #    containing "/" to "AML / KYC".
    for label in LABELS:
        words = [w.lower() for w in label.split() if any(ch.isalnum() for ch in w)]
        if any(w in answer_lc for w in words):
            return label

    # 3) nothing matched: fall back to the first category
    return LABELS[0] if LABELS else None


def bucket(qs:list[str]):
    """Sort questions into the LABELS categories, one model call per question.

    Args:
        qs: questions to classify.

    Returns:
        dict mapping a label to the list of questions assigned to it, in
        LABELS order and without empty categories; ``{}`` when ``qs`` is empty.
        A question whose API call raises is reported with ``print`` and left
        out of the result.
    """
    if not qs: return {}
    results = {l:[] for l in LABELS}

    for q in qs:
        prompt = (f"Act like a senior compliance officer. Choose ONLY ONE of these categories for the following question: "
                  f"{', '.join(LABELS)}.\n\nQuestion: {q}\n\n"
                  f"Reply with just the category name, nothing else.")

        try:
            rsp = client.chat.completions.create(
                model="gemma2-9b-it",
                messages=[{"role":"user","content":prompt}],
                temperature=0,
                max_tokens=50  # Keep response short
            )

            # The content field can be None; an empty answer takes the
            # fallback category instead of raising and dropping the question.
            category = (rsp.choices[0].message.content or "").strip()

            matched_category = _match_label(category)
            if matched_category:
                results[matched_category].append(q)

        except Exception as e:
            # Best effort: report the failure and continue with the next question.
            print(f"Error processing question: {q}")
            print(f"Error: {e}")

    return {k:v for k,v in results.items() if v}

# ⑥  PDF-folder pipeline
def run_dir(pdf_dir):
    """Generate and categorise questions for every PDF directly inside ``pdf_dir``.

    Each PDF is split into sections with ``split_by_titles``; sections whose
    body has fewer than 40 words are skipped.  For every remaining section,
    5 questions are requested from ``gen_questions`` and sorted by ``bucket``.

    Args:
        pdf_dir: folder searched (non-recursively) for ``*.pdf`` files.

    Returns:
        list of ``(pdf_file_name, section_index, section_title, buckets)``
        tuples.  ``section_index`` is the 1-based position among ALL sections
        of the PDF, including the skipped short ones.  Sections for which
        ``bucket`` returns nothing are left out.
    """
    rows=[]
    # glob() yields files in arbitrary filesystem order; sorting makes the
    # report order the same on every run.
    pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))
    for pdf in tqdm.tqdm(pdf_files, desc="PDFs"):
        for idx,(title,body) in enumerate(split_by_titles(pdf),1):
            if len(body.split())<40: continue
            qs = gen_questions(body,5)
            bk = bucket(qs)
            if bk: rows.append((pdf.name,idx,title,bk))
    return rows

def write_md(rows,outfile):
    """Write the question report to ``outfile`` as UTF-8 Markdown.

    Each row ``(pdf, idx, title, buckets)`` becomes a level-2 heading followed
    by one level-3 heading per category with its questions as a bullet list,
    and is closed by a horizontal rule.  An existing file is overwritten.
    """
    with open(outfile,"w",encoding="utf-8") as report:
        for pdf_name, section_no, heading, buckets in rows:
            chunks = [f"## {pdf_name} – Section {section_no}: {heading}\n\n"]
            for category, questions in buckets.items():
                chunks.append(f"### {category}\n")
                chunks.extend(f"- {question}\n" for question in questions)
                chunks.append("\n")
            chunks.append("\n---\n\n")
            report.write("".join(chunks))

# ⑦  Run: build the question rows for every PDF in PDF_DIR, then write the Markdown report to OUT_MD
rows = run_dir(PDF_DIR)
write_md(rows, OUT_MD)
print("✅ Rapport généré :", OUT_MD)

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.0/20.0 MB 14.9 MB/s eta 0:00:00
Mounted at /content/drive


PDFs: 100%|██████████| 5/5 [25:10<00:00, 302.19s/it]

✅ Rapport généré : /content/drive/MyDrive/due_diligence_questions.md



