# ⚖️ Legal Domain LLM SFT Data Construction Pipeline
This notebook covers the complete workflow, from **raw PDF text extraction** to **diverse instruction generation (instruction tuning)**.

### 1. Environment Preparation
Install the required dependencies.

In [None]:
pip install pdfplumber tqdm openai

### 2. Phase 1: PDF Data Cleaning and Parsing
This section reads the PDF files, crops away headers and footers (the top and bottom 5% of each page), strips page numbers, and uses a regular expression to split the text into individual legal articles.

In [None]:
import pdfplumber
import os
import re
import json
from tqdm.notebook import tqdm

# --- Path Configuration ---
# Input directory that is scanned for source PDF files (relative to the current working directory).
RAW_DATA_DIR = '../data/raw' 
# Output directory for the JSONL files this notebook writes.
PROCESSED_DATA_DIR = '../data/processed'
# Create the output directory up front; exist_ok=True makes this a no-op if it already exists.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

def clean_text_smart(text):
    """Strip citation markers and page numbers from extracted text and tidy its spacing.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise the
    text goes through five steps, in this order:

    1. delete bracketed numeric citations such as ``[3]`` or ``［1-2］``;
    2. replace dash-wrapped page numbers such as ``- 12 -`` with one space;
    3. drop lines made up solely of dashes, digits and whitespace;
    4. remove whitespace (line breaks included) that sits between two
       characters in the range U+4E00-U+9FA5;
    5. squeeze runs of spaces, tabs, carriage returns and form feeds into a
       single space.

    Newlines that survive step 4 are kept. The result has its leading and
    trailing whitespace stripped.
    """
    if not text:
        return ""

    # 1. Citation markers, in ASCII brackets and in full-width brackets.
    for citation_re in (r'\[\s*\d+(?:[-–,]\d+)*\s*\]', r'［\s*\d+(?:[-–,]\d+)*\s*］'):
        text = re.sub(citation_re, '', text)

    # 2. Page numbers embedded in running text.
    text = re.sub(r'(?:^|\s|\\n)[-—–－]\s*\d+\s*[-—–－](?=\s|\\n|$)', ' ', text)

    # 3. Lines that contain nothing but a page number (empty lines are kept).
    page_number_line = re.compile(r'[-—–－\s\d]+')
    kept = []
    for raw_line in text.split('\n'):
        if page_number_line.fullmatch(raw_line.strip()) is None:
            kept.append(raw_line)
    text = '\n'.join(kept)

    # 4. Re-join Chinese characters split apart by stray whitespace. Matches
    #    cannot overlap, so "A B C" only becomes "AB C" in one pass; the
    #    second pass closes the gaps the first one had to skip.
    cjk_gap = re.compile(r'([\u4e00-\u9fa5])\s+([\u4e00-\u9fa5])')
    for _ in range(2):
        text = cjk_gap.sub(r'\1\2', text)

    # 5. Collapse horizontal whitespace, then trim both ends.
    return re.sub(r'[ \t\r\f]+', ' ', text).strip()

def process_legal_doc(file_path):
    """Extract the individual legal articles from one PDF file.

    Each page is cropped to drop its top and bottom 5% (where headers,
    footers and page numbers usually sit), the remaining text of all pages
    is concatenated, cleaned with ``clean_text_smart`` and split at every
    "第…条" article marker.

    Args:
        file_path: Path of the PDF file to parse.

    Returns:
        A list of dicts with the keys ``source`` (the file's base name),
        ``type`` (always ``"legal_article"``) and ``content`` (the article
        text with every whitespace run collapsed to one space). Fragments of
        15 characters or fewer are discarded. An empty list is returned when
        the file cannot be opened or read; the error is printed.
    """
    filename = os.path.basename(file_path)
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            pages = tqdm(pdf.pages, desc=f"Parsing {filename}", leave=False)
            for page_no, page in enumerate(pages, start=1):
                width, height = page.width, page.height
                bbox = (0, height * 0.05, width, height * 0.95)
                try:
                    text = page.crop(bbox=bbox).extract_text()
                except Exception as e:
                    # Best effort: one unreadable page must not discard the
                    # whole document, but report it instead of hiding it.
                    # Catching Exception (not a bare except) keeps Ctrl-C /
                    # kernel interrupts working.
                    print(f"Skipping page {page_no} of {filename}: {e}")
                    continue
                if text:
                    page_texts.append(text)
    except Exception as e:
        print(f"Read failed: {e}")
        return []

    # Every page's text is preceded by a newline so that page boundaries do
    # not glue the last and first words of adjacent pages together.
    full_text = clean_text_smart("".join("\n" + t for t in page_texts))

    # An article runs from one "第…条" marker up to (not including) the next
    # marker or the end of the text.
    # NOTE: the marker is matched anywhere in the text, so an in-text
    # reference to another article (e.g. "第五条") also starts a new fragment.
    pattern = r"(第[0-9零一二三四五六七八九十百千]+条[\s\S]*?)(?=第[0-9零一二三四五六七八九十百千]+条|$)"
    matches = re.findall(pattern, full_text)

    # The length filter is applied to the raw match, before whitespace
    # normalization.
    return [{"source": filename, "type": "legal_article", "content": re.sub(r'\s+', ' ', m).strip()}
            for m in matches if len(m) > 15]

In [None]:
# Execute parsing loop
# Bind all_chunks before the directory check so that code running after this
# cell can still reference it (as an empty list) when the raw directory is missing.
all_chunks = []
if os.path.exists(RAW_DATA_DIR):
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order, and therefore the output file, reproducible.
    files = sorted(f for f in os.listdir(RAW_DATA_DIR) if f.lower().endswith('.pdf'))
    for filename in tqdm(files, desc="Total Parsing Progress"):
        if "法" in filename:  # Only process files containing 'Law' in the name
            chunks = process_legal_doc(os.path.join(RAW_DATA_DIR, filename))
            all_chunks.extend(chunks)

    # Save intermediate results (one JSON object per line)
    output_path = os.path.join(PROCESSED_DATA_DIR, 'raw_chunks.jsonl')
    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in all_chunks:
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')
    print(f"✅ Cleaning complete, total of {len(all_chunks)} legal articles obtained.")
else:
    print("❌ raw directory not found, please check the path.")

### 3. Phase 2: Diverse Instruction Data Generation (SFT)
Use an LLM to turn each legal article into one of three instruction types: **case analysis, document drafting, or concept explanation**.

In [None]:
import random
import time
from openai import OpenAI

# --- Fill in your API Configuration ---
# Prefer supplying the key through the SILICONFLOW_API_KEY environment variable
# so that a real secret never has to be saved inside the notebook; the
# placeholder below is only the fallback when the variable is not set.
API_KEY = os.environ.get("SILICONFLOW_API_KEY", "Your_API_Key")
BASE_URL = "https://api.siliconflow.cn/v1"
MODEL_NAME = "deepseek-ai/DeepSeek-V3"

# OpenAI-compatible client pointed at the endpoint configured above.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# Prompt Templates (Case/Doc/Concept)
# One template per task type; each has a single {content} placeholder that is
# filled with the text of a legal article via str.format.
PROMPTS = {
    "case_analysis": "You are a senior lawyer. Please read this legal article: {content}. Construct a consultation case involving multiple conflicting parties. User: Describe the case. Assistant: <Thought Process> Analysis Logic + <Legal Advice> Conclusion.",
    "doc_drafting": "You are a lawyer. Based on this legal article: {content}, construct User: Request to draft related document. Assistant: <Thought Process> Key Points + <Document Body> Content.",
    "concept_explain": "You are a professor. Based on this legal article: {content}, construct User: Layperson asking about a concept. Assistant: <Thought Process> Deconstruction + <Simple Explanation> Examples."
}

def generate_sft_data(chunk):
    """Ask the LLM to turn one legal article into a single SFT sample.

    A task type is drawn at random (60% case analysis, 20% document
    drafting, 20% concept explanation), the matching template from
    ``PROMPTS`` is filled with the article text, and the model is asked for a
    JSON object with ``instruction`` and ``output`` fields.

    Args:
        chunk: Dict describing one article; ``chunk['content']`` is the
            article text and ``chunk.get('source')`` is copied to the result.

    Returns:
        A dict with the keys ``instruction``, ``output``, ``task_type`` and
        ``source``, or ``None`` when the request fails, the reply is not a
        JSON object, or either text field is missing or empty.
    """
    task_type = random.choices(["case_analysis", "doc_drafting", "concept_explain"], weights=[0.6, 0.2, 0.2])[0]
    prompt_tpl = PROMPTS[task_type]

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a legal expert data construction assistant. Please return JSON format, including 'instruction' and 'output' fields."},
                {"role": "user", "content": prompt_tpl.format(content=chunk['content'])}
            ],
            response_format={"type": "json_object"}
        )
        data = json.loads(response.choices[0].message.content)
    except Exception as e:
        # Still best effort (the caller skips None), but report the reason so
        # that e.g. an invalid API key does not silently produce zero samples.
        # Catching Exception rather than everything keeps Ctrl-C working.
        print(f"SFT generation failed ({task_type}): {e}")
        return None

    # Valid JSON is not necessarily an object; anything else has no fields to read.
    if not isinstance(data, dict):
        print(f"SFT generation failed ({task_type}): reply is not a JSON object")
        return None

    instruction = data.get("instruction", "")
    output = data.get("output", "")
    # A sample without an instruction or an answer is useless for training.
    if not instruction or not output:
        print(f"SFT generation failed ({task_type}): missing 'instruction' or 'output'")
        return None

    return {
        "instruction": instruction,
        "output": output,
        "task_type": task_type,
        "source": chunk.get('source')
    }

In [None]:
# Execute generation
# Trial run: only the first 10 parsed articles are sent to the LLM.
test_chunks = all_chunks[:10]  # Test first 10 items first
sft_items = []

for chunk in tqdm(test_chunks, desc="LLM Generating"):
    item = generate_sft_data(chunk)
    if item:
        # Failed generations come back falsy and are skipped.
        sft_items.append(item)
    # Short pause between consecutive requests.
    time.sleep(0.2)

# Save final SFT data (JSON Lines: one sample per line)
sft_path = os.path.join(PROCESSED_DATA_DIR, 'domain_expert_sft.jsonl')
with open(sft_path, 'w', encoding='utf-8') as f:
    for item in sft_items:
        print(json.dumps(item, ensure_ascii=False), file=f)

print(f"✅ Generation complete! File saved to: {sft_path}")