In [3]:
import os
import re
import json
from langdetect import detect
from datasketch import MinHash, MinHashLSH
from glob import glob
from tqdm import tqdm
from collections import Counter

# ------------ Config ------------
# Input locations, one per upstream task; all three are read in Step 1.
# JSON file parsed with json.load; each entry contributes "title" + "abstract".
INPUT_JSON1 = "arxiv_clean.json"   # Task 1
# Glob pattern; the full content of every matching file becomes one document.
INPUT_TXT2 = "pdf_text/*.txt"            # Task 2
# JSON Lines file; each record contributes "speaker_text" + "ocr_text".
INPUT_JSONL3 = "task3/talks_transcripts.jsonl" # Task 3
# Destination of the cleaned corpus (written one document per line).
OUTPUT_CORPUS = "task4/clean_corpus.txt"
# Destination of the Markdown statistics report.
OUTPUT_STATS = "task4/stats.md"
# Threshold handed to MinHashLSH during deduplication.
SIMILARITY_THRESHOLD = 0.7
# How many of the most frequent words are listed in the statistics report.
TOP_N_WORDS = 20

# ------------ Step 1: Load Data ------------
# Collect raw text from the three upstream sources into one flat list of
# strings. A source that is absent is silently skipped; a source that is
# present but unreadable/malformed is reported and skipped.
documents = []

print("📥 Loading documents...")

# Task 1: records with a "title" and an optional "abstract".
if os.path.exists(INPUT_JSON1):
    with open(INPUT_JSON1, "r", encoding="utf-8") as f:
        try:
            papers = json.load(f)
        except json.JSONDecodeError:
            print("⚠️ Could not parse Task 1 JSON file, skipping...")
        else:
            for p in papers:
                # A missing or null field becomes an empty string so that a
                # single incomplete record cannot abort the whole load.
                title = p.get("title") or ""
                abstract = p.get("abstract") or ""
                documents.append(f"{title}\n{abstract}")

# Task 2: every file matched by the glob is one document.
for file in glob(INPUT_TXT2):
    try:
        # errors="ignore" drops undecodable bytes instead of failing the file.
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            documents.append(f.read())
    except OSError as e:
        print(f"⚠️ Could not read {file}: {e}")

# Task 3: one JSON record per non-blank line.
if os.path.exists(INPUT_JSONL3):
    with open(INPUT_JSONL3, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed line in Task 3: {e}")
                continue
            # Null or missing fields are treated as empty text (see Task 1).
            speaker_text = rec.get("speaker_text") or ""
            ocr_text = rec.get("ocr_text") or ""
            documents.append(f"{speaker_text}\n{ocr_text}")

print(f"✅ Loaded {len(documents)} documents from Tasks 1–3")

# ------------ Step 2: Language Detection (keep English) ------------
# Keep only documents that langdetect classifies as English ("en").
print("🌐 Detecting language...")
docs_lang_filtered = []
for d in tqdm(documents, desc="Language filtering"):
    try:
        is_english = detect(d) == "en"
    except Exception:
        # Best-effort filter: a document the detector cannot classify is
        # dropped. `Exception` (not a bare `except`) keeps Ctrl-C and
        # SystemExit working while the loop runs.
        continue
    if is_english:
        docs_lang_filtered.append(d)

# ------------ Step 3: Strip HTML Noise ------------
def clean_html(text):
    """Strip HTML tags and character references and collapse whitespace.

    Tags (``<...>``) and character references are each replaced by a single
    space. Both named (``&nbsp;``) and numeric (``&#39;``, ``&#x27;``)
    references are removed. Runs of whitespace are then collapsed to one
    space and the result is stripped.

    Args:
        text: Raw document text, possibly containing HTML markup.

    Returns:
        The text on a single line with markup removed.
    """
    text = re.sub(r"<[^>]+>", " ", text)
    # Numeric references start with '#', which \w does not match, so they
    # need their own alternatives next to the named form.
    text = re.sub(r"&(?:#\d+|#[xX][0-9A-Fa-f]+|\w+);", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Run every language-filtered document through clean_html, in order.
print("🧹 Removing HTML noise...")
docs_no_html = list(map(clean_html, tqdm(docs_lang_filtered, desc="Stripping HTML")))

# ------------ Step 4: Deduplication with MinHash ------------
# Each document is summarised by a MinHash over its set of distinct
# whitespace-separated tokens. A document is kept (and indexed) only when
# the LSH index returns no match for its signature.
print("🌀 Deduplicating documents...")
unique_docs = []
lsh = MinHashLSH(threshold=SIMILARITY_THRESHOLD, num_perm=128)

for position, candidate in enumerate(tqdm(docs_no_html, desc="Deduplication")):
    signature = MinHash(num_perm=128)
    for token in set(candidate.split()):
        signature.update(token.encode("utf8"))
    if lsh.query(signature):
        # The index already holds a matching document; drop this one.
        continue
    lsh.insert(f"doc{position}", signature)
    unique_docs.append(candidate)

# ------------ Step 5: Remove PII ------------
def remove_pii(text):
    """Mask e-mail addresses, card-like numbers and phone-like numbers.

    The substitutions run in a fixed order (e-mail, credit card, phone);
    each pass operates on the output of the previous one.

    Args:
        text: Document text to redact.

    Returns:
        The text with matches replaced by ``[EMAIL]``, ``[CREDIT_CARD]``
        and ``[PHONE]`` placeholders.
    """
    substitutions = (
        (r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "[EMAIL]"),
        # 13 to 16 digits, optionally separated by spaces or hyphens.
        (r"\b(?:\d[ -]*?){13,16}\b", "[CREDIT_CARD]"),
        # Optional '+', then at least ten characters of digits/space/hyphen.
        (r"\+?\d[\d -]{8,}\d", "[PHONE]"),
    )
    redacted = text
    for pattern, placeholder in substitutions:
        redacted = re.sub(pattern, placeholder, redacted)
    return redacted

# Redact every deduplicated document with remove_pii, in order.
print("🔒 Removing PII...")
docs_no_pii = list(map(remove_pii, tqdm(unique_docs, desc="PII removal")))

# ------------ Step 6: Remove repetitive n-grams ------------
def remove_repetitions(text, n=3):
    """Drop later occurrences of any n-gram that already appeared in *text*.

    The text is split on whitespace and scanned left to right. When the
    ``n`` tokens starting at the current position form an n-gram that was
    already seen, all ``n`` tokens are skipped. Dropping only the first of
    them would leave the rest of the repeated phrase behind. The final
    ``n - 1`` tokens cannot start a full n-gram and are always kept.

    Args:
        text: Input text; tokens are whitespace-separated.
        n: Length of the n-grams to compare (default 3).

    Returns:
        The remaining tokens joined by single spaces.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    tokens = text.split()
    cleaned, seen = [], set()
    i = 0
    while i < len(tokens):
        ngram = tuple(tokens[i:i + n])
        if len(ngram) == n and ngram in seen:
            # Repeated phrase: skip the whole n-gram, not just its head.
            i += n
            continue
        seen.add(ngram)
        cleaned.append(tokens[i])
        i += 1
    return " ".join(cleaned)

# Apply remove_repetitions (default n) to every redacted document, in order.
print("♻️ Removing repetitive n-grams...")
docs_cleaned = list(map(remove_repetitions, tqdm(docs_no_pii, desc="Repetition cleanup")))

# ------------ Step 7: Save Output ------------
# Write the cleaned corpus, one document per line. The output directory is
# created first so a fresh checkout without it does not fail on open().
os.makedirs(os.path.dirname(OUTPUT_CORPUS) or ".", exist_ok=True)
with open(OUTPUT_CORPUS, "w", encoding="utf-8") as f:
    f.writelines(d + "\n" for d in docs_cleaned)

# ------------ Step 8: Stats ------------
# Token counts are whitespace-token counts before and after cleaning.
original_tokens = sum(len(d.split()) for d in documents)
cleaned_tokens = sum(len(d.split()) for d in docs_cleaned)
removed_docs = len(documents) - len(docs_cleaned)
# With no input documents the percentage is reported as 0 instead of
# raising ZeroDivisionError.
removed_pct = removed_docs / len(documents) * 100 if documents else 0.0

# Word frequency (top N), counted per document to avoid building one huge
# joined string first; the resulting counts and ordering are the same.
word_counts = Counter()
for d in docs_cleaned:
    word_counts.update(d.split())
top_words = word_counts.most_common(TOP_N_WORDS)

# Make sure the report's directory exists before opening the file.
os.makedirs(os.path.dirname(OUTPUT_STATS) or ".", exist_ok=True)
with open(OUTPUT_STATS, "w", encoding="utf-8") as f:
    f.write("# Cleaning Statistics\n")
    f.write(f"- Original documents: {len(documents)}\n")
    f.write(f"- Cleaned documents: {len(docs_cleaned)}\n")
    f.write(f"- Removed documents: {removed_docs} ({removed_pct:.2f}%)\n")
    f.write(f"- Original tokens: {original_tokens}\n")
    f.write(f"- Cleaned tokens: {cleaned_tokens}\n\n")
    f.write("## Top Frequent Words\n")
    f.write("| Word | Count |\n|------|-------|\n")
    for word, count in top_words:
        f.write(f"| {word} | {count} |\n")

print(f"✅ Cleaning complete. Saved corpus to {OUTPUT_CORPUS} and stats to {OUTPUT_STATS}")


📥 Loading documents...
⚠️ Skipping malformed line in Task 3: Expecting value: line 1 column 1 (char 0)
✅ Loaded 274 documents from Tasks 1–3
🌐 Detecting language...


Language filtering: 100%|███████████████████████████████████████████████████████████| 274/274 [00:01<00:00, 146.87it/s]


🧹 Removing HTML noise...


Stripping HTML: 100%|██████████████████████████████████████████████████████████████| 259/259 [00:00<00:00, 5558.89it/s]


🌀 Deduplicating documents...


Deduplication: 100%|████████████████████████████████████████████████████████████████| 259/259 [00:00<00:00, 335.57it/s]


🔒 Removing PII...


PII removal: 100%|█████████████████████████████████████████████████████████████████| 259/259 [00:00<00:00, 2563.34it/s]


♻️ Removing repetitive n-grams...


Repetition cleanup: 100%|██████████████████████████████████████████████████████████| 259/259 [00:00<00:00, 3225.99it/s]

✅ Cleaning complete. Saved corpus to task4/clean_corpus.txt and stats to task4/stats.md



