In [None]:
import json
import os

# Directory whose *.json files are scanned by this cell (relative path, so it is
# resolved against the notebook's current working directory).
input_dir = "../../articles/" 
# JSONL file this cell writes: one {"text": ..., "label": ...} object per line.
output_file = "labeled_chunks_articles.jsonl"

# Keywords matched as *substrings* of the lower-cased section heading.
# A heading containing any of these is labeled 1 (relevant).
RELEVANT_SECTIONS = {
    "introduction", "methods", "results", "discussion", "conclusion", "background", "abstract"
}

# A heading containing any of these (and none of the relevant keywords) is labeled 0.
# The two "acknowledg(e)ment" stems cover British/American spelling and
# singular/plural ("Acknowledgements", "Acknowledgments", "Acknowledgement", ...).
IRRELEVANT_SECTIONS = {
    "references", "acknowledgement", "acknowledgment", "author contributions", "funding",
    "bacterial strains", "design of mutation mapping microarray",
    "microarray hybridization", "analysis of mapping array data",
    "bib entries", "ref entries", "back matter"
}

def label_section(section_name):
    """Map a section heading to a relevance label.

    Args:
        section_name: Section heading text. Matching is case-insensitive and
            ignores surrounding whitespace.

    Returns:
        1 if the heading contains any RELEVANT_SECTIONS keyword,
        0 if it contains any IRRELEVANT_SECTIONS keyword (and no relevant one),
        None if it matches neither set or is not a string (e.g. a JSON null).
    """
    # A missing/null section cannot be classified; report "unlabeled" rather
    # than raising AttributeError on .lower().
    if not isinstance(section_name, str):
        return None

    section = section_name.lower().strip()
    # Relevant keywords take precedence when a heading matches both sets.
    if any(key in section for key in RELEVANT_SECTIONS):
        return 1
    if any(key in section for key in IRRELEVANT_SECTIONS):
        return 0
    return None

def extract_labeled_chunks(filepath):
    """Yield labeled text chunks from one article JSON file.

    The file is read as a JSON object; its "abstract" and "body_text" values
    are each treated as a list of dicts carrying a "section" heading and a
    "text" body. Every entry whose heading gets a label from label_section()
    and whose text is non-empty after stripping is yielded.

    Unreadable or malformed files are reported with print() and produce no
    records; malformed individual entries are skipped so that one bad entry
    does not discard the rest of the file.

    Args:
        filepath: Path of the JSON file to read.

    Yields:
        dict: {"text": <stripped text>, "label": 0 or 1}
    """
    # Load the whole document first so the file handle is closed before any
    # record is yielded (the consumer may be slow or abandon the generator).
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        print(f"Skipped (invalid JSON): {filepath}")
        return
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return

    if not isinstance(data, dict):
        print(f"Error reading {filepath}: top-level JSON value is not an object")
        return

    # Abstract entries come first, then body text.
    for key in ("abstract", "body_text"):
        entries = data.get(key)
        # Missing, null, or non-list values contribute nothing.
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            text = entry.get("text")
            section_name = entry.get("section", "")
            # Null/non-string fields cannot be labeled or emitted.
            if not isinstance(text, str) or not isinstance(section_name, str):
                continue
            text = text.strip()
            if not text:
                continue
            label = label_section(section_name)
            if label is not None:
                yield {
                    "text": text,
                    "label": label
                }

# Process all article files and stream their labeled chunks into output_file.
# os.listdir() returns names in arbitrary order, so iterate in sorted order to
# make the row order of the JSONL file reproducible across machines and runs.
count = 0
with open(output_file, "w", encoding="utf-8") as out_f:
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".json"):
            continue
        path = os.path.join(input_dir, filename)
        # One JSON object per line (JSONL).
        for record in extract_labeled_chunks(path):
            out_f.write(json.dumps(record) + "\n")
            count += 1

print(f"Finished writing {count} labeled chunks to {output_file}")


In [None]:
import re

# NOTE: these rebind the same global names used by the articles cell (L5-6 of
# this export); re-run that cell's config before re-running its loop.
# Directory whose *.txt files are scanned by this cell.
input_dir = "../../text_books/"
# JSONL file this cell writes: one {"text": ..., "label": ...} object per line.
output_file = "labeled_chunks_text_books.jsonl"

def label_paragraph(text):
    """Heuristically label one paragraph as relevant (1) or not (0).

    Rules, applied in order on the lower-cased text:
      1. If any boilerplate marker occurs *anywhere* in the paragraph -> 0.
      2. If the paragraph starts with "chapter" followed by whitespace and a
         word character -> 1.
      3. Otherwise 1 when the paragraph is longer than 150 characters, else 0.
    """
    lowered = text.lower()

    boilerplate_markers = (
        "table of contents",
        "preface",
        "project gutenberg",
        "produced by",
        "transcribed by",
    )
    for marker in boilerplate_markers:
        if marker in lowered:
            return 0

    is_chapter_heading = re.match(r'^chapter\s+\w+', lowered) is not None
    is_long = len(text) > 150
    return 1 if (is_chapter_heading or is_long) else 0

def process_txt_file(input_path, min_chars=30):
    """Split a plain-text file into paragraphs and label each one.

    Paragraphs are separated by one or more blank (whitespace-only) lines.
    Each paragraph is stripped, and paragraphs of `min_chars` characters or
    fewer are dropped before labeling.

    Args:
        input_path: Path of the text file to read.
        min_chars: A paragraph is kept only if its stripped length is strictly
            greater than this value (default 30, the original threshold).

    Returns:
        list[dict]: One {"text": <paragraph>, "label": 0 or 1} per kept
        paragraph, in file order.
    """
    # "utf-8-sig" decodes ordinary UTF-8 exactly like "utf-8" but also drops a
    # leading byte-order mark, which would otherwise end up as a stray U+FEFF
    # at the start of the first paragraph's text.
    with open(input_path, "r", encoding="utf-8-sig") as f:
        content = f.read()

    labeled_data = []
    for raw_block in re.split(r'\n\s*\n', content):
        para = raw_block.strip()
        if len(para) > min_chars:
            labeled_data.append({"text": para, "label": label_paragraph(para)})
    return labeled_data


# Label every .txt file in input_dir and stream the chunks into output_file.
# os.listdir() returns names in arbitrary order, so iterate in sorted order to
# make the row order of the JSONL file reproducible across machines and runs.
total = 0
with open(output_file, "w", encoding="utf-8") as out:
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".txt"):
            continue
        input_path = os.path.join(input_dir, filename)
        print(f"Processing {filename}...")
        labeled_chunks = process_txt_file(input_path)
        # One JSON object per line (JSONL).
        for item in labeled_chunks:
            out.write(json.dumps(item) + "\n")
            total += 1

print(f"Saved {total} labeled chunks to {output_file}")


Processing pg118.txt...
Processing pg22747.txt...
Processing pg26697.txt...
Processing pg38046.txt...
Processing pg42241.txt...
Processing pg46424.txt...
Processing pg46448.txt...
Processing pg51676.txt...
Processing pg56018.txt...
Processing pg70367.txt...
✅ Saved 18723 labeled chunks to labeled_chunks_text_books.jsonl


In [None]:

# Load both labeled JSONL files and combine them into a single in-memory list
def load_jsonl(filepath):
    """Read a JSON Lines file into a list.

    Args:
        filepath: Path of a UTF-8 text file with one JSON value per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry no
        # record; skip them instead of failing in json.loads("").
        return [json.loads(line) for line in f if line.strip()]

# Read back the two labeled sets written by the earlier cells.
data_books = None  # placeholder so both names are introduced together
data_articles = load_jsonl("labeled_chunks_articles.jsonl")
data_books = load_jsonl("labeled_chunks_text_books.jsonl")

# Articles first, then text books, in a new list (inputs are left untouched).
all_data = [*data_articles, *data_books]
print(f"Total samples: {len(all_data)}")
