# 📦✂️ Literature Screening – Data Preparation  
Build balanced train and test sets, store them as CSV files and keep track of each abstract’s origin.  
Output folder structure created:  
```
outputs/  
└── datasets/  
    ├── train_dataset.csv  
    └── test_dataset.csv  
```

In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 1 – Imports, paths, reproducibility 🔧📂  ║
# ╚════════════════════════════════════════════════╝
import os, json, random, re, glob
from pathlib import Path
from typing import List, Dict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# One fixed seed for both the stdlib and NumPy random generators so that
# every run of the notebook draws the same random numbers.
SEED = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

# Input locations (relative to the notebook's working directory).
included_csv = Path("data", "SBIS_with_IDs.csv")
included_dir = Path("data", "abstracts")
excluded_root = Path("data")

# Output folder for the generated CSV files; created here if it is missing.
outputs_root = Path("outputs", "datasets")
outputs_root.mkdir(parents=True, exist_ok=True)

In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 2 – Load Included abstracts ✅📖          ║
# ╚════════════════════════════════════════════════╝
# Load the Included metadata table and attach each record's abstract text,
# read from "<included_dir>/<CDO_ID>.txt".  Rows whose file is missing,
# unreadable, or shorter than 100 characters (after stripping) are dropped.
df_inc = pd.read_csv(included_csv)

abstracts = []  # one entry per row of df_inc; NaN marks "no usable abstract"
errors = []     # "<CDO_ID>: <exception>" for files that exist but failed to load
for _, row in tqdm(df_inc.iterrows(),
                   total=len(df_inc),
                   desc="Loading Included abstracts"):
    cdo_id = str(row.get("CDO_ID", "")).strip()
    abs_path = included_dir / f"{cdo_id}.txt"
    text = np.nan
    if abs_path.exists():
        try:
            candidate = abs_path.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeError) as e:
            # Only I/O and decoding failures are expected here; any other
            # exception is a programming error and should surface.
            errors.append(f"{cdo_id}: {e}")
        else:
            if len(candidate) >= 100:  # skip too-short abstracts
                text = candidate
    abstracts.append(text)

# Build the column in one step; dtype=object keeps it a text column even
# when every entry is NaN.
df_inc["abstract"] = pd.Series(abstracts, index=df_inc.index, dtype="object")

df_inc = df_inc.dropna(subset=["abstract"]).reset_index(drop=True)
print(f"Included abstracts kept: {len(df_inc):,}")
if errors:
    print(f"Files that could not be read: {len(errors):,}")
    print("First read error:", errors[0])


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 3 – Build Excluded sample ❌📤           ║
# ╚════════════════════════════════════════════════╝
# Collect every *.txt file from the yearly "<year>_abstracts" folders
# (2016-2024), dropping very short files immediately.  The result is sorted
# because directory listing order is filesystem-dependent: the seeded sample
# below is only reproducible when the population order is identical each run.
MIN_EXC_BYTES = 100
year_dirs = [excluded_root / f"{yr}_abstracts" for yr in range(2016, 2025)]
all_exc_files = sorted(
    p
    for ydir in year_dirs
    for p in ydir.glob("*.txt")
    if p.stat().st_size >= MIN_EXC_BYTES
)

print(f"Excluded abstracts available (≥{MIN_EXC_BYTES} bytes): {len(all_exc_files):,}")

# Balance classes 1:1 with Included
N = len(df_inc)
if N > len(all_exc_files):
    # random.sample would raise a generic ValueError; explain the real cause.
    raise ValueError(
        f"Cannot balance classes 1:1: {N:,} Included abstracts but only "
        f"{len(all_exc_files):,} Excluded files available."
    )
sampled_files = random.sample(all_exc_files, N)

def build_exc_row(i, p: Path) -> Dict:
    """Return the dataset record for one Excluded abstract file.

    Args:
        i: Position of the file in the sample; zero-padded to five digits
            inside the generated id.
        p: Path of the UTF-8 text file holding the abstract.

    Returns:
        A dict with the keys ``id``, ``title``, ``abstract``, ``label`` and
        ``src_path`` (in that order).  ``abstract`` is the stripped file
        content and ``src_path`` is the string form of ``p``.
    """
    record: Dict = {}
    # Sample position plus file stem gives a clearer unique id.
    record["id"] = "EXC_" + format(i, "05d") + "_" + p.stem
    record["title"] = "Title in Abstract"
    record["abstract"] = p.read_text(encoding="utf-8").strip()
    record["label"] = "Excluded"
    record["src_path"] = str(p)
    return record

# One record per sampled file; the position in the sample becomes part of the id.
exc_records = [
    build_exc_row(position, path)
    for position, path in enumerate(sampled_files)
]
df_exc = pd.DataFrame(exc_records)
print(f"Sampled Excluded abstracts: {len(df_exc):,}")


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 4 – Combine, shuffle, train/test split 🔀 ║
# ╚════════════════════════════════════════════════╝
# Reshape the Included table to the same columns as the Excluded sample.
# src_path records the text file each abstract was read from
# ("<included_dir>/<CDO_ID>.txt"), so Included rows are traceable to their
# origin just like Excluded rows.
inc_ids = df_inc["CDO_ID"].astype(str)
df_inc_eval = pd.DataFrame({
    "id"       : inc_ids,
    "title"    : df_inc["Title"],
    "abstract" : df_inc["abstract"],
    "label"    : "Included",
    "src_path" : inc_ids.map(
        lambda cdo_id: str(included_dir / f"{cdo_id.strip()}.txt")
    ),
})

# Stack both classes and shuffle the rows with the fixed seed.
df_all = (pd.concat([df_inc_eval, df_exc], ignore_index=True)
            .sample(frac=1, random_state=SEED)
            .reset_index(drop=True))

# 80/20 split, stratified on the label so both sets keep the class balance.
train_df, test_df = train_test_split(
    df_all, test_size=0.20, stratify=df_all["label"], random_state=SEED
)

print(f"Train size: {len(train_df):,}, Test size: {len(test_df):,}")


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 5 – Save CSV snapshots 💾                ║
# ╚════════════════════════════════════════════════╝
# Destination files inside the output folder.
train_path, test_path = (
    outputs_root / file_name
    for file_name in ("train_dataset.csv", "test_dataset.csv")
)

# Write each split without the DataFrame index column.
for split_df, csv_path in ((train_df, train_path), (test_df, test_path)):
    split_df.to_csv(csv_path, index=False)

# Report the absolute location of every file written.
print("CSV files written:")
for csv_path in (train_path, test_path):
    print(" -", csv_path.resolve())


## ✔️ Data preparation finished  
The remaining notebooks can now read directly from  
`outputs/datasets/train_dataset.csv` and `outputs/datasets/test_dataset.csv`,  
removing the need to re-read every txt file each run.