# Move all .txt files to a common folder

In [5]:
import os
import shutil

base_dir = r"A:/Projects/CXR Reports"       # Folder where the search starts
destination_dir = r"A:/Projects/CXR Reports/Reports"  # Folder where .txt files will be copied

# iterate through all files in base directory
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.lower().endswith(".txt") and not file.startswith("."):
            src_path = os.path.join(root, file)
            dest_path = os.path.join(destination_dir, file)
            shutil.copy2(src_path, dest_path)  # copy2 preserves metadata

total_files = len([f for f in os.listdir(destination_dir) if f.lower().endswith(".txt")])
print(f"Done! All {total_files} .txt files copied.")

SameFileError: 'A:/Projects/CXR Reports\\Reports\\s50000014.txt' and 'A:/Projects/CXR Reports/Reports\\s50000014.txt' are the same file

# Process all .txt files to strip extraneous information leaving just impressions

In [18]:
import os
import re

base_dir = r"A:/Projects/CXR Reports/Reports"
output_dir = r"A:/Projects/CXR Reports/Reports_Stripped"
max_files = 1000

os.makedirs(output_dir, exist_ok=True)

# Keep these 
KEEP_HEADERS = {"FINDINGS", "IMPRESSION", "IMPRESSIONS"}  # normalize later

# Remove these (unwanted)
UNWANTED_HEADERS = [
    "INDICATION FOR EXAM",
    "CLINICAL HISTORY",
    "FINAL REPORT",
    "HISTORY",
    "TECHNIQUE",
    "COMPARISON",
    "INDICATION",
    "EXAM",
    "EXAMINATION",
    "REASON FOR EXAM",
    "REASON FOR EXAMINATION",
    "WET READ",
    "NOTIFICAION"
]

# Build a robust alternation that favors longer matches first
UNWANTED_HEADERS.sort(key=len, reverse=True)
UNWANTED_ALT = "|".join(re.escape(h) for h in UNWANTED_HEADERS)

# Any header: ALL-CAPS-ish with allowed punctuation, *must* be followed by a colon
ANY_HEADER_RE = re.compile(
    r'^\s*[A-Z][A-Z0-9\s/()&\-\.\+]{2,}\s*:',
    flags=re.IGNORECASE | re.MULTILINE
)

REMOVE_RE = re.compile(
    rf'^\s*(?:{UNWANTED_ALT})\s*:\s*.*?'
    r'(?=^\s*[A-Z][A-Z0-9\s/()&\-\.\+]{2,}\s*:|\Z)',
    flags=re.IGNORECASE | re.DOTALL | re.MULTILINE
)

def normalize(text: str) -> str:
    # Normalize line endings; leave only \n internally
    return text.replace('\r\n', '\n').replace('\r', '\n')

def drop_preamble_before_first_header(text: str) -> str:
    m = ANY_HEADER_RE.search(text)
    if not m:
        return text  # no headers detected; return as-is
    return text[m.start():]  # drop everything before first header

def collapse_blank_lines(text: str) -> str:
    lines = [ln.rstrip() for ln in text.split('\n')]
    out = []
    blank = False
    for ln in lines:
        if ln.strip() == "":
            if not blank:
                out.append("")
            blank = True
        else:
            out.append(ln)
            blank = False
    return "\n".join(out).strip()

def strip_unwanted_sections(text: str) -> str:
    text = normalize(text)
    text = drop_preamble_before_first_header(text)
    text = REMOVE_RE.sub('', text)
    text = collapse_blank_lines(text)
    return text

processed = kept_any = skipped = 0
limit_hit = False

for root, _, files in os.walk(base_dir):
    for fname in files:
        if not fname.lower().endswith(".txt"):
            continue
        processed += 1
        if processed > max_files:
            limit_hit = True
            break

        src_path = os.path.join(root, fname)
        try:
            with open(src_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except Exception as e:
            print(f"Could not read {src_path}: {e}")
            continue

        cleaned = strip_unwanted_sections(text)

        if cleaned:
            kept_any += 1
            final_text = re.sub(r'[\r\n]+', ' ', cleaned)  # replace any run of \r or \n with a space
            final_text = re.sub(r'\s{2,}', ' ', final_text).strip()  # collapse double spaces
            final_text = re.sub(r'\b(?:IMPRESSION|FINAL REPORT|FINDINGS)\b\s*:?\s*','', final_text, flags=re.IGNORECASE).strip()
            out_path = os.path.join(output_dir, fname)
            try:
                with open(out_path, "w", encoding="utf-8", newline="\r\n") as f:
                    f.write(final_text)
            except Exception as e:
                print(f"Could not write {out_path}: {e}")
        else:
            skipped += 1

print(f"Processed .txt files: {processed}")
print(f"Wrote stripped files: {kept_any}")
print(f"Skipped (all stripped out): {skipped}")
print(f"Output folder: {output_dir}")


Processed .txt files: 1001
Wrote stripped files: 999
Skipped (all stripped out): 1
Output folder: A:/Projects/CXR Reports/Reports_Stripped
