# 从 CSV 第三列删除指定术语

从各癌症类型 CSV 文件的第三列（cleaned_text）中删除病名相关术语。

In [1]:
import csv
import re

# Directory holding the per-cancer-type CSV files. File names are appended by
# plain string concatenation, so the trailing slash is required.
BASE_PATH = "/Users/kezhuli/Desktop/删除病名和同义词/"

# Maps each CSV file name to the terms that are removed only from that file.
FILE_TERMS = {
    "Colon_Cancer.csv": ["colon", "colonic", "colorectal", "bowel", "rectum", "rectal", "polyps"],
    "Liver_Cancer.csv": ["liver", "hepatic", "hepatocellular", "hcc", "cirrhosis", "bile"],
    "Lung_Cancer.csv": ["lung", "pulmonary", "bronchial", "thoracic", "pleura"],
    "Stomach_Cancer.csv": ["stomach", "gastric", "gastro", "gastrointestinal"],
    "Thyroid_Cancer.csv": ["thyroid", "nodule", "papillary", "follicular"],
}

# Terms removed from every file, in addition to that file's own terms.
COMMON_TERMS = [
    "cancer", "cancers", "carcinoma", "adenocarcinoma",
    "tumor", "tumour", "malignant", "malignancy"
]


def remove_terms_from_text(text: str, terms: list) -> str:
    """Delete every whole-word occurrence of each term from ``text``.

    Matching is case-insensitive and anchored on word boundaries, so a term
    is not removed when it is only part of a longer word. Terms are applied
    one after another in the order given. Afterwards every run of whitespace
    is collapsed to a single space and the result is stripped at both ends.

    Args:
        text: The string to clean.
        terms: The words to remove; each is treated literally, not as a regex.

    Returns:
        The cleaned string.
    """
    boundary = r"\b"
    cleaned = text
    for word in terms:
        # Escape the term so any regex metacharacters in it match literally.
        whole_word = re.compile(boundary + re.escape(word) + boundary, re.IGNORECASE)
        cleaned = whole_word.sub("", cleaned)

    # Removals leave gaps behind; squeeze them and trim the edges.
    squeezed = re.sub(r"\s+", " ", cleaned)
    return squeezed.strip()

In [2]:
# Strip the disease-name terms from the third column of every CSV, in place.
for filename, file_specific_terms in FILE_TERMS.items():
    # Each file loses its own terms plus the terms shared by all files.
    terms_to_remove = file_specific_terms + COMMON_TERMS
    filepath = BASE_PATH + filename

    # newline="" hands line-ending handling to the csv module, so newlines
    # embedded in quoted fields are parsed correctly.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        rows = list(csv.reader(f))

    if not rows:
        # An empty file has no header row to preserve; leave it untouched.
        print(f"Skipped {filename}: file is empty")
        continue

    header, *data_rows = rows

    for row in data_rows:
        # Rows with fewer than three columns have no third column to clean.
        if len(row) >= 3:
            row[2] = remove_terms_from_text(row[2], terms_to_remove)

    # Overwrite the original file with the cleaned rows.
    with open(filepath, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(data_rows)

    print(f"Processed {filename}")

Processed Colon_Cancer.csv
Processed Liver_Cancer.csv
Processed Lung_Cancer.csv
Processed Stomach_Cancer.csv
Processed Thyroid_Cancer.csv


In [4]:
# Report the word count of each row's third column in the five files,
# together with the per-file minimum.
for filename in FILE_TERMS:
    filepath = BASE_PATH + filename
    # newline="" hands line-ending handling to the csv module, so newlines
    # embedded in quoted fields are parsed correctly.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        rows = list(csv.reader(f))
    # Skip the header row; slicing is safe even when the file is empty.
    data_rows = rows[1:]
    # Rows lacking a third column are counted as zero words.
    counts = [len(row[2].split()) if len(row) >= 3 else 0 for row in data_rows]
    print(f"{filename}:")
    print(f"  每行词数: {counts}")
    # default=0 keeps a header-only or empty file from raising ValueError.
    print(f"  最小值: {min(counts, default=0)}")
    print()

Colon_Cancer.csv:
  每行词数: [95, 96, 96, 96, 96, 95, 93, 93, 96, 96, 96, 96, 96, 96, 96, 96, 96, 92, 96, 94, 94, 96, 93, 94, 95, 96, 92, 96, 95, 94, 94, 93, 95, 93, 93, 93, 96, 93, 96, 90, 94, 96, 96, 94, 96, 96, 96, 96, 96, 94, 95, 96, 96, 96, 95, 96, 96, 96, 94, 95, 96, 96, 96, 96, 96, 95, 95, 94, 96, 95, 94, 95, 94, 95, 95, 95, 95, 95, 95, 93, 96, 95, 94, 96, 95, 95, 96, 96, 96, 94, 93, 96, 95, 94, 96, 94, 96, 96, 95, 96, 96, 95, 94, 88, 95, 94, 96, 96, 96, 96, 95, 96, 96, 96, 96, 88, 96, 90, 96, 95, 96, 94, 92, 96, 95, 96, 96, 96, 96, 96, 96, 96, 94, 96, 96, 95, 96, 96, 93, 96, 94, 94, 90, 93, 95, 94, 94, 95, 95, 95, 96, 94, 96, 95, 94, 96, 96, 94, 96, 96, 93, 93, 95, 96, 94, 96, 96, 95, 95, 94, 96, 96, 96, 96, 95, 96, 96, 95, 96, 96, 94, 96, 93, 93, 95, 96, 93, 94, 96, 96, 95, 96, 92, 94, 89, 94, 94, 96, 96, 96]
  最小值: 88

Liver_Cancer.csv:
  每行词数: [90, 96, 92, 88, 96, 87, 89, 92, 96, 96, 87, 94, 96, 95, 86, 95, 83, 88, 92, 96, 94, 96, 86, 96, 92, 93, 91, 94, 90, 89, 96, 96, 96, 87,

In [5]:
# Truncate the third column of every row in the five files to at most
# TARGET_WORD_COUNT whitespace-separated words, rewriting each file in place.
TARGET_WORD_COUNT = 79

for filename in FILE_TERMS:
    filepath = BASE_PATH + filename
    # newline="" hands line-ending handling to the csv module, so newlines
    # embedded in quoted fields are parsed correctly.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        rows = list(csv.reader(f))

    if not rows:
        # An empty file has no header row to preserve; leave it untouched.
        print(f"{filename}: 文件为空，已跳过")
        continue

    header, *data_rows = rows

    for row in data_rows:
        # Rows with fewer than three columns have no third column to truncate.
        if len(row) >= 3:
            words = row[2].split()
            # Re-joining with single spaces also normalizes the whitespace.
            row[2] = " ".join(words[:TARGET_WORD_COUNT])

    with open(filepath, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(data_rows)

    print(f"{filename}: 已截断为 {TARGET_WORD_COUNT} 词")

Colon_Cancer.csv: 已截断为 79 词
Liver_Cancer.csv: 已截断为 79 词
Lung_Cancer.csv: 已截断为 79 词
Stomach_Cancer.csv: 已截断为 79 词
Thyroid_Cancer.csv: 已截断为 79 词
