# 切换路径

In [5]:
# Mount Google Drive at /content/drive; the project directory used by the
# next cell lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import os

# Project folder on the mounted Drive. Relative paths used by later cells
# (datasets/..., Model/...) are resolved against the working directory set here.
project_directory = '/content/drive/MyDrive/CSC8639/'
current_directory = os.getcwd()

# os.getcwd() normally returns a path without a trailing separator while
# project_directory ends with one, so a plain string comparison never matches.
# Normalise both sides before comparing.
if os.path.normpath(current_directory) == os.path.normpath(project_directory):
    print("The current working directory is already the specified directory, no change needed.")
else:
    os.chdir(project_directory)
    print(f"The current working directory has been changed to: {project_directory}")

The current working directory has been changed to: /content/drive/MyDrive/CSC8639/


# 1.数据准备

## dcm文件转为jpg文件

In [None]:
import numpy as np
import cv2
import os
import pydicom
from pydicom.pixel_data_handlers.util import apply_modality_lut, apply_voi_lut

ModuleNotFoundError: No module named 'pydicom'

In [None]:
def resize_and_save(load_path, save_path):  # load_path=/path/to/load/*.dicom, save_path=/path/to/save/*.jpg
    """Convert one DICOM file to an 8-bit image whose shorter side is 512 px.

    The pixel data is passed through the modality LUT and the VOI LUT,
    inverted when the photometric interpretation is MONOCHROME1, resized with
    Lanczos interpolation, min-max scaled to 0..255 and written with
    ``cv2.imwrite``.

    Args:
        load_path: Path of the DICOM file to read.
        save_path: Path of the image file to write.

    Raises:
        ValueError: If the pixel array is not two-dimensional (the
            ``h, w = img.shape`` unpacking fails).
    """
    ds = pydicom.dcmread(load_path, force=True)
    img = ds.pixel_array
    img = apply_modality_lut(img, ds)  # rescaleSlope & intercept
    img = apply_voi_lut(img, ds)  # windowing
    if hasattr(ds, "PhotometricInterpretation"):
        if ds.PhotometricInterpretation.lower().strip() == "monochrome1":
            img = img.max() - img  # invert

    h, w = img.shape
    ratio = 512 / min(h, w)
    target_size = (int(w * ratio), int(h * ratio))
    # The third positional parameter of cv2.resize is `dst`, not the
    # interpolation mode, so the flag has to be passed by keyword to be used.
    img = cv2.resize(img, target_size, interpolation=cv2.INTER_LANCZOS4)

    # normalize
    lowest = img.min()
    value_range = img.max() - lowest
    if value_range == 0:
        # A constant image would divide by zero; write an all-black image instead.
        img = np.zeros(img.shape, dtype=np.uint8)
    else:
        img = (img - lowest) / value_range * np.iinfo(np.uint8).max
        img = img.astype(np.uint8)
    cv2.imwrite(save_path, img)


def process_images_in_directory(load_path, save_path):
    """Convert every ``.dcm`` file directly inside ``load_path`` to a ``.jpg``.

    ``save_path`` is created if it does not exist. Each output file keeps the
    input file's name with the ``.dcm`` suffix replaced by ``.jpg``. Entries
    that do not end in ``.dcm`` are ignored; sub-directories are not walked.

    Args:
        load_path: Directory containing the DICOM files.
        save_path: Directory that receives the converted images.
    """
    # Create the destination first; an already existing folder is fine.
    os.makedirs(save_path, exist_ok=True)

    for entry in os.listdir(load_path):
        if not entry.endswith('.dcm'):
            continue

        # Drop the 4-character ".dcm" suffix and use ".jpg" instead.
        jpg_name = os.path.basename(entry)[:-4] + '.jpg'
        source = os.path.join(load_path, entry)
        target = os.path.join(save_path, jpg_name)

        resize_and_save(source, target)

In [None]:
# Convert the DICOM files in the stage-2 training image folder to JPEG files
# in a sibling folder (paths are relative to the working directory).
load_path = 'datasets/rsna/stage_2_train_images_mini'
save_path = 'datasets/rsna/stage_2_train_images_jpg_mini'

process_images_in_directory(load_path, save_path)

In [None]:
# Same conversion for the stage-2 test image folder.
# Note: this rebinds the notebook-level load_path / save_path variables.
load_path = 'datasets/rsna/stage_2_test_images_mini'
save_path = 'datasets/rsna/stage_2_test_images_jpg_mini'

process_images_in_directory(load_path, save_path)

## 划分pneumonia数据集

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import tqdm
import argparse

sys.path.append(os.getcwd())
from gloria.constants import *
from sklearn.model_selection import train_test_split


def preprocess_pneumonia_data(test_fac=0.15):
    """Build train/valid/test CSV files for the RSNA Pneumonia dataset.

    Reads ``PNEUMONIA_ORIGINAL_TRAIN_CSV``, collapses the per-box rows into one
    row per ``patientId`` (with a list of ``[x1, y1, x2, y2]`` boxes, or
    ``None`` when the patient has no box), derives a binary ``Target`` and a
    ``Path`` column, splits the rows and writes the three splits to
    ``PNEUMONIA_TRAIN_CSV``, ``PNEUMONIA_VALID_CSV`` and ``PNEUMONIA_TEST_CSV``.

    Args:
        test_fac: Fraction of patients assigned to the test split; the same
            fraction is assigned to the validation split.

    Raises:
        Exception: If the original training CSV cannot be read.
    """
    try:
        df = pd.read_csv(PNEUMONIA_ORIGINAL_TRAIN_CSV)
    except Exception as err:
        # The message is an f-string so the path that failed is actually shown
        # (the previous text printed a literal "{...}" placeholder).
        raise Exception(
            "Please make sure the RSNA Pneumonia dataset is available; "
            f"could not read {PNEUMONIA_ORIGINAL_TRAIN_CSV}"
        ) from err

    # create bounding boxes
    def create_bbox(row):
        # Rows with Target == 0 carry no box; 0 is used as a placeholder.
        if row["Target"] == 0:
            return 0
        x1 = row["x"]
        y1 = row["y"]
        x2 = x1 + row["width"]
        y2 = y1 + row["height"]
        return [x1, y1, x2, y2]

    df["bbox"] = df.apply(create_bbox, axis=1)

    # aggregate multiple boxes
    df = df[["patientId", "bbox"]]
    df = df.groupby("patientId").agg(list)
    df = df.reset_index()
    # A patient whose only entry is the placeholder has no boxes at all.
    df["bbox"] = df["bbox"].apply(lambda x: None if x == [0] else x)

    # create labels
    df["Target"] = df["bbox"].apply(lambda x: 0 if x is None else 1)

    # path of the DICOM file that belongs to each patient
    df["Path"] = df["patientId"].apply(lambda x: PNEUMONIA_IMG_DIR / (x + ".dcm"))

    # split data: hold out 2 * test_fac, then halve it into test and valid
    train_df, test_val_df = train_test_split(df, test_size=test_fac * 2, random_state=0)
    test_df, valid_df = train_test_split(test_val_df, test_size=0.5, random_state=0)

    print(f"Number of train samples: {len(train_df)}")
    print(train_df["Target"].value_counts())
    print(f"Number of valid samples: {len(valid_df)}")
    print(valid_df["Target"].value_counts())
    print(f"Number of test samples: {len(test_df)}")
    print(test_df["Target"].value_counts())

    train_df.to_csv(PNEUMONIA_TRAIN_CSV)
    valid_df.to_csv(PNEUMONIA_VALID_CSV)
    test_df.to_csv(PNEUMONIA_TEST_CSV)


def preprocess_pneumothorax_data(test_fac=0.15):
    """Build train/valid/test CSV files for the SIIM Pneumothorax dataset.

    Reads ``PNEUMOTHORAX_ORIGINAL_TRAIN_CSV``, locates every ``.dcm`` file
    under ``PNEUMOTHORAX_IMG_DIR``, adds ``Label`` and ``Path`` columns, splits
    the rows and writes the three splits to ``PNEUMOTHORAX_TRAIN_CSV``,
    ``PNEUMOTHORAX_VALID_CSV`` and ``PNEUMOTHORAX_TEST_CSV``.

    Args:
        test_fac: Fraction of rows assigned to the test split; the same
            fraction is assigned to the validation split.

    Raises:
        Exception: If the original training CSV cannot be read.
        KeyError: If an ``ImageId`` has no matching ``.dcm`` file on disk.
    """
    try:
        df = pd.read_csv(PNEUMOTHORAX_ORIGINAL_TRAIN_CSV)
    except Exception as err:
        # The message is an f-string so the path that failed is actually shown
        # (the previous text printed a literal "{...}" placeholder).
        raise Exception(
            "Please make sure the SIIM Pneumothorax dataset is available; "
            f"could not read {PNEUMOTHORAX_ORIGINAL_TRAIN_CSV}"
        ) from err

    # get image paths
    img_paths = {}
    for subdir, _, files in tqdm.tqdm(os.walk(PNEUMOTHORAX_IMG_DIR)):
        for f in files:
            # Only real ".dcm" files: a substring test would also accept names
            # that merely contain "dcm" and then strip the wrong 4 characters.
            if f.endswith(".dcm"):
                # remove dcm
                file_id = f[:-4]
                img_paths[file_id] = os.path.join(subdir, f)

    # no encoded pixels mean healthy
    # (the column name and the " -1" marker both start with a space)
    df["Label"] = df.apply(
        lambda x: 0.0 if x[" EncodedPixels"] == " -1" else 1.0, axis=1
    )
    df["Path"] = df["ImageId"].apply(lambda x: img_paths[x])

    # split data: hold out 2 * test_fac, then halve it into test and valid
    train_df, test_val_df = train_test_split(df, test_size=test_fac * 2, random_state=0)
    test_df, valid_df = train_test_split(test_val_df, test_size=0.5, random_state=0)

    print(f"Number of train samples: {len(train_df)}")
    print(train_df["Label"].value_counts())
    print(f"Number of valid samples: {len(valid_df)}")
    print(valid_df["Label"].value_counts())
    print(f"Number of test samples: {len(test_df)}")
    print(test_df["Label"].value_counts())

    train_df.to_csv(PNEUMOTHORAX_TRAIN_CSV)
    valid_df.to_csv(PNEUMOTHORAX_VALID_CSV)
    test_df.to_csv(PNEUMOTHORAX_TEST_CSV)


def preprocess_chexpert_5x200_data(random_state=0):
    """Sample 200 frontal studies per competition task with exactly one positive label.

    For the i-th entry of ``CHEXPERT_COMPETITION_TASKS`` the rows kept are
    those whose label columns match a one-hot vector with a 1 in slot ``i``
    (missing labels are treated as 0). 200 such rows are sampled per task and
    the report impression from ``CHEXPERT_MASTER_CSV`` is joined on ``Path``.

    Args:
        random_state: Seed passed to ``DataFrame.sample`` so the subset is
            reproducible between runs, matching the fixed seed the other
            dataset splits in this file use. Pass ``None`` for an unseeded draw.

    Returns:
        DataFrame with the sampled rows and a ``Report Impression`` column.

    Raises:
        ValueError: If fewer than 200 rows match for some task.
    """
    df = pd.read_csv(CHEXPERT_ORIGINAL_TRAIN_CSV)
    df = df.fillna(0)
    df = df[df["Frontal/Lateral"] == "Frontal"]

    df_master = pd.read_csv(CHEXPERT_MASTER_CSV)
    df_master = df_master[["Path", "Report Impression"]]

    task_dfs = []
    for i in range(len(CHEXPERT_COMPETITION_TASKS)):
        # One-hot target vector; slot 6 is never compared against any column.
        index = np.zeros(14)
        index[i] = 1
        df_task = df[
            (df["Atelectasis"] == index[0])
            & (df["Cardiomegaly"] == index[1])
            & (df["Consolidation"] == index[2])
            & (df["Edema"] == index[3])
            & (df["Pleural Effusion"] == index[4])
            & (df["Enlarged Cardiomediastinum"] == index[5])
            & (df["Lung Lesion"] == index[7])
            & (df["Lung Opacity"] == index[8])
            & (df["Pneumonia"] == index[9])
            & (df["Pneumothorax"] == index[10])
            & (df["Pleural Other"] == index[11])
            & (df["Fracture"] == index[12])
            & (df["Support Devices"] == index[13])
        ]
        df_task = df_task.sample(n=200, random_state=random_state)
        task_dfs.append(df_task)
    df_200 = pd.concat(task_dfs)

    # get reports
    df_200 = pd.merge(df_200, df_master, how="left", left_on="Path", right_on="Path")

    return df_200


def preprocess_chexpert_data():
    """Build the CheXpert train/valid CSV files and the 5x200 evaluation subset.

    Rows selected for the 5x200 subset are removed first, then
    ``CHEXPERT_VALID_NUM`` of the remaining rows are drawn at random for
    validation and everything else becomes the training set. The three frames
    are written to ``CHEXPERT_TRAIN_CSV``, ``CHEXPERT_VALID_CSV`` and
    ``CHEXPERT_5x200``.

    Raises:
        Exception: If the original training CSV cannot be read.
    """
    try:
        df = pd.read_csv(CHEXPERT_ORIGINAL_TRAIN_CSV)
    except Exception as err:
        # The message names the right dataset and is an f-string so the path
        # that failed is actually shown.
        raise Exception(
            "Please make sure the CheXpert dataset is available; "
            f"could not read {CHEXPERT_ORIGINAL_TRAIN_CSV}"
        ) from err

    df_200 = preprocess_chexpert_5x200_data()
    df = df[~df[CHEXPERT_PATH_COL].isin(df_200[CHEXPERT_PATH_COL])]

    # valid_ids are row *positions* in the filtered frame, so they are only
    # valid with iloc.
    valid_ids = np.random.choice(len(df), size=CHEXPERT_VALID_NUM, replace=False)
    valid_df = df.iloc[valid_ids]
    # Drop by the labels of the rows that were actually selected. Dropping the
    # positions as labels would remove different rows, because the index has
    # gaps after the filter above, and train/valid would overlap.
    train_df = df.drop(index=valid_df.index)

    print(f"Number of train samples: {len(train_df)}")
    print(f"Number of valid samples: {len(valid_df)}")
    print(f"Number of chexpert5x200 samples: {len(df_200)}")

    train_df.to_csv(CHEXPERT_TRAIN_CSV)
    valid_df.to_csv(CHEXPERT_VALID_CSV)
    df_200.to_csv(CHEXPERT_5x200)


# Registry mapping a dataset name to the function that preprocesses it.
_DATASETS = {
    "chexpert": preprocess_chexpert_data,
    "pneumonia": preprocess_pneumonia_data,
    "pneumothorax": preprocess_pneumothorax_data,
}


def available_datasets():
    """Return the registered dataset names as a list, in registration order."""
    return [dataset_name for dataset_name in _DATASETS.keys()]

In [None]:
# Command-line style entry point: choose the dataset to preprocess with -d/--dataset.
# NOTE(review): parse_args() reads sys.argv; inside a notebook kernel that is the
# kernel's own argument list, so this cell presumably only works when the code is
# run as a script. Confirm how it is meant to be invoked.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-d",
    "--dataset",
    type=str,
    help="dataset type, one of [chexpert, pneumonia, pneumothorax]",
    required=True,
)
args = parser.parse_args()

dataset_name = args.dataset.lower()
if dataset_name in _DATASETS:
    _DATASETS[dataset_name]()
else:
    # The error was previously constructed but never raised, so an unknown
    # dataset name was silently ignored.
    raise RuntimeError(
        f"Dataset {args.dataset} not found; available datasets = {available_datasets()}"
    )

## 划分mimic-cxr-jpg数据集

In [None]:
import pandas as pd
import os
import re


# Build the JPEG path of one image from its metadata row.
def generate_image_path(row):
    """Return ``<jpg_root_dir>/pXX/pXXXXXXXX/s<study_id>/<dicom_id>.jpg`` for ``row``.

    The subject id is zero-padded to 8 digits; its first two digits name the
    top-level folder. ``jpg_root_dir`` is read from the notebook's globals.

    Args:
        row: Mapping with ``subject_id``, ``study_id`` and ``dicom_id``.
    """
    subject_id, study_id, dicom_id = (
        row[key] for key in ('subject_id', 'study_id', 'dicom_id')
    )

    padded_subject = str(subject_id).zfill(8)
    path_parts = (
        "p" + padded_subject[:2],
        "p" + padded_subject,
        "s" + str(study_id),
        dicom_id + ".jpg",
    )
    return os.path.join(jpg_root_dir, *path_parts)


def extract_findings_and_impression(row):
    """Read a study's report and return its FINDINGS and IMPRESSION sections.

    The report is read from
    ``<text_root_dir>/pXX/pXXXXXXXX/s<study_id>.txt``, where ``text_root_dir``
    comes from the notebook's globals and the subject id is zero-padded to 8
    digits. Newlines inside each section are replaced by spaces.

    Args:
        row: Mapping with ``subject_id`` and ``study_id``.

    Returns:
        ``[findings_text, impression_text]``, or ``None`` when the report does
        not contain a ``FINDINGS:`` section followed by an ``IMPRESSION:``
        section.

    Raises:
        FileNotFoundError: If the report file does not exist.
    """
    subject_id = row['subject_id']
    study_id = row['study_id']
    subject_folder = "p" + str(subject_id).zfill(8)[:2]
    subject_subfolder = "p" + str(subject_id).zfill(8)
    study_folder = "s" + str(study_id)
    text_path = os.path.join(text_root_dir, subject_folder, subject_subfolder, study_folder + ".txt")

    # Explicit encoding so the result does not depend on the platform default.
    with open(text_path, 'r', encoding='utf-8') as file:
        report = file.read()

    # FINDINGS runs (non-greedily) up to the next IMPRESSION header;
    # IMPRESSION runs to the end of the report.
    findings_match = re.search(r'FINDINGS:\s*(.*?)\s*IMPRESSION:', report, re.DOTALL)
    impression_match = re.search(r'IMPRESSION:\s*(.*)', report, re.DOTALL)

    if findings_match is None or impression_match is None:
        return None

    findings_text = findings_match.group(1).replace('\n', ' ').strip()
    impression_text = impression_match.group(1).replace('\n', ' ').strip()
    return [findings_text, impression_text]

# Merge a group's values, dropping missing values and duplicates.
def merge_unique(values):
    """Return the distinct non-missing entries of ``values`` in first-seen order.

    Keeping the first-seen order (instead of going through ``set``) makes the
    result identical from run to run; a set of strings has no stable order
    across interpreter sessions.

    Args:
        values: Iterable of hashable values; entries for which ``pd.notna`` is
            false are skipped.

    Returns:
        List of unique values, or ``None`` when nothing is left.
    """
    present = [value for value in values if pd.notna(value)]
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(present)) if present else None

In [None]:
# Root directories of the JPEG images and of the free-text reports.
# NOTE(review): these are absolute paths on a different machine than the
# Colab/Drive working directory set at the top of the notebook — confirm.
jpg_root_dir = "/home/jovyan/work/CSC8639/mimic-cxr-jpg-v2.1.0/Data/files"
text_root_dir = "/home/jovyan/work/CSC8639/mimic-cxr/2.0.0/files"

# Read the metadata CSV file.
metadata_path = "datasets/mimic-cxr-jpg/mimic-cxr-2.0.0-metadata.csv"  # replace with your own file path
metadata = pd.read_csv(metadata_path)

# Add the image path, one column per view holding the path only for rows of
# that view, the view name restricted to AP/PA/LATERAL, and the report text.
metadata['image'] = metadata.apply(generate_image_path, axis=1)
metadata['AP'] = metadata['image'].where(metadata['ViewPosition'] == 'AP', None)
metadata['PA'] = metadata['image'].where(metadata['ViewPosition'] == 'PA', None)
metadata['Lateral'] = metadata['image'].where(metadata['ViewPosition'] == 'LATERAL', None)
metadata['view'] = metadata.apply(lambda metadata: metadata['ViewPosition'] if metadata['ViewPosition'] in ["AP","PA","LATERAL"] else None, axis=1)
metadata['text'] = metadata.apply(extract_findings_and_impression, axis=1)

# Save to a new CSV file (the DataFrame index is written as well).
new_metadata_path = "datasets/mimic-cxr-jpg/mimic-cxr-2.0.0-metadata-with-images.csv"  # replace with the path you want to save to
metadata.to_csv(new_metadata_path, index=True)

In [None]:
# Read the per-image metadata file written by the previous cell.
file_path = "datasets/mimic-cxr-jpg/mimic-cxr-2.0.0-metadata-with-images.csv"
metadata = pd.read_csv(file_path)

# Group by study_id: the image/view columns are merged into lists of unique
# non-missing values, and the first available text of the group is kept.
grouped = metadata.groupby('study_id').agg({
    'image': lambda x: merge_unique(x),
    'view': lambda x: merge_unique(x),
    'AP': lambda x: merge_unique(x),
    'PA': lambda x: merge_unique(x),
    'Lateral': lambda x: merge_unique(x),
    'text': 'first'
}).reset_index()
# Keep only studies that have report text.
grouped = grouped[grouped['text'].notna()]

# Save the grouped data to a new CSV file.
output_file = 'datasets/mimic-cxr-jpg/mimic-cxr-grouped-metadata.csv'
grouped.to_csv(output_file, index=True)
print(f"Saved grouped metadata to {output_file}")

In [None]:
# Load the grouped (one row per study) metadata and the official split file.
metadata = pd.read_csv('datasets/mimic-cxr-jpg/mimic-cxr-grouped-metadata.csv')
split_data = pd.read_csv('datasets/mimic-cxr-jpg/mimic-cxr-2.0.0-split.csv')

# `metadata` has one row per study, while the split file is expected to list one
# row per image (TODO confirm against your copy). Merging the two directly on
# study_id would then repeat every multi-image study once per image, so reduce
# the split file to one (study_id, split) row per study first. If the file is
# already one row per study this is a no-op.
study_split = split_data[['study_id', 'split']].drop_duplicates()
merged_data = pd.merge(metadata, study_split, on='study_id')

# Partition into train / validate / test.
train_data = merged_data[merged_data['split'] == 'train']
valid_data = merged_data[merged_data['split'] == 'validate']
test_data = merged_data[merged_data['split'] == 'test']

cols_to_save = ['image', 'view', 'AP', 'PA', 'Lateral', 'text']

# Write the three splits.
train_data[cols_to_save].to_csv('datasets/mimic-cxr-jpg/mimic_train.csv', index=True)
valid_data[cols_to_save].to_csv('datasets/mimic-cxr-jpg/mimic_valid.csv', index=True)
test_data[cols_to_save].to_csv('datasets/mimic-cxr-jpg/mimic_test.csv', index=True)

# Remove the intermediate CSV files written by the two previous cells.
# new_metadata_path and output_file are defined in those cells, so they must
# have been run in this kernel session.
for intermediate_path in (new_metadata_path, output_file):
    if os.path.exists(intermediate_path):
        os.remove(intermediate_path)
        print(f"{intermediate_path} has been deleted.")
    else:
        print(f"{intermediate_path} does not exist.")

print("Files saved successfully: mimic_train.csv, mimic_valid.csv, mimic_test.csv")

## mimic-cxr-jpg backtranslation增强文本数据

In [None]:
!python "text_augmentation/back_translation.py"

# 2.安装依赖

In [3]:
!pip install -r "Model/image-classification-model/requirements.txt"

Collecting antlr4-python3-runtime==4.9.3 (from -r Model/image-classification-model/requirements.txt (line 3))
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting argon2-cffi==21.3.0 (from -r Model/image-classification-model/requirements.txt (line 5))
  Downloading argon2_cffi-21.3.0-py3-none-any.whl (14 kB)
Collecting arrow==1.2.3 (from -r Model/image-classification-model/requirements.txt (line 7))
  Downloading arrow-1.2.3-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asttokens==2.0.5 (from -r Model/image-classification-model/requirements.txt (line 8))
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting async-lru==2.0.3 (from -r Model/image-classification-mo

# 3.训练模型

In [3]:
!python "Model/image-classification-model/train.py" --config-name train \
  dataloader=dataloader_32 scheduler=cosine_epoch15_warmup1

# output folder: Model/image-classification-model/runs/backbone/outputs/

2024-07-02 12:25:19.278362: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 12:25:19.278423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 12:25:19.390785: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-02 12:25:19.608067: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2024-07-02 12:25:43,143][__main__][INFO] - Configura

# 4.训练Finetune Classifier

In [7]:
!python "Model/image-classification-model/finetune.py" --config-name finetune_100 \
  data_train=rsna_pneumonia data_valid=rsna_pneumonia \
  dataloader=dataloader_32 scheduler=cosine_epoch5_warmup1 \
  model.load_backbone_weights="Model/image-classification-model/model/backbone/model-best.tar"

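# output folder: Model/image-classification-model/runs/finetune_10/outputs
# NOTE(review): the command above uses --config-name finetune_100 while this path says finetune_10 — confirm which folder the run actually writes to.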

2024-07-02 14:33:49.352350: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 14:33:49.352425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 14:33:49.354815: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-02 14:33:49.365851: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2024-07-02 14:33:57,490][__main__][INFO] - Configura

# 5.评估模型

In [8]:
!python "Model/image-classification-model/evaluate_clip.py" dataloader=dataloader_32 \
  test.checkpoint="Model/image-classification-model/model/backbone/model-best.tar"

# output folder: Model/image-classification-model/evaluation/backbone/outputs

2024-07-02 14:49:15.777554: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 14:49:15.777606: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 14:49:15.779234: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-02 14:49:15.787236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Model/image-classification-model/model/backbone/model

In [9]:
!python "Model/image-classification-model/evaluate_finetune.py" dataloader=dataloader_32 \
  test.checkpoint="Model/image-classification-model/model/finetune/model-best.tar"

# output folder: Model/image-classification-model/evaluation/finetune/outputs

2024-07-02 14:50:50.763337: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 14:50:50.763399: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 14:50:50.764906: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-02 14:50:50.772461: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Model/image-classification-model/model/finetune/model