In [3]:
# Use the %pip magic (not !pip) so packages are installed into the environment of the running kernel.
%pip install -r requirements.txt

Collecting torch (from -r requirements.txt (line 1))
  Using cached torch-2.5.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting torchvision (from -r requirements.txt (line 2))
  Using cached torchvision-0.20.1-cp310-cp310-win_amd64.whl.metadata (6.2 kB)
Collecting scikit-learn (from -r requirements.txt (line 3))
  Using cached scikit_learn-1.5.2-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting scipy (from -r requirements.txt (line 4))
  Using cached scipy-1.14.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting numpy (from -r requirements.txt (line 5))
  Downloading numpy-2.1.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting Pillow (from -r requirements.txt (line 6))
  Using cached pillow-11.0.0-cp310-cp310-win_amd64.whl.metadata (9.3 kB)
Collecting tqdm (from -r requirements.txt (line 7))
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting mock (from -r requirements.txt (line 8))
  Using cached mock-5.1.0-py3-none-any.whl.metadata (3.0 kB)
Coll

In [21]:
import pandas as pd
import os
import glob
from PIL import Image

# Data Preprocessing

In [None]:
# ONLY DO THIS ONCE. It took 78 minutes on my machine
#
# Walks the JPEG tree under `input_directory` and writes a PNG copy of every
# image to `output_directory`, mirroring the sub-folder layout.

input_directory = 'raw_data/jpeg/'
output_directory = 'raw_data/png16/'

os.makedirs(output_directory, exist_ok=True)

for foldername, subfolders, filenames in os.walk(input_directory):
    # The destination folder depends only on the folder being walked, so
    # compute it once per folder instead of once per file.
    relative_path = os.path.relpath(foldername, input_directory)
    output_folder = os.path.join(output_directory, relative_path)

    for filename in filenames:
        # Compare case-insensitively so '.JPG' / '.JPEG' files are not skipped.
        if not filename.lower().endswith(('.jpg', '.jpeg')):
            continue

        # Only create the mirrored folder when it will actually receive a file.
        os.makedirs(output_folder, exist_ok=True)

        img_path = os.path.join(foldername, filename)
        png16_filename = os.path.splitext(filename)[0] + '.png'
        png16_save_path = os.path.join(output_folder, png16_filename)

        # The context manager closes the source file handle after each image;
        # without it thousands of handles stay open during the walk.
        with Image.open(img_path) as img:
            # NOTE(review): Pillow documents the PNG `bits` option for palette
            # ('P') images only - confirm the written files are really 16-bit.
            img.save(png16_save_path, format='PNG', bits=16)

print("Conversion completed.")

Conversion completed.


In [26]:
# Machine-specific absolute locations of the converted PNGs and the case
# description CSVs. Both sit under one raw-data root and keep a trailing slash
# because the CSV path is later built by plain string concatenation.
_raw_data_root = "C:/Users/jbber/.vscode/LightMirai/raw_data/"
img_base_path = _raw_data_root + "png16/"
csv_base_path = _raw_data_root + "csv/"

def process_data(filepath, output_path=None):
    """Load a case-description CSV and reduce it to the columns used downstream.

    Args:
        filepath: Path of the CSV to read with ``pd.read_csv``.
        output_path: Optional destination; when given, the processed frame is
            also written there as CSV (without the index) and a message is printed.

    Returns:
        A DataFrame with the columns ``patient_id``, ``exam_id``, ``laterality``,
        ``view``, ``file_path`` and ``mask_path``.
    """
    raw = pd.read_csv(filepath)

    # Resolve the on-disk mammogram / mask image for every row.
    resolved_paths = raw.apply(map_image_paths, axis=1)
    raw[['file_path', 'mask_path']] = resolved_paths

    # Running 0-based counter of rows per patient, in file order.
    exam_counter = raw.groupby('patient_id').cumcount()

    # First character of the (stripped) side label, upper-cased.
    side_text = raw['left or right breast'].astype(str)
    side_initial = side_text.str.strip().str[0].str.upper()

    selected_columns = {
        'patient_id': raw['patient_id'],
        'exam_id': exam_counter,
        'laterality': side_initial,
        'view': raw['image view'],
        'file_path': raw['file_path'],
        'mask_path': raw['mask_path'],
    }
    processed_data = pd.DataFrame(selected_columns)

    if output_path is None:
        return processed_data

    processed_data.to_csv(output_path, index=False)
    print(f"Processed data saved to {output_path}")
    return processed_data

def _first_matching_png(folder_path, prefix):
    """Return the first '<prefix>-*.png' in `folder_path` whose last dash-separated
    name part is exactly 7 characters long, or None when nothing matches."""
    # glob returns matches in arbitrary order; sorting makes the pick
    # reproducible when several files qualify.
    candidates = sorted(glob.glob(os.path.join(folder_path, f"{prefix}-*.png")))
    return next(
        (f for f in candidates if len(os.path.basename(f).split('-')[-1]) == 7),
        None
    )


def map_image_paths(row, base_path=None):
    """Locate the mammogram and mask PNG files that belong to one CSV row.

    The folder is named after the parent directory of the row's
    'ROI mask file path' value and is looked up below `base_path`.

    Args:
        row: Mapping / Series providing the key 'ROI mask file path'.
        base_path: Directory containing the per-case image folders. Defaults
            to the module-level `img_base_path` when omitted.

    Returns:
        A two-element ``pd.Series`` ``[mammogram_file, mask_file]``. Either
        element is None when it cannot be found; a warning is printed in
        that case.
    """
    if base_path is None:
        base_path = img_base_path

    roi_path = row['ROI mask file path']
    # A missing value (e.g. NaN from read_csv) is not a string and would make
    # os.path.dirname raise; report it like the other lookup failures.
    if not isinstance(roi_path, str):
        print(f"Warning: Invalid ROI mask file path {roi_path!r}.")
        return pd.Series([None, None])

    folder_name = os.path.basename(os.path.dirname(roi_path))
    folder_path = os.path.join(base_path, folder_name)

    if not os.path.exists(folder_path):
        print(f"Warning: Folder {folder_path} does not exist.")
        return pd.Series([None, None])

    mammogram_file = _first_matching_png(folder_path, "1")
    mask_file = _first_matching_png(folder_path, "2")

    if not mammogram_file:
        print(f"Warning: No mammogram file (1-*.png) found in {folder_path}.")
    if not mask_file:
        print(f"Warning: No mask file (2-*.png) found in {folder_path}.")

    return pd.Series([mammogram_file, mask_file])

def merge_data(dataframe_1, dataframe_2):
    """Stack two DataFrames with identical columns and drop duplicate rows.

    Args:
        dataframe_1: First frame; its rows come first in the result.
        dataframe_2: Second frame; must have the same columns in the same order.

    Returns:
        A new DataFrame with the combined, de-duplicated rows and a contiguous
        0..n-1 index. Neither input is modified.

    Raises:
        ValueError: If the two frames do not have the same column list.
    """
    if list(dataframe_1.columns) != list(dataframe_2.columns):
        raise ValueError("DataFrames have different columns and cannot be merged.")

    merged_dataframe = pd.concat([dataframe_1, dataframe_2], ignore_index=True)

    # Re-number after de-duplicating; otherwise dropped rows leave gaps in the
    # index that concat(ignore_index=True) had just made contiguous.
    return merged_dataframe.drop_duplicates(ignore_index=True)

# Build one processed frame per case-description CSV (calcification and mass
# cases, each with a train and a test file).
calc_train_data = process_data(os.path.join(csv_base_path, "calc_case_description_train_set.csv"))
calc_test_data = process_data(os.path.join(csv_base_path, "calc_case_description_test_set.csv"))

mass_train_data = process_data(os.path.join(csv_base_path, "mass_case_description_train_set.csv"))
mass_test_data = process_data(os.path.join(csv_base_path, "mass_case_description_test_set.csv"))

# Combine the calcification and mass frames of each split.
train_data = merge_data(calc_train_data, mass_train_data)
test_data = merge_data(calc_test_data, mass_test_data)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("clean_data", exist_ok=True)

# NOTE(review): these writes keep the DataFrame index as an unnamed first
# column, unlike process_data's to_csv(index=False) - confirm that downstream
# readers expect it.
train_data.to_csv("clean_data/train.csv")
test_data.to_csv("clean_data/test.csv")



In [27]:
def summarize_dataframe(df, title):
    """Print a structural overview of `df` under the heading `title`.

    Shows the first and last rows, the info() report, describe() for all
    columns, the column index, shape, dtypes, per-column null counts and the
    number of fully duplicated rows.
    """
    print(f"{title}:\n")

    print(df.head())
    print(df.tail())
    # info() prints its report itself and returns None; wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()
    print(df.describe(include='all'))
    print(df.columns)
    print(df.shape)
    print(df.dtypes)
    print(df.isnull().sum())
    print(f"Number of duplicate rows: {df.duplicated().sum()}")


summarize_dataframe(train_data, "Training Set")
summarize_dataframe(test_data, "Testing Set")


Training Set:

  patient_id  exam_id laterality view  \
0    P_00005        0          R   CC   
1    P_00005        1          R  MLO   
2    P_00007        0          L   CC   
3    P_00007        1          L  MLO   
4    P_00008        0          L   CC   

                                           file_path  \
0  C:/Users/jbber/.vscode/LightMirai/raw_data/png...   
1  C:/Users/jbber/.vscode/LightMirai/raw_data/png...   
2  C:/Users/jbber/.vscode/LightMirai/raw_data/png...   
3  C:/Users/jbber/.vscode/LightMirai/raw_data/png...   
4  C:/Users/jbber/.vscode/LightMirai/raw_data/png...   

                                           mask_path  
0  C:/Users/jbber/.vscode/LightMirai/raw_data/png...  
1  C:/Users/jbber/.vscode/LightMirai/raw_data/png...  
2  C:/Users/jbber/.vscode/LightMirai/raw_data/png...  
3  C:/Users/jbber/.vscode/LightMirai/raw_data/png...  
4  C:/Users/jbber/.vscode/LightMirai/raw_data/png...  
     patient_id  exam_id laterality view  \
2859    P_02033        1   