# Dicom Metadata
The following changes are made to the metadata file:
1. standardize column names to lower case with underscores instead of spaces.
2. Add a `fileset` column, which contains 'test' or 'train'.
3. Add a casetype column that will indicate 'calc' or 'mass' case.
4. Save the revised metadata in data/dev/CBIS-DDSM/metadata.csv and data/prod/CBIS-DDSM/metadata.csv

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Input: the raw metadata CSV that is loaded and revised in this notebook.
source = "data/raw/CBIS-DDSM-All-doiJNLP-zzWs5zfZ/metadata.csv"
# Outputs: the revised metadata is written to both the dev and the prod location.
dest1 = "data/dev/CBIS-DDSM/metadata.csv"
dest2 = "data/prod/CBIS-DDSM/metadata.csv"

In [3]:
# Ensure the parent directory of each output file exists before anything is written;
# exist_ok=True makes this a no-op when the directory is already there.
for target in (dest1, dest2):
    os.makedirs(os.path.dirname(target), exist_ok=True)

## Standardize Columns

In [4]:
# Load the raw metadata with a default integer index (no column is used as the index)
# and print a summary of its columns, non-null counts, and dtypes.
df = pd.read_csv(source, index_col=None)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6775 entries, 0 to 6774
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Series UID            6775 non-null   object 
 1   Collection            6775 non-null   object 
 2   3rd Party Analysis    0 non-null      float64
 3   Data Description URI  6775 non-null   object 
 4   Subject ID            6775 non-null   object 
 5   Study UID             6775 non-null   object 
 6   Study Description     0 non-null      float64
 7   Study Date            6775 non-null   object 
 8   Series Description    6775 non-null   object 
 9   Manufacturer          0 non-null      float64
 10  Modality              6775 non-null   object 
 11  SOP Class Name        6775 non-null   object 
 12  SOP Class UID         6775 non-null   object 
 13  Number of Images      6775 non-null   int64  
 14  File Size             6775 non-null   object 
 15  File Location        

In [5]:
# Normalize column labels to snake_case (spaces -> underscores, lower-cased) and
# remove the three columns that hold no data (0 non-null values in the raw file).
# The result is rebound to `df` instead of being mutated in place, and
# errors="ignore" lets this cell be re-run on an already-cleaned frame without
# raising KeyError for columns that were dropped on a previous run.
df = (
    df.rename(columns=lambda col: col.replace(" ", "_").lower())
      .drop(
          columns=['3rd_party_analysis', 'study_description', 'manufacturer'],
          errors="ignore",
      )
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6775 entries, 0 to 6774
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   series_uid            6775 non-null   object
 1   collection            6775 non-null   object
 2   data_description_uri  6775 non-null   object
 3   subject_id            6775 non-null   object
 4   study_uid             6775 non-null   object
 5   study_date            6775 non-null   object
 6   series_description    6775 non-null   object
 7   modality              6775 non-null   object
 8   sop_class_name        6775 non-null   object
 9   sop_class_uid         6775 non-null   object
 10  number_of_images      6775 non-null   int64 
 11  file_size             6775 non-null   object
 12  file_location         6775 non-null   object
 13  download_timestamp    6775 non-null   object
dtypes: int64(1), object(13)
memory usage: 741.1+ KB


## Add Case Type and Set Type

In [6]:
# Preview the first rows; note that subject_id values (e.g. "Calc-Test_P_...")
# carry the text used below to derive the casetype and fileset columns.
df.head()

Unnamed: 0,series_uid,collection,data_description_uri,subject_id,study_uid,study_date,series_description,modality,sop_class_name,sop_class_uid,number_of_images,file_size,file_location,download_timestamp
0,1.3.6.1.4.1.9590.100.1.2.294445047912407030012...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_01004_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.265198230512455509519...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,23.65 MB,./CBIS-DDSM/Calc-Test_P_01004_LEFT_MLO_1/08-29...,2023-05-24T02:45:44.375
1,1.3.6.1.4.1.9590.100.1.2.188613955710170417803...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.291121996131431385353...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14.62 MB,./CBIS-DDSM/Calc-Test_P_00038_LEFT_MLO_1/08-29...,2023-05-24T02:45:45.621
2,1.3.6.1.4.1.9590.100.1.2.399466258212646932018...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00041_LEFT_MLO_2,1.3.6.1.4.1.9590.100.1.2.372962290011068589008...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,22.93 MB,./CBIS-DDSM/Calc-Test_P_00041_LEFT_MLO_2/08-29...,2023-05-24T02:45:46.7
3,1.3.6.1.4.1.9590.100.1.2.419081637812053404913...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC_1,1.3.6.1.4.1.9590.100.1.2.161465562211359959230...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14.06 MB,./CBIS-DDSM/Calc-Test_P_00038_LEFT_CC_1/08-29-...,2023-05-24T02:45:47.806
4,1.3.6.1.4.1.9590.100.1.2.284547955212024760928...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00485_LEFT_CC_3,1.3.6.1.4.1.9590.100.1.2.188175119112669404616...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,24.81 MB,./CBIS-DDSM/Calc-Test_P_00485_LEFT_CC_3/08-29-...,2023-05-24T02:45:49.07


In [7]:
# Derive both labels from the text of subject_id (e.g. "Calc-Test_P_01004_LEFT_MLO_1").
# Rows whose id contains 'Calc' are labelled 'calc', everything else 'mass';
# rows whose id contains 'Test' are labelled 'test', everything else 'train'.
is_calc_case = df["subject_id"].str.contains('Calc')
is_test_case = df["subject_id"].str.contains('Test')

df['casetype'] = np.where(is_calc_case, 'calc', 'mass')
df['fileset'] = np.where(is_test_case, 'test', 'train')

In [8]:
# Inspect the last rows, which now include the added casetype and fileset columns.
df.tail()

Unnamed: 0,series_uid,collection,data_description_uri,subject_id,study_uid,study_date,series_description,modality,sop_class_name,sop_class_uid,number_of_images,file_size,file_location,download_timestamp,casetype,fileset
6770,1.3.6.1.4.1.9590.100.1.2.196313365713001401409...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Training_P_01688_RIGHT_MLO_1,1.3.6.1.4.1.9590.100.1.2.102057887611529931731...,09-06-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,22.42 MB,./CBIS-DDSM/Calc-Training_P_01688_RIGHT_MLO_1/...,2023-05-24T07:39:15.357,calc,train
6771,1.3.6.1.4.1.9590.100.1.2.130519811104880269404...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_00859_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.387387244313639850632...,07-20-2016,full mammogram images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,31.97 MB,./CBIS-DDSM/Mass-Training_P_00859_LEFT_CC/07-2...,2023-05-24T07:39:18.582,mass,train
6772,1.3.6.1.4.1.9590.100.1.2.176054613511449460831...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_01144_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.837997121261132471647...,07-20-2016,full mammogram images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,42.05 MB,./CBIS-DDSM/Mass-Training_P_01144_LEFT_MLO/07-...,2023-05-24T07:39:23.317,mass,train
6773,1.3.6.1.4.1.9590.100.1.2.327802325011574904120...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Training_P_00057_RIGHT_MLO_1,1.3.6.1.4.1.9590.100.1.2.329745178124734097250...,07-21-2016,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,12.79 MB,./CBIS-DDSM/Mass-Training_P_00057_RIGHT_MLO_1/...,2023-05-24T07:39:24.52,mass,train
6774,1.3.6.1.4.1.9590.100.1.2.300329088011365070913...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_01179_LEFT_MLO_2,1.3.6.1.4.1.9590.100.1.2.887068612419240106317...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,12.00 MB,./CBIS-DDSM/Calc-Test_P_01179_LEFT_MLO_2/08-29...,2023-05-24T07:39:25.645,calc,test


In [9]:
# Write the revised metadata to both output locations.
# index=False keeps the DataFrame's integer index out of the CSV files.
for destination in (dest1, dest2):
    df.to_csv(destination, index=False)

In [10]:
# Read the saved dev copy back from disk and preview it, as a sanity check that
# the file was written and includes the casetype and fileset columns.
df2 = pd.read_csv(dest1, index_col=None)
df2.head()

Unnamed: 0,series_uid,collection,data_description_uri,subject_id,study_uid,study_date,series_description,modality,sop_class_name,sop_class_uid,number_of_images,file_size,file_location,download_timestamp,casetype,fileset
0,1.3.6.1.4.1.9590.100.1.2.294445047912407030012...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_01004_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.265198230512455509519...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,23.65 MB,./CBIS-DDSM/Calc-Test_P_01004_LEFT_MLO_1/08-29...,2023-05-24T02:45:44.375,calc,test
1,1.3.6.1.4.1.9590.100.1.2.188613955710170417803...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.291121996131431385353...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14.62 MB,./CBIS-DDSM/Calc-Test_P_00038_LEFT_MLO_1/08-29...,2023-05-24T02:45:45.621,calc,test
2,1.3.6.1.4.1.9590.100.1.2.399466258212646932018...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00041_LEFT_MLO_2,1.3.6.1.4.1.9590.100.1.2.372962290011068589008...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,22.93 MB,./CBIS-DDSM/Calc-Test_P_00041_LEFT_MLO_2/08-29...,2023-05-24T02:45:46.7,calc,test
3,1.3.6.1.4.1.9590.100.1.2.419081637812053404913...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC_1,1.3.6.1.4.1.9590.100.1.2.161465562211359959230...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14.06 MB,./CBIS-DDSM/Calc-Test_P_00038_LEFT_CC_1/08-29-...,2023-05-24T02:45:47.806,calc,test
4,1.3.6.1.4.1.9590.100.1.2.284547955212024760928...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00485_LEFT_CC_3,1.3.6.1.4.1.9590.100.1.2.188175119112669404616...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,24.81 MB,./CBIS-DDSM/Calc-Test_P_00485_LEFT_CC_3/08-29-...,2023-05-24T02:45:49.07,calc,test
