# Pre-Processing of the Images of the CBIS-DDSM Dataset

The CBIS-DDSM (Curated Breast Imaging Subset of DDSM) is an updated and standardized version of the Digital Database for Screening Mammography (DDSM). The DDSM is a database of 2,620 scanned film mammography studies and contains normal, benign, and malignant cases with verified pathology information.
The dataset is publicly available for download at: https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=22516629

In [None]:
import matplotlib.pyplot as plt
import pydicom
import numpy as np
import matplotlib
import cv2
import os
import cv2 as cv
from PIL import Image
from glob import glob
# Placeholder path ("**"); this name is not read by any code visible in this file.
directory = "**"

In [None]:
# Setting the dataset directory
thisdir = "**"

# Recursively collect the path of every DICOM (.dcm) file below `thisdir`.
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(thisdir):
    for file in f:
        if file.endswith(".dcm"):
            files.append(os.path.join(r, file))

# Size the output from the number of files actually found rather than a
# hard-coded count, so a different dataset size neither overflows the array
# nor leaves unused all-zero rows at its end.
n_files = len(files)
images = np.zeros((n_files, 128, 128, 1))
labels = []

tt = 0  # number of images processed so far (stays 0 when no file was found)
for tt, k in enumerate(files, start=1):
    dataset = pydicom.dcmread(k)
    d = dataset.pixel_array

    # Downsample to 128x128, then switch to float so the min/max arithmetic
    # below cannot wrap around in a narrow integer pixel type.
    resized = cv2.resize(d, (128, 128), interpolation=cv2.INTER_AREA)
    resized = resized.astype(np.float64)

    # Min-max normalisation to the range [0, 255].
    lowest = np.amin(resized)
    highest = np.amax(resized)
    if highest > lowest:
        normalized_d = (resized - lowest) / (highest - lowest) * 255
    else:
        # Constant image: the range is zero, so map it to all zeros instead
        # of dividing by zero and storing NaNs.
        normalized_d = np.zeros(resized.shape)

    # Add the trailing channel axis expected by `images`.
    images[tt - 1] = normalized_d[..., np.newaxis]

    # The label is the 7th backslash-separated component of the file path.
    # NOTE(review): this assumes Windows-style paths and a fixed directory
    # depth below the dataset root - confirm against the value of `thisdir`.
    labels.append(k.split("\\")[6])

    if tt % 100 == 0:
        print('Saving image: {} of {}'.format(tt, n_files))

In [None]:
# Persist the image stack and its labels as two separate .npy files
# in the current working directory.
for npy_name, payload in (
    ("CBIS_DDSM.npy", images),
    ("CBIS_DDSM_labels.npy", labels),
):
    np.save(npy_name, payload)

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("CBIS_DDSM_description.csv") #Concatenation of all the csv files provided by the dataset

labels = np.load("CBIS_DDSM_labels.npy")

# Position of the first occurrence of every image label, built once so that
# each CSV row is matched with a dictionary lookup instead of a full scan of
# `labels`.
label_to_index = {}
for position, label in enumerate(labels.tolist()):
    label_to_index.setdefault(label, position)

# One slot per stored image; slot i receives the metadata of the image whose
# label is labels[i], so the resulting table is ordered like the image array.
labels_csv = [None] * len(labels)

rows = zip(
    df["image file path"],
    df["abnormality id"],
    df["abnormality type"],
    df["pathology"],
    df["breast_density"],
    df["assessment"],
    df["subtlety"],
)
for image_path, abnormality_id, abnormality, pathology, density, birads, subtlety in rows:
    # Image name = first component of the image file path + "_" + abnormality id.
    l_csv = image_path.split("/")[0] + '_' + str(abnormality_id)

    if l_csv not in label_to_index:
        # Same exception type as the failed array lookup this replaces, but
        # with a message that names the offending row.
        raise IndexError(
            "No image label matches CSV entry '{}'".format(l_csv)
        )
    ind = label_to_index[l_csv]

    # Only the part of the pathology string before the first "_" is kept.
    labels_csv[ind] = [
        l_csv,
        abnormality,
        pathology.split("_")[0],
        density,
        birads,
        subtlety,
    ]

# NOTE(review): only these six columns are written to the clean CSV; confirm
# that every column read from it later in the file is actually produced here.
df_new = pd.DataFrame(
    labels_csv,
    columns=["name", "abnormality", "pathology", "density", "birads", "subtlety"],
)
df_new.to_csv('CBIS_DDSM_description_clean.csv')

df = pd.read_csv("CBIS_DDSM_description_clean.csv")

# The concept table starts with the image name; one-hot columns are appended.
new_concepts = df[["name"]]

# Columns to one-hot encode. Each resulting column is named
# "<column>_<value>".
concept_columns = [
    "abnormality",
    "pathology",
    "density",
    "birads",
    "subtlety",
    "breast",
    "image_view",
    "mass_shape",
    "mass_margins",
    "calc_type",
    "calc_distribution",
]

# Encode only the columns that exist in the clean CSV and report the rest,
# instead of aborting with a KeyError on the first absent column.
missing = [k for k in concept_columns if k not in df.columns]
if missing:
    print(
        "Skipping columns not found in CBIS_DDSM_description_clean.csv: {}".format(
            ", ".join(missing)
        )
    )

# `prefix=k` yields the "<column>_<value>" names directly.
encoded = [
    pd.get_dummies(df[k], prefix=k)
    for k in concept_columns
    if k in df.columns
]
new_concepts = pd.concat([new_concepts] + encoded, axis=1)

new_concepts.to_csv('CBIS_DDSM_description_all_concepts.csv')