In [None]:
import numpy as np
import pydicom
import pandas as pd
import os
import sys
import matplotlib.pyplot as pl
from report_parser import parse_report
from PIL import Image
from tqdm.notebook import tqdm
from collections import defaultdict
import pickle
import gzip
import random
import spacy
import gensim, logging
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
import torchvision
from google.cloud import storage
from collections import Counter
from pathlib import Path
from IPython.display import display 

from report_parser import parse_report

# Sentinel token strings. They are not referenced by any cell visible in this
# notebook. NOTE(review): presumably padding / unknown-word markers for report
# text - confirm against the code that consumes them.
PAD_CHAR = '**PAD**'
UNK_CHAR = '**UNK**'

# Base directory joined onto 'cxr-record-list.csv.gz' and onto every
# 'dicom_path' value. While it is the empty string, os.path.join returns the
# other component unchanged, so those paths resolve against the working directory.
dataset_file_path = ''
# Base directory joined onto the df_master_{train,val,test}.csv files and onto
# the per-image 'processed_dicom_path' files written by the extraction cells.
local_file_path = ''

In [None]:
# Read the gzipped record list and expose its 'path' column as 'dicom_path',
# the column name the image-loading loop reads.
records_csv = os.path.join(dataset_file_path, 'cxr-record-list.csv.gz')
df_records = pd.read_csv(records_csv).rename(columns={'path': 'dicom_path'})
print(df_records)

In [None]:
# Load a few images to spot check
n = 3  # number of records to display
resize = torchvision.transforms.Resize(256)
crop = torchvision.transforms.CenterCrop(256)

# Take the first n rows up front. The previous `if index > n: break` ran after
# the display step, so it showed n + 2 images and only terminated correctly
# when the frame's index labels happened to be 0..N-1.
spot_check_rows = df_records.head(n)
for _, row in tqdm(spot_check_rows.iterrows(), total=spot_check_rows.shape[0]):
    dicom_path = os.path.join(dataset_file_path, row['dicom_path'])
    plan = pydicom.dcmread(dicom_path, stop_before_pixels=False)
    view_position = plan.ViewPosition
    pixels = plan.pixel_array
    print('The image has {} x {} voxels'.format(pixels.shape[0], pixels.shape[1]))

    # Rescale intensities to 0..255 for an 8-bit PIL image; fall back to a
    # divisor of 1 so an all-zero image does not divide by zero.
    peak = pixels.max()
    image = Image.fromarray(np.uint8(pixels / (peak if peak > 0 else 1) * 255))
    resized_image = crop(resize(image))

    print('\t-----------')
    print('\tPatient ID:', plan.PatientID)
    print('\tView Position:', view_position)
    # One figure with both versions side by side, so the requested figsize
    # applies to the original and to the resized/cropped image alike.
    fig, (ax_full, ax_crop) = pl.subplots(1, 2, figsize=(18, 9))
    ax_full.imshow(image, cmap='gray')
    ax_full.set_title('Original')
    ax_crop.imshow(resized_image, cmap='gray')
    ax_crop.set_title('Resized + center-cropped (256 x 256)')
    pl.show()


In [None]:
# (status message, csv file name) for each split, in loading order.
split_sources = (
    ('Loading train dataframe...', 'df_master_train.csv'),
    ('Loading val dataframe...', 'df_master_val.csv'),
    ('Loading test dataframe...', 'df_master_test.csv'),
)

loaded_frames = []
for status_message, csv_name in split_sources:
    print(status_message)
    datasetPath = os.path.join(local_file_path, csv_name)
    loaded_frames.append(pd.read_csv(datasetPath))
df_master_train, df_master_val, df_master_test = loaded_frames

print('Finished loading dataframes')
# Quick look at the schema and the first training row.
print(df_master_train.columns)
print(df_master_train.head(1))

In [None]:
# Fetch torchvision's DenseNet-121 with pretrained weights through torch.hub.
densenet_121 = torch.hub.load('pytorch/vision:v0.5.0', 'densenet121', pretrained=True)
print(densenet_121)

# Build the feature extractor from every child of `features` except the last
# one, then freeze it: eval mode, and no gradients on any parameter.
feature_layers = list(densenet_121.features.children())
model = torch.nn.Sequential(*feature_layers[:-1])
model.eval()
for weight in model.parameters():
    weight.requires_grad = False
print('#################################')
print(model)

In [None]:
# Device index exported through CUDA_VISIBLE_DEVICES for this kernel.
# NOTE(review): the index is machine-specific - confirm it exists on the host.
visible_gpu = "5"
os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpu

In [None]:
# Per-channel statistics passed to Normalize.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing applied to each PIL image before it reaches `model`:
# resize, 256x256 center crop, tensor conversion, per-channel normalisation.
preprocessing_steps = [
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(256),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=channel_mean, std=channel_std),
]
image_pipeline = torchvision.transforms.Compose(preprocessing_steps)


print('EXTRACTING VIEWS AND IMAGE FEATURES FOR VALIDATION IMAGES')
# Output path per image: the last three characters of dicom_path are replaced
# with 'np' (vectorised string slicing instead of a row-wise apply).
df_master_val['processed_dicom_path'] = df_master_val['dicom_path'].str[:-3] + 'np'

batch_size = 64
views = []   # ViewPosition of every image read, in row order
images = []
paths = []
if torch.cuda.is_available():
    # Moving the model is loop-invariant, so do it once instead of per row.
    model.to('cuda')
    n_rows = df_master_val.shape[0]
    # Inference only: no_grad keeps autograd from tracking anything.
    with torch.no_grad():
        # Walk the frame in slices of batch_size. The last slice may be shorter
        # and is still pushed through the model, so every row gets a feature
        # file. Previously a batch was only flushed when it held exactly 64
        # images, which silently dropped the trailing n_rows % 64 images.
        for start in tqdm(range(0, n_rows, batch_size)):
            chunk = df_master_val.iloc[start:start + batch_size]
            images = []
            paths = []
            for _, row in chunk.iterrows():
                dicom_path = os.path.join(dataset_file_path, row['dicom_path'])
                save_path = os.path.join(local_file_path, row['processed_dicom_path'])
                Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True)
                plan = pydicom.dcmread(dicom_path, stop_before_pixels=False)
                views.append(plan.ViewPosition)
                pixels = plan.pixel_array
                # Rescale to 0..255; a divisor of 1 avoids dividing by zero
                # when the image is entirely zero.
                peak = pixels.max()
                scaled = pixels / (peak if peak > 0 else 1) * 255
                image = Image.fromarray(np.uint8(scaled)).convert("RGB")
                images.append(image_pipeline(image).to('cuda'))
                paths.append(save_path)

            output = model(torch.stack(images, dim=0))
            # Copy each sample to the CPU on its own before saving it.
            for features, save_path in zip(output, paths):
                torch.save(features.to(device=torch.device("cpu")), save_path)
else:
    # Same message as before, but checked once before any DICOM is read.
    print('ERROR GPU UNAVAILABLE')


In [None]:
# Tally how many validation images were recorded under each ViewPosition value.
view_counts = Counter(views)
print(view_counts)

In [None]:
# Attach the collected ViewPosition values as a new column, then write the
# validation frame back to its csv under local_file_path.
df_master_val['dicom_view'] = views

datasetPath = os.path.join(local_file_path, 'df_master_val.csv')
print('Saving val dataframe...')
df_master_val.to_csv(path_or_buf=datasetPath, index=False)
print('Dataframe saved')

In [None]:
print('EXTRACTING VIEWS AND IMAGE FEATURES FOR TESTING IMAGES')
# Output path per image: the last three characters of dicom_path are replaced
# with 'np' (vectorised string slicing instead of a row-wise apply).
df_master_test['processed_dicom_path'] = df_master_test['dicom_path'].str[:-3] + 'np'

batch_size = 64
views = []   # ViewPosition of every image read, in row order
images = []
paths = []
if torch.cuda.is_available():
    # Moving the model is loop-invariant, so do it once instead of per row.
    model.to('cuda')
    n_rows = df_master_test.shape[0]
    # Inference only: no_grad keeps autograd from tracking anything.
    with torch.no_grad():
        # Walk the frame in slices of batch_size. The last slice may be shorter
        # and is still pushed through the model, so every row gets a feature
        # file. Previously a batch was only flushed when it held exactly 64
        # images, which silently dropped the trailing n_rows % 64 images.
        for start in tqdm(range(0, n_rows, batch_size)):
            chunk = df_master_test.iloc[start:start + batch_size]
            images = []
            paths = []
            for _, row in chunk.iterrows():
                dicom_path = os.path.join(dataset_file_path, row['dicom_path'])
                save_path = os.path.join(local_file_path, row['processed_dicom_path'])
                Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True)
                plan = pydicom.dcmread(dicom_path, stop_before_pixels=False)
                views.append(plan.ViewPosition)
                pixels = plan.pixel_array
                # Rescale to 0..255; a divisor of 1 avoids dividing by zero
                # when the image is entirely zero.
                peak = pixels.max()
                scaled = pixels / (peak if peak > 0 else 1) * 255
                image = Image.fromarray(np.uint8(scaled)).convert("RGB")
                images.append(image_pipeline(image).to('cuda'))
                paths.append(save_path)

            output = model(torch.stack(images, dim=0))
            # Copy each sample to the CPU on its own before saving it.
            for features, save_path in zip(output, paths):
                torch.save(features.to(device=torch.device("cpu")), save_path)
else:
    # Same message as before, but checked once before any DICOM is read.
    print('ERROR GPU UNAVAILABLE')

In [None]:
# Tally how many test images were recorded under each ViewPosition value.
view_counts = Counter(views)
print(view_counts)

In [None]:
# Attach the collected ViewPosition values as a new column, then write the
# test frame back to its csv under local_file_path.
df_master_test['dicom_view'] = views

datasetPath = os.path.join(local_file_path, 'df_master_test.csv')
print('Saving test dataframe...')
df_master_test.to_csv(path_or_buf=datasetPath, index=False)
print('Dataframe saved')

In [None]:
print('EXTRACTING VIEWS AND IMAGE FEATURES FOR TRAINING IMAGES')
# Output path per image: the last three characters of dicom_path are replaced
# with 'np' (vectorised string slicing instead of a row-wise apply).
df_master_train['processed_dicom_path'] = df_master_train['dicom_path'].str[:-3] + 'np'

batch_size = 64
views = []   # ViewPosition of every image read, in row order
images = []
paths = []
if torch.cuda.is_available():
    # Moving the model is loop-invariant, so do it once instead of per row.
    model.to('cuda')
    n_rows = df_master_train.shape[0]
    # Inference only: no_grad keeps autograd from tracking anything.
    with torch.no_grad():
        # Walk the frame in slices of batch_size. The last slice may be shorter
        # and is still pushed through the model, so every row gets a feature
        # file. Previously a batch was only flushed when it held exactly 64
        # images, which silently dropped the trailing n_rows % 64 images.
        for start in tqdm(range(0, n_rows, batch_size)):
            chunk = df_master_train.iloc[start:start + batch_size]
            images = []
            paths = []
            for _, row in chunk.iterrows():
                dicom_path = os.path.join(dataset_file_path, row['dicom_path'])
                save_path = os.path.join(local_file_path, row['processed_dicom_path'])
                Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True)
                plan = pydicom.dcmread(dicom_path, stop_before_pixels=False)
                views.append(plan.ViewPosition)
                pixels = plan.pixel_array
                # Rescale to 0..255; a divisor of 1 avoids dividing by zero
                # when the image is entirely zero.
                peak = pixels.max()
                scaled = pixels / (peak if peak > 0 else 1) * 255
                image = Image.fromarray(np.uint8(scaled)).convert("RGB")
                images.append(image_pipeline(image).to('cuda'))
                paths.append(save_path)

            output = model(torch.stack(images, dim=0))
            # Copy each sample to the CPU on its own before saving it.
            for features, save_path in zip(output, paths):
                torch.save(features.to(device=torch.device("cpu")), save_path)
else:
    # Same message as before, but checked once before any DICOM is read.
    print('ERROR GPU UNAVAILABLE')

In [None]:
# Tally how many training images were recorded under each ViewPosition value.
view_counts = Counter(views)
print(view_counts)

In [None]:
# Attach the collected ViewPosition values as a new column, then write the
# training frame back to its csv under local_file_path.
df_master_train['dicom_view'] = views

datasetPath = os.path.join(local_file_path, 'df_master_train.csv')
print('Saving train dataframe...')
df_master_train.to_csv(path_or_buf=datasetPath, index=False)
print('Dataframe saved')