In [None]:
import numpy as np
import pydicom
import pandas as pd
import os
import matplotlib.pyplot as pl
from report_parser import parse_report
from PIL import Image
from tqdm.notebook import tqdm
from collections import defaultdict
import pickle
import gzip
import random
import spacy
import gensim, logging
from pathlib import Path
from sklearn.model_selection import train_test_split
import csv

from report_parser import parse_report
from google.cloud import storage

# Special vocabulary tokens. They are appended after the Word2Vec vocabulary
# when the word<->index maps are built; UNK_CHAR is also substituted for
# out-of-vocabulary words when the tokenized reports are written to disk.
PAD_CHAR = '**PAD**'
UNK_CHAR = '**UNK**'
START_CHAR = '**START**'
END_CHAR = '**END**'

# Directory locations -- fill these in before running the notebook.
# dataset_file_path: root that holds 'cxr-study-list.csv.gz',
#   'cxr-record-list.csv.gz' and the report files named in the study list.
# local_file_path: output root for embeddings, vocab maps, tokenized reports
#   and the df_master_* CSVs.
# chexpert_csv_file_path: output directory for 'reports.csv' and
#   'report_paths.csv'.
# NOTE(review): tokenized reports are written to local_file_path/<report_path>
# while the raw reports are read from dataset_file_path/<report_path>; if both
# roots are the same (as with these empty defaults) the raw reports get
# overwritten.
dataset_file_path = ''
local_file_path = ''
chexpert_csv_file_path = ''

In [None]:
class MySentences(object):
    """Re-iterable corpus wrapper: each report becomes one list of token strings.

    Every call to ``iter()`` starts a fresh pass over ``reports``, so the
    object can be consumed more than once.
    """

    def __init__(self, reports):
        # Kept as-is; each report must be iterable and yield objects that
        # expose a ``.text`` attribute.
        self.reports = reports

    def __iter__(self):
        for document in self.reports:
            # One flat list of token texts per report.
            words = []
            for tok in document:
                words.append(tok.text)
            yield words

In [None]:
# Load the study list and rename its 'path' column to 'report_path'.
study_list_csv = os.path.join(dataset_file_path, 'cxr-study-list.csv.gz')
df_studies = pd.read_csv(study_list_csv).rename(columns={'path': 'report_path'})
print(list(df_studies.columns))

print('Number of reports: ')
print(df_studies.shape[0])

In [None]:
nlp = spacy.load('en_core_web_sm')

# processed_notes: report_path -> spaCy Doc of the report's findings section.
# report_indices: df_studies index labels of the rows that were kept.
processed_notes = {}
report_indices = []
report_paths = df_studies['report_path']
for index, report_path in tqdm(report_paths.items(), total=len(report_paths)):
    parsed_rep = parse_report(os.path.join(dataset_file_path, report_path))
    # Reports without a findings section are skipped silently.
    if 'findings' not in parsed_rep:
        continue
    findings = parsed_rep['findings']
    if findings and not findings.isspace():
        # NER is disabled for this call; the remaining pipeline still runs.
        processed_notes[report_path] = nlp(findings, disable=['ner'])
        report_indices.append(index)
    else:
        # Section exists but is empty / whitespace only.
        print('Null section')
        print(findings)


In [None]:
print('Number of processed reports: ')
print(len(processed_notes))

print('DROPPING REPORTS WITHOUT FINDINGS')
# Keep exactly the studies whose report was tokenized. Filtering on the report
# path (instead of re-applying positions recorded during parsing) makes this
# cell idempotent: running it again on the already-filtered frame selects the
# same rows rather than mis-indexing or raising IndexError.
has_findings = df_studies['report_path'].isin(list(processed_notes))
df_studies = df_studies[has_findings].reset_index(drop=True)

print('Number of reports with findings: ')
print(len(df_studies))
print(df_studies)

In [None]:
print('Splitting data')
# Split on unique subjects so that all studies of one subject land in the same
# partition. 20% of subjects go to test; 12.5% of the remaining 80% (about 10%
# of all subjects) go to validation.
df_subjects = df_studies[['subject_id']].drop_duplicates()
train_subjects, test_subjects = train_test_split(df_subjects, test_size=0.2, random_state=0)
train_subjects, val_subjects = train_test_split(train_subjects, test_size=0.125, random_state=0)

n_subjects = len(df_subjects)
subject_summary = (
    ('Total\nNumber of subjects: {}\tPercentage: {}', df_subjects),
    ('Train\nNumber of subjects: {}\tPercentage: {}', train_subjects),
    ('Val\nNumber of subjects: {}\tPercentage: {}', val_subjects),
    ('Test\nNumber of subjects: {}\tPercentage: {}', test_subjects),
)
for template, frame in subject_summary:
    print(template.format(len(frame), len(frame) / n_subjects))


def _studies_of(subjects):
    """Return the df_studies rows whose subject_id appears in `subjects`."""
    return df_studies[df_studies.subject_id.isin(subjects.subject_id)]


df_train = _studies_of(train_subjects)
df_val = _studies_of(val_subjects)
df_test = _studies_of(test_subjects)

n_studies = len(df_studies)
study_summary = (
    ('Total\nNumber of studies: {}\tPercentage: {}', df_studies),
    ('Train\nNumber of studies: {}\tPercentage: {}', df_train),
    ('Val\nNumber of studies: {}\tPercentage: {}', df_val),
    ('Test\nNumber of studies: {}\tPercentage: {}', df_test),
)
for template, frame in study_summary:
    print(template.format(len(frame), len(frame) / n_studies))

In [None]:
# Gather the tokenized findings of every training study, in df_train order.
train_notes = [
    processed_notes[report_path]
    for report_path in tqdm(df_train['report_path'], total=df_train.shape[0])
]

report_iter = MySentences(train_notes)

# Sanity check: show the first (up to) five reports next to their token lists.
for position, tokens in zip(range(5), report_iter):
    print('REPORT')
    print(report_iter.reports[position])
    print('REPORT TOKENS')
    print(tokens)
    print(type(tokens[0]))

In [None]:
# INFO-level logging; format is timestamp : level : message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# 256-dimensional embeddings over the training reports; words seen fewer than
# 5 times are left out of the vocabulary.
model = gensim.models.Word2Vec(
    sentences=report_iter,
    size=256,
    min_count=5,
    workers=8,
)



In [None]:
# Persist both the standalone keyed vectors and the full trained model.
word_vectors = model.wv
for artifact, filename in ((word_vectors, 'cxr_vectors.kv'), (model, 'cxr_w2v_model')):
    datasetPath = os.path.join(local_file_path, filename)
    artifact.save(datasetPath)

In [None]:
# Build the final embedding matrix: the trained word vectors followed by one
# row each for UNK, END, START and PAD (in that order).
np.random.seed(0)

weights = model.wv.vectors
unk_ind = weights.shape[0]
# Take the embedding width from the trained vectors instead of repeating the
# training hyper-parameter, so the special rows always match the model.
emb_dim = weights.shape[1]

pad_emb = np.zeros((1, emb_dim), dtype=np.float32)
# Draw order (unk, start, end) is deliberate: changing it would change the
# values produced under the fixed seed.
unk_emb = np.random.randn(1, emb_dim)
start_emb = np.random.randn(1, emb_dim)
end_emb = np.random.randn(1, emb_dim)

print(unk_ind)
print(weights.shape)
print(weights.dtype)
# Row order here must stay UNK, END, START, PAD to line up with the indices
# assigned to the special tokens in the word<->index maps.
weights = np.concatenate(
    (
        weights,
        unk_emb.astype(np.float32),
        end_emb.astype(np.float32),
        start_emb.astype(np.float32),
        pad_emb,
    ),
    axis=0,
)

print(weights.shape)
print(weights.dtype)

datasetPath = os.path.join(local_file_path, 'cxr_w2v.npy')
np.save(datasetPath, weights)

In [None]:
# Print the sentence segmentation of the first processed report (if any).
first_report = next(iter(processed_notes.values()), None)
if first_report is not None:
    for sentence in first_report.sents:
        print(sentence)

In [None]:
# Build word <-> index maps whose indices address rows of the embedding matrix.
word2ind = {}
ind2word = {}
# Enumerate index2word, not the vocab dict: gensim (3.x API, as used throughout
# this notebook) sorts the vocabulary by frequency after the dict is filled, so
# dict iteration order does not match the row order of model.wv.vectors,
# whereas index2word[i] is by definition the word stored in row i.
for idx, key in enumerate(word_vectors.index2word):
    word2ind[key] = idx
    ind2word[idx] = key

# Special tokens follow the real vocabulary in the order UNK, END, START, PAD,
# mirroring the rows appended to the embedding matrix. Deriving the base index
# from the vocabulary size also avoids relying on the loop variable, which
# would be undefined for an empty vocabulary.
vocab_size = len(word_vectors.index2word)
for offset, token in enumerate((UNK_CHAR, END_CHAR, START_CHAR, PAD_CHAR)):
    word2ind[token] = vocab_size + offset
    ind2word[vocab_size + offset] = token

# np.save stores a dict as a pickled object array; loading it back requires
# np.load(..., allow_pickle=True).item().
datasetPath = os.path.join(local_file_path, 'word2ind.npy')
np.save(datasetPath, word2ind)

datasetPath = os.path.join(local_file_path, 'ind2word.npy')
np.save(datasetPath, ind2word)

In [None]:
# Quick visual check of the current df_studies frame.
# NOTE(review): when the notebook is run top to bottom this is the frame after
# reports without findings were dropped -- confirm if cells are run out of order.
print(df_studies)

In [None]:
# Write every tokenized report to disk (one sentence per line, out-of-vocabulary
# words replaced by UNK_CHAR) and collect the flattened reports for the CSVs.
csv_reports = []
paths = []
for report_number, (report_path, report) in enumerate(tqdm(processed_notes.items())):
    # Only the first three reports are echoed, as a sanity check.
    show = report_number < 3
    if show:
        print('Report ' + str(report_number))
        print(report_path)
    file_path = os.path.join(local_file_path, report_path)
    Path(os.path.dirname(file_path)).mkdir(parents=True, exist_ok=True)
    with open(file_path, 'w') as file:
        full_report = []
        for sentence in report.sents:
            if show:
                print(sentence)
            sent = [
                word.text if word.text in word_vectors.vocab else UNK_CHAR
                for word in sentence
            ]
            full_report.extend(sent)
            file.write(' '.join(sent))
            file.write('\n')
            if show:
                print(sent)
        # One single-column CSV row per report; paths are kept in the same order.
        csv_reports.append([' '.join(full_report)])
        paths.append([report_path])
        if show:
            print(full_report)
            print(csv_reports[-1][0])

assert len(csv_reports) == len(paths)

# reports.csv and report_paths.csv are row-aligned: line k of one belongs to
# line k of the other.
csv_outputs = (
    ('SAVING REPORTS TO CSV...', 'reports.csv', csv_reports),
    ('Saving filepath dataframe...', 'report_paths.csv', paths),
)
for message, filename, rows in csv_outputs:
    print(message)
    csv_file_path = os.path.join(chexpert_csv_file_path, filename)
    with open(csv_file_path, 'w', newline='') as csv_file:
        csv.writer(csv_file, quoting=csv.QUOTE_ALL).writerows(rows)

In [None]:
# Load the image record list and rename its 'path' column to 'dicom_path'.
record_list_csv = os.path.join(dataset_file_path, 'cxr-record-list.csv.gz')
df_records = pd.read_csv(record_list_csv).rename(columns={'path': 'dicom_path'})
print(list(df_records.columns))

print('Number of images: ')
print(df_records.shape[0])


def _attach_images(split_frame, unique_msg, matched_msg):
    """Inner-join image records onto one split and report the row counts."""
    matched = df_records.merge(split_frame, how='inner', on=['subject_id', 'study_id'])
    print(unique_msg)
    print(len(split_frame))
    print(matched_msg)
    print(len(matched))
    return matched


df_master_train = _attach_images(
    df_train,
    'Number of unique training reports: ',
    'Number of training reports after matching to images: ',
)

df_master_val = _attach_images(
    df_val,
    'Number of unique val reports: ',
    'Number of val reports after matching to images: ',
)

df_master_test = _attach_images(
    df_test,
    'Number of unique test reports: ',
    'Number of test reports after matching to images: ',
)

In [None]:
# Reload the previously saved split tables and pool them into one master frame.
# These files must already exist under local_file_path; the same file names are
# written again by the save cell at the end of the notebook.
previous_splits = (
    ('Loading train df...', 'df_master_train.csv'),
    ('Loading val df...', 'df_master_val.csv'),
    ('Loading test df...', 'df_master_test.csv'),
)
loaded_frames = []
for message, filename in previous_splits:
    print(message)
    frame = pd.read_csv(os.path.join(local_file_path, filename))
    print(len(frame))
    loaded_frames.append(frame)
old_df_train, old_df_val, old_df_test = loaded_frames

print('Concatenating dataframes...')
df_master = pd.concat([old_df_train, old_df_val, old_df_test])

print(df_master)
print(len(df_master))

In [None]:
def _restrict_to_split(split_frame):
    """Return the df_master rows whose (study_id, dicom_id) occur in split_frame."""
    split_keys = split_frame[['study_id', 'dicom_id']]
    return df_master.merge(split_keys, how='inner', on=['study_id', 'dicom_id'])


# Re-partition the pooled master table according to the new split membership.
print('Merging train df...')
df_master_train = _restrict_to_split(df_master_train)
print(len(df_master_train))

df_master_val = _restrict_to_split(df_master_val)
print(len(df_master_val))

df_master_test = _restrict_to_split(df_master_test)
print(len(df_master_test))

In [None]:
# Write the three split tables to local_file_path without the index column.
split_outputs = (
    ('Saving train dataframe...', 'df_master_train.csv', df_master_train),
    ('Saving val dataframe...', 'df_master_val.csv', df_master_val),
    ('Saving test dataframe...', 'df_master_test.csv', df_master_test),
)
for message, filename, frame in split_outputs:
    print(message)
    datasetPath = os.path.join(local_file_path, filename)
    frame.to_csv(datasetPath, index=False)

print('Finished saving dataframes')