# DICOM Standard Part 6
Explore [DICOM_harvest_xml.ipynb](https://github.com/paulnagy/DICOM2OMOP/blob/main/DICOM_harvest_xml.ipynb) from DICOM2OMOP, December 13, 2023
- DICOM Part 6: Data Dictionary
- Chapter 6. Registry of DICOM Data Elements
- [xml.etree.elementtree](https://docs.python.org/ko/3/library/xml.etree.elementtree.html)
- [requests](https://light-tree.tistory.com/6)

In [3]:
# Import modules
import requests # HTTP를 사용하기 위함
import xml.etree.ElementTree as ET # python 2.5x 버전부터 내부 라이브러리로 포함되어 따로 설치하지 않아도 됨

In [4]:
# URI for DICOM Part 6: Data Dictionary (served over HTTPS)
xml_uri = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part06/part06.xml'

# Download the DocBook source with an HTTP GET. The timeout keeps the cell from
# hanging forever on a stalled connection, and raise_for_status() stops here on
# a 4xx/5xx answer instead of handing an error page to the XML parser.
response = requests.get(xml_uri, timeout=60)
response.raise_for_status()

# Parse the downloaded bytes; fromstring() returns the root Element of the tree
root = ET.fromstring(response.content)

#### pause
- `root = ET.fromstring()`
- `ET.fromstring()`은 문자열에서 `Element`로 직접 XML 구문을 분석하고, 구문 분석된 tree의 `root` (`Element`)를 반환함

In [8]:
# (tag) a string identifying what kind of data this element represents
root.tag

'{http://docbook.org/ns/docbook}book'

In [9]:
# (attrib) a dictionary containing the element's attributes
root.attrib

{'label': 'PS3.6',
 'version': '5.0',
 '{http://www.w3.org/XML/1998/namespace}id': 'PS3.6'}

iterate 할 수 있는 child node 확인
- child는 중첩되며, index로 특정 child node에 액세스 가능

In [17]:
for child in root:
    print(child.tag, child.attrib)

{http://docbook.org/ns/docbook}title {}
{http://docbook.org/ns/docbook}subtitle {}
{http://docbook.org/ns/docbook}info {}
{http://docbook.org/ns/docbook}chapter {'label': '', 'status': '1', '{http://www.w3.org/XML/1998/namespace}id': 'chapter_Notice'}
{http://docbook.org/ns/docbook}chapter {'label': '', 'status': '1', '{http://www.w3.org/XML/1998/namespace}id': 'chapter_Foreword'}
{http://docbook.org/ns/docbook}chapter {'label': '1', 'status': '1', '{http://www.w3.org/XML/1998/namespace}id': 'chapter_1'}
{http://docbook.org/ns/docbook}chapter {'label': '2', 'status': '1', '{http://www.w3.org/XML/1998/namespace}id': 'chapter_2'}
{http://docbook.org/ns/docbook}chapter {'label': '3', 'status': '1', '{http://www.w3.org/XML/1998/namespace}id': 'chapter_3'}
{http://docbook.org/ns/docbook}chapter {'label': '4', 'status': '1', '{http://www.w3.org/XML/1998/namespace}id': 'chapter_4'}
{http://docbook.org/ns/docbook}chapter {'label': '5', 'status': '1', '{http://www.w3.org/XML/1998/namespace}id':

#### continue

In [5]:
# Find the child node with label="6" for 'chapter 6. Registry of DICOM Data Elements'.
# next(..., None) binds selected_node to None when no chapter matches, so the
# name always exists and can be tested with "is not None" before it is used.
selected_node = next(
    (child for child in root if child.attrib.get('label') == '6'),
    None,
)

---

In [40]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re

### Get table 1 from Chapter 6
- Table 6-1. Registry of DICOM Data Elements
- [참고: HTML/ table, thead, tbody, tfoot, th, tr, td/ 표만들기](https://www.codingfactory.net/10232)

In [61]:
# find() is used instead of findall() because only the first table (Table 6-1) is wanted
grandchild_table = selected_node.find('.//{http://docbook.org/ns/docbook}table')

In [200]:
grandchild_table.attrib # Table 6-1. 

{'frame': 'box',
 'label': '6-1',
 'rules': 'all',
 '{http://www.w3.org/XML/1998/namespace}id': 'table_6-1'}

-  'xml.etree.ElementTree.Element' object 

In [92]:
elements = grandchild_table.findall('.//{http://docbook.org/ns/docbook}*')
elements[:3] #  'xml.etree.ElementTree.Element' object

[<Element '{http://docbook.org/ns/docbook}caption' at 0x7f2fc820fa10>,
 <Element '{http://docbook.org/ns/docbook}thead' at 0x7f2fc820fb00>,
 <Element '{http://docbook.org/ns/docbook}tr' at 0x7f2fc820fbf0>]

In [99]:
elements[0].tag

'{http://docbook.org/ns/docbook}caption'

In [100]:
# Print the tags of the elements at indices 0 through 10 (at most eleven lines)
for i, elem in zip(range(11), elements):
    print(elem.tag)

{http://docbook.org/ns/docbook}caption
{http://docbook.org/ns/docbook}thead
{http://docbook.org/ns/docbook}tr
{http://docbook.org/ns/docbook}th
{http://docbook.org/ns/docbook}para
{http://docbook.org/ns/docbook}emphasis
{http://docbook.org/ns/docbook}th
{http://docbook.org/ns/docbook}para
{http://docbook.org/ns/docbook}emphasis
{http://docbook.org/ns/docbook}th
{http://docbook.org/ns/docbook}para


In [106]:
# Strip the "{namespace-URL}" prefix from every tag and keep only the distinct local names
tag_set = {re.sub(r'\{.*?\}', '', element.tag) for element in elements}
# NOTE: a set is unordered, so the order of the names in tag_list is arbitrary
tag_list = list(tag_set)
print(tag_list)

['tr', 'caption', 'thead', 'xref', 'td', 'th', 'tbody', 'para', 'emphasis']


In [108]:
# Number the tag names by their position in tag_list: {0: first, 1: second, ...}
tag_dict = {position: tag for position, tag in enumerate(tag_list)}
tag_order = list(tag_dict)  # the positions themselves, 0 .. len(tag_list) - 1
tag_dict

{0: 'tr',
 1: 'caption',
 2: 'thead',
 3: 'xref',
 4: 'td',
 5: 'th',
 6: 'tbody',
 7: 'para',
 8: 'emphasis'}

In [118]:
# XPath prefix that matches a descendant element in the DocBook namespace
tag_url = './/{http://docbook.org/ns/docbook}'
# Top-level parts of Table 6-1; find() returns the first match (read its text via .text)
title = grandchild_table.find(tag_url + 'caption') # caption element ('Registry of DICOM Data Elements')
thead = grandchild_table.find(tag_url + 'thead') # groups the header content in the table
tbody = grandchild_table.find(tag_url + 'tbody') # groups the body content in the table

In [143]:
# Check all (findall) caption texts of the table.
# The tag name is written out instead of looked up as tag_dict[1]: tag_dict is
# numbered from a set, whose order is arbitrary, so index 1 is not guaranteed
# to be 'caption'.
for i in grandchild_table.findall(tag_url + 'caption'):
    print(i.text)

Registry of DICOM Data Elements


In [120]:
for tr in thead.findall(tag_url+'tr'):
    print(tr)
    thead_names = tr.findall(tag_url + 'emphasis')
    print(thead_names)

<Element '{http://docbook.org/ns/docbook}tr' at 0x7f2fc820fbf0>
[<Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc820fe70>, <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc8218130>, <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc82183b0>, <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc8218630>, <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc82188b0>]


In [124]:
for name in thead_names:
    print('element: ', name)
    print('text: ', name.text)
    print('\n')

element:  <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc820fe70>
text:  Tag


element:  <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc8218130>
text:  Name


element:  <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc82183b0>
text:  Keyword


element:  <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc8218630>
text:  VR


element:  <Element '{http://docbook.org/ns/docbook}emphasis' at 0x7f2fc82188b0>
text:  VM




- store column names

In [147]:
# Column names are the stripped texts of the header <emphasis> elements
column_names = [header.text.strip() for header in thead_names]
print(column_names)

['Tag', 'Name', 'Keyword', 'VR', 'VM']


- store row values

In [169]:
rows = tbody.findall(tag_url+'tr')
rows_data = []

In [174]:
para = rows[0].findall(tag_url+'para')
emphasis = para[0].find(tag_url+'emphasis')

In [176]:
emphasis.text

'(0008,0001)'

In [177]:
emphasis.text.strip()

'(0008,0001)'

In [183]:
# Get Table 6-1 (Registry of DICOM Data Elements) from Chapter 6 as the DataFrame `df`
docbook = './/{http://docbook.org/ns/docbook}'

# Resolve each level only when its parent exists, so a missing piece is
# reported below instead of raising AttributeError on None.
grandchild_table = selected_node.find(docbook + 'table') if selected_node is not None else None
thead = grandchild_table.find(docbook + 'thead') if grandchild_table is not None else None
tbody = grandchild_table.find(docbook + 'tbody') if grandchild_table is not None else None

if selected_node is None:
    print("Node with label='6' not found.")
elif grandchild_table is None:
    print("No table found in the node with label='6'.")
elif thead is None or tbody is None:
    print("The table has no thead or no tbody.")
else:
    # Store column names: the header labels are the <emphasis> texts inside thead
    column_names = [
        name.text.strip()
        for tr in thead.findall(docbook + 'tr')
        for name in tr.findall(docbook + 'emphasis')
        if name.text is not None
    ]

    # Store row values
    rows = tbody.findall(docbook + 'tr')
    rows_data = []

    # Loop through tbody to extract values
    for tr in rows:
        row_values = {}
        idx = 0
        for para in tr.findall(docbook + 'para'):
            # Stop once every column has a value; checking first also avoids
            # an IndexError when no column names were found at all.
            if idx >= len(column_names):
                break
            # Prefer the <emphasis> text when present, otherwise the text of
            # the <para> itself.
            emphasis = para.find(docbook + 'emphasis')
            if emphasis is not None and emphasis.text is not None:
                text = emphasis.text
            else:
                text = para.text
            if text is None:
                continue  # a paragraph without text does not consume a column
            row_values[column_names[idx]] = text.strip()
            idx += 1

        # Append to the table only if it has more than 3 values
        if len(row_values) > 3:
            rows_data.append(row_values)

    # Save the output as a DataFrame
    df = pd.DataFrame(rows_data, columns=column_names)

In [184]:
column_names

['Tag', 'Name', 'Keyword', 'VR', 'VM']

In [185]:
df.head(10)

Unnamed: 0,Tag,Name,Keyword,VR,VM
0,"(0008,0001)",Length to End,Length​To​End,UL,1
1,"(0008,0005)",Specific Character Set,Specific​Character​Set,CS,1-n
2,"(0008,0006)",Language Code Sequence,Language​Code​Sequence,SQ,1
3,"(0008,0008)",Image Type,Image​Type,CS,2-n
4,"(0008,0010)",Recognition Code,Recognition​Code,SH,1
5,"(0008,0012)",Instance Creation Date,Instance​Creation​Date,DA,1
6,"(0008,0013)",Instance Creation Time,Instance​Creation​Time,TM,1
7,"(0008,0014)",Instance Creator UID,Instance​Creator​UID,UI,1
8,"(0008,0015)",Instance Coercion DateTime,Instance​Coercion​Date​Time,DT,1
9,"(0008,0016)",SOP Class UID,SOP​Class​UID,UI,1


In [194]:
df.VR.unique()

array(['UL', 'CS', 'SQ', 'SH', 'DA', 'TM', 'UI', 'DT', 'US', 'LO', 'AE',
       'ST', 'PN', 'LT', 'UR', 'UC', '', 'UT', 'UV', 'OB', 'IS', 'FD',
       'DS', 'FL', 'AS', 'AT', 'OB or OW', 'OF', 'SL', 'SS', 'US or SS',
       'US or SS or OW', 'OW', 'US or OW', 'OD', 'OL', 'UN', 'OV', 'SV',
       'See Note'], dtype=object)

In [195]:
df.VR.value_counts()

VR
SQ                1265
CS                 799
DS                 483
US                 379
LO                 355
FD                 309
FL                 292
IS                 239
SH                 130
ST                 101
UI                  91
UL                  79
DT                  62
DA                  61
LT                  60
TM                  55
UT                  44
PN                  32
OB                  30
US or SS            25
OW                  24
AT                  24
SS                  18
OF                  15
SL                  15
UR                  13
UC                  13
OB or OW            11
AE                  11
OL                   7
UV                   6
OD                   6
OV                   3
See Note             3
AS                   2
US or SS or OW       1
                     1
US or OW             1
UN                   1
SV                   1
Name: count, dtype: int64

In [197]:
# Count the data elements for a selected subset of VRs only
identified_VR = ['AT', 'CS', 'DA', 'DT', 'DS', 'FL', 'FD', 'IS', 'SL', 'SS', 'SV', 'TM', 'UL', 'US', 'UV']
df[df['VR'].isin(identified_VR)].VR.value_counts() #CS is values; others are numbers 

VR
CS    799
DS    483
US    379
FD    309
FL    292
IS    239
UL     79
DT     62
DA     61
TM     55
AT     24
SS     18
SL     15
UV      6
SV      1
Name: count, dtype: int64

### (+ JKL, 12/13/2023) DICOM VR의 name, definition 가져오기
Table 6.2-1. DICOM Value Representations 가져오면 됨
- PS3.5 Data Structures and Encoding 의
- (chapter) 6. Value Encoding 에서
- 6.2 Value Representation (VR)의 Table

In [224]:
# URI for DICOM Standard Part 5
xml05_uri = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part05/part05.xml'

# Download the DocBook source. The timeout avoids hanging on a stalled
# connection, and raise_for_status() fails here on a 4xx/5xx answer instead of
# passing an error page to the XML parser.
response05 = requests.get(xml05_uri, timeout=60)
response05.raise_for_status()

# Parse the XML content; fromstring() returns the root Element
root05 = ET.fromstring(response05.content)

In [226]:
# for chapter 6. Value Encoding
# next(..., None) binds VR_node to None when no child has label="6", instead of
# leaving the name undefined.
VR_node = next(
    (child for child in root05 if child.attrib.get('label') == '6'),
    None,
)

In [227]:
VR_node.attrib

{'label': '6',
 'status': '1',
 '{http://www.w3.org/XML/1998/namespace}id': 'chapter_6'}

In [229]:
VR_node.findall('.//{http://docbook.org/ns/docbook}table')

[<Element '{http://docbook.org/ns/docbook}table' at 0x7f2f6c5d62f0>,
 <Element '{http://docbook.org/ns/docbook}table' at 0x7f2f6c5dca40>]

In [239]:
# Print the position and caption of every table in the chapter, to see which
# index holds the wanted table. A table without a <caption> prints None
# instead of raising AttributeError.
for i, table in enumerate(VR_node.findall('.//{http://docbook.org/ns/docbook}table')):
    title = table.find(tag_url + 'caption')
    print(i)
    print(title.text if title is not None else None)

0
DICOM Control Characters and Their Encoding
1
DICOM Value Representations


In [240]:
VR_table = VR_node.findall('.//{http://docbook.org/ns/docbook}table')[1]

In [None]:
thead = VR_table.find('.//{http://docbook.org/ns/docbook}thead')
tbody = VR_table.find('.//{http://docbook.org/ns/docbook}tbody')

In [280]:
# Build df_vr (columns 'VR' and 'Name') from the thead/tbody of the VR table
docbook = './/{http://docbook.org/ns/docbook}'

column_names = []
rows_data = []

if thead is None or tbody is None:
    print("The VR table has no thead or no tbody.")
else:
    # Store column names. The single 'VR Name' header is split into the two
    # columns 'VR' and 'Name'; all other headers are ignored.
    for tr in thead.findall(docbook + 'tr'):
        for name in tr.findall(docbook + 'emphasis'):
            # (name.text or '') guards against an <emphasis> without text
            if (name.text or '').strip() == 'VR Name':
                column_names.extend(['VR', 'Name'])

    if not column_names:
        print("Header 'VR Name' not found; no values were extracted.")

    # Store row values
    rows = tbody.findall(docbook + 'tr')

    # Loop through tbody to extract values
    for tr in rows:
        row_values = {}
        idx = 0
        for para in tr.findall(docbook + 'para'):
            # Stop once every column has a value; checking first also avoids
            # an IndexError when column_names is empty.
            if idx >= len(column_names):
                break
            # Prefer the <emphasis> text when present, otherwise the text of
            # the <para> itself.
            emphasis = para.find(docbook + 'emphasis')
            if emphasis is not None and emphasis.text is not None:
                text = emphasis.text
            else:
                text = para.text
            if text is None:
                continue  # a paragraph without text does not consume a column
            row_values[column_names[idx]] = text.strip()
            idx += 1

        rows_data.append(row_values)

    # Save the output as a DataFrame
    df_vr = pd.DataFrame(rows_data, columns=column_names)


In [281]:
df_vr.head(3)

Unnamed: 0,VR,Name
0,AE,Application Entity
1,AS,Age String
2,AT,Attribute Tag


In [282]:
df_vr

Unnamed: 0,VR,Name
0,AE,Application Entity
1,AS,Age String
2,AT,Attribute Tag
3,CS,Code String
4,DA,Date
5,DS,Decimal String
6,DT,Date Time
7,FL,Floating Point Single
8,FD,Floating Point Double
9,IS,Integer String


### merge
- PS3.6 6-1. Registry of DICOM Data Elements table과
- PS3.5 6.2에 명시된 VR의 name을 합쳐보자
- 왜냐면 나는 VR 약자가 뭔지 잘 안와닿음

In [285]:
print(len(df), len(df_vr))

5067 34


In [283]:
df.head(3)

Unnamed: 0,Tag,Name,Keyword,VR,VM
0,"(0008,0001)",Length to End,Length​To​End,UL,1
1,"(0008,0005)",Specific Character Set,Specific​Character​Set,CS,1-n
2,"(0008,0006)",Language Code Sequence,Language​Code​Sequence,SQ,1


In [284]:
df_vr.head(3)

Unnamed: 0,VR,Name
0,AE,Application Entity
1,AS,Age String
2,AT,Attribute Tag


In [287]:
df_vr.rename(columns = {'Name' : 'VR Name'}, inplace = True)

In [288]:
# Left-merge the VR names onto the data-element registry, keeping every row of df.
# validate='many_to_one' makes pandas raise MergeError if a VR code occurs more
# than once in df_vr, which would otherwise silently duplicate registry rows.
df_merged = pd.merge(df, df_vr, how='left', on='VR', validate='many_to_one')
df_merged.head(10)

Unnamed: 0,Tag,Name,Keyword,VR,VM,VR Name
0,"(0008,0001)",Length to End,Length​To​End,UL,1,Unsigned Long
1,"(0008,0005)",Specific Character Set,Specific​Character​Set,CS,1-n,Code String
2,"(0008,0006)",Language Code Sequence,Language​Code​Sequence,SQ,1,Sequence of Items
3,"(0008,0008)",Image Type,Image​Type,CS,2-n,Code String
4,"(0008,0010)",Recognition Code,Recognition​Code,SH,1,Short String
5,"(0008,0012)",Instance Creation Date,Instance​Creation​Date,DA,1,Date
6,"(0008,0013)",Instance Creation Time,Instance​Creation​Time,TM,1,Time
7,"(0008,0014)",Instance Creator UID,Instance​Creator​UID,UI,1,Unique Identifier (UID)
8,"(0008,0015)",Instance Coercion DateTime,Instance​Coercion​Date​Time,DT,1,Date Time
9,"(0008,0016)",SOP Class UID,SOP​Class​UID,UI,1,Unique Identifier (UID)


In [294]:
# Convert every column to str. Missing values (e.g. a VR with no match in
# df_vr) are replaced by '' first; astype(str) alone would turn NaN into the
# literal text 'nan'.
df_merged = df_merged.fillna('').astype(str)

In [296]:
df_merged

Unnamed: 0,Tag,Name,Keyword,VR,VM,VR Name
0,"(0008,0001)",Length to End,Length​To​End,UL,1,Unsigned Long
1,"(0008,0005)",Specific Character Set,Specific​Character​Set,CS,1-n,Code String
2,"(0008,0006)",Language Code Sequence,Language​Code​Sequence,SQ,1,Sequence of Items
3,"(0008,0008)",Image Type,Image​Type,CS,2-n,Code String
4,"(0008,0010)",Recognition Code,Recognition​Code,SH,1,Short String
...,...,...,...,...,...,...
5062,"(FFFA,FFFA)",Digital Signatures Sequence,Digital​Signatures​Sequence,SQ,1,Sequence of Items
5063,"(FFFC,FFFC)",Data Set Trailing Padding,Data​Set​Trailing​Padding,OB,1,Other Byte
5064,"(FFFE,E000)",Item,Item,See Note,1,
5065,"(FFFE,E00D)",Item Delimitation Item,Item​Delimitation​Item,See Note,1,


In [299]:
df_merged['Tag'].astype(str)

0       (0008,0001)
1       (0008,0005)
2       (0008,0006)
3       (0008,0008)
4       (0008,0010)
           ...     
5062    (FFFA,FFFA)
5063    (FFFC,FFFC)
5064    (FFFE,E000)
5065    (FFFE,E00D)
5066    (FFFE,E0DD)
Name: Tag, Length: 5067, dtype: object

In [290]:
import os
my_path = os.getcwd()
print(my_path)

/home/jklee320/workspace


In [303]:
# save (csv로 뽑아서 인쇄해두자 ,, DICOM 용어들 아직 안익숙해서) 
# df_merged.to_csv(my_path + '/DataElementWithVRname_231213.csv', index=False, encoding="utf-8-sig")

# Anatomic Modifier
### DICOM Standard Part 16: Content Mapping Resource
Table CID 2. Anatomic Modifier
- [Part 16 - pdf](https://dicom.nema.org/medical/dicom/current/output/pdf/part16.pdf)
- 원본은 xml인데 용량이 커서, pdf로 보자

In [304]:
#import requests
#import xml.etree.ElementTree as ET
#import pandas as pd

_DOCBOOK_NS = '{http://docbook.org/ns/docbook}'


def _element_text(element):
    """Return all text inside *element* as one string with single spaces between pieces."""
    pieces = (piece.strip() for piece in element.itertext())
    return ' '.join(piece for piece in pieces if piece)


def _table_to_dataframe(table_element):
    """Convert one DocBook <table> element into a DataFrame (header row -> columns)."""
    descendant = './/' + _DOCBOOK_NS
    thead = table_element.find(descendant + 'thead')
    tbody = table_element.find(descendant + 'tbody')

    # Map the position of each header cell to its name. Only the first header
    # row is used. Header cells without any text get no column, but they keep
    # their position, so the cells of later columns are not shifted.
    header_by_position = {}
    if thead is not None:
        header_row = thead.find(descendant + 'tr')
        if header_row is not None:
            for position, header_cell in enumerate(header_row):
                header_name = _element_text(header_cell)
                if header_name:
                    header_by_position[position] = header_name
    column_names = list(header_by_position.values())

    # One dict per body row; cells without text are left out and show up as
    # missing values in the DataFrame.
    rows_data = []
    if tbody is not None:
        for tr in tbody.findall(descendant + 'tr'):
            row_values = {}
            for position, td in enumerate(tr.findall(descendant + 'td')):
                header_name = header_by_position.get(position)
                if header_name is None:
                    continue
                # itertext() also collects text from nested elements such as links
                value = _element_text(td)
                if value:
                    row_values[header_name] = value
            rows_data.append(row_values)

    return pd.DataFrame(rows_data, columns=column_names)


def extract_DICOM_Part_16_table_data(xml_url, label, *, xml_content=None, timeout=60):
    """Extract every table of one top-level section of a DICOM DocBook XML file.

    Args:
        xml_url: URL of the DocBook XML source. It is downloaded only when
            ``xml_content`` is not given.
        label: Value of the ``label`` attribute of the wanted child of the
            document root (for example ``'B'``).
        xml_content: Optional XML document (bytes or str) that was already
            downloaded. When given, no HTTP request is made.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A dict mapping ``'df_1'``, ``'df_2'``, ... (tables in document order)
        to DataFrames, or ``None`` (after printing a message) when no child of
        the root carries ``label``.

    Raises:
        requests.HTTPError: If the download answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    if xml_content is None:
        response = requests.get(xml_url, timeout=timeout)
        # Fail here rather than feeding an HTML error page to the XML parser
        response.raise_for_status()
        xml_content = response.content
    root = ET.fromstring(xml_content)

    # Find the child node with the specified label
    selected_node = next(
        (child for child in root if child.attrib.get('label') == label),
        None,
    )
    if selected_node is None:
        print(f"Node with label='{label}' not found.")
        return None

    table_elements = selected_node.findall('.//' + _DOCBOOK_NS + 'table')
    return {
        f'df_{index}': _table_to_dataframe(table_element)
        for index, table_element in enumerate(table_elements, start=1)
    }

In [305]:
# URI for DICOM Standard Part 16
xml_uri = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part16/part16.xml'

# Extract tables with label 'B'
result_dict = extract_DICOM_Part_16_table_data(xml_uri, 'B')

In [306]:
result_dict['df_1'].head(5)

Unnamed: 0,Coding Scheme Designator,Code Value,Code Meaning,SNOMED-RT ID,UMLS Concept Unique ID
0,SCT,24028007,Right,G-A100,C0205090
1,SCT,7771000,Left,G-A101,C0205091
2,SCT,51440002,Bilateral,G-A102,C0238767
3,SCT,66459002,Unilateral,G-A103,C0205092
4,SCT,49370004,Lateral,G-A104,C0205093
