In [None]:
import os
import pandas as pd
import numpy as np
import pydicom
import openpyxl
import matplotlib.pyplot as plt

In [None]:
print(os.getcwd())

In [None]:
# .xlsx 명단 파일
sdp_dir = '/home/yuhsuser/sdp_imported/CaseControl_List_241115/'
xlsx_fname_ls = os.listdir(sdp_dir)
print(xlsx_fname_ls)

In [None]:
# .dcm 폴더
home_dir = '/home/yuhsuser/'
dcm_folder_ls = []
for dcm_folder in os.listdir(home_dir):
    if dcm_folder.startswith('전규리(2024100305)'):
        dcm_folder_ls.append(dcm_folder)
        print(dcm_folder)

# dicom 데이터에 대한 메타 테이블 만들기
- 데이터셋 별로 (전규리(2024100305)_ + 환자군_치료시작60일 _ 환자군_치료종료180일 + 대조군) --- dcm_dataset: 총 3종
- 폴더명 (e.g. 1_11111_19000101_CR): 순번(e.g. 1), 연구등록번호(e.g. 11111), 영상검사연월(e.g. 1900-01)
- 안에 포함된 파일

In [None]:
# Root directory that holds the exported DICOM datasets, and the row accumulator.
home_dir = '/home/yuhsuser/'
meta_data = []

# Walk every dataset folder (name starts with '전규리(2024100305)') and record one row
# per .dcm file. Listings are sorted because os.listdir / os.walk give no ordering
# guarantee, and later cells pick example rows by position (e.g. .iloc[0, 3]).
for dcm_dataset in sorted(os.listdir(home_dir)):
    if not dcm_dataset.startswith('전규리(2024100305)'):  # e.g. ~환자군_치료시작60일
        continue
    for root, dirs, files in os.walk(os.path.join(home_dir, dcm_dataset)):
        dirs.sort()  # sorting in place fixes the order in which os.walk descends
        for file in sorted(files):
            if not file.lower().endswith('.dcm'):
                continue
            meta_data.append({
                "dataset": dcm_dataset,
                "folder": os.path.basename(root),  # immediate parent folder of the file
                "dcm_fname": file,
                "dcm_fpath": os.path.join(root, file)})

# Columns are given explicitly so that an empty scan still produces the expected schema.
meta_table = pd.DataFrame(meta_data, columns=["dataset", "folder", "dcm_fname", "dcm_fpath"])
print(meta_table.shape)
meta_table.head()

In [None]:
# 잘 만들어졌는 지 확인해보기:치료시작 60일(3544 files), 대조군(3000files), 치료종료 180일(2195 files)
meta_table['dataset'].value_counts()

In [None]:
print(meta_table.groupby('dataset')['folder'].nunique()) # folder-level
print(meta_table.groupby('dataset')['dcm_fname'].nunique()) # file-level

In [None]:
expected_totals = 3544+3000+2195
real_totals = meta_table.shape[0] # 564개는 1개씩 더 많고, 12개는 2개씩 더 많은 상황
difference = expected_totals - real_totals
print(expected_totals)
print(real_totals)
print(difference)

## Issue #1
- 하나의 폴더명 안에 여러 개의 DICOM 파일이 포함된 경우 확인(2 dcms: 564개, 3 dcms: 12개)

In [None]:
# Some folders apparently contain two or more .dcm files: count how many folders hold 2, 3, ... files
is_repeated_folder = meta_table['folder'].duplicated()
dup_folder_ls = meta_table.loc[is_repeated_folder, 'folder'].unique()
rows_in_dup_folders = meta_table.loc[meta_table['folder'].isin(dup_folder_ls)]
dup_folder_df = (
    rows_in_dup_folders
    .groupby('folder')['dcm_fname']
    .agg(['count'])
)
dup_folder_df['count'].value_counts()

In [None]:
print(564*(2-1) + 12*(3-1)) # 564개는 1개씩 더 많고, 12개는 2개씩 더 많은 상황

In [None]:
def check_dcm(fpath):
    """Print four identifying tags of a DICOM file and show its image.

    Parameters:
    - fpath: str, path to the DICOM file.

    Prints StudyInstanceUID, SeriesInstanceUID, AccessionNumber and ViewPosition
    (one per line), then displays the pixel data in grayscale.
    """
    dataset = pydicom.dcmread(fpath)
    for keyword in ('StudyInstanceUID', 'SeriesInstanceUID', 'AccessionNumber', 'ViewPosition'):
        print(getattr(dataset, keyword))

    pixels = dataset.pixel_array
    plt.figure(figsize=(4,4))
    plt.imshow(pixels, cmap='gray')
    plt.title(f"DICOM Image: {os.path.basename(fpath)}")
    plt.axis('off')
    plt.show()

### Example 1: 2개인 경우

In [None]:
df_a = meta_table[meta_table['folder']=='a']
ls_a = df_a['dcm_fpath'].unique()
# 한 번 까보자
for fpath in ls_a:
    check_dcm(fpath)

### Example 2: 3개인 경우

In [None]:
df_b = meta_table[meta_table['folder']=='b']
ls_b = df_b['dcm_fpath'].unique()
# 한 번 까보자
for fpath in ls_b:
    check_dcm(fpath)

## Solution #1: ViewPosition도 컬럼으로 추가한다
- 추가: dcm.ViewPosition
- +) SPLIT folder(name) INTO 's순번', 's연구등록번호', 's시행연월', 's모달'

In [None]:
# Split the folder name on '_' into its four parts (sequence no., study registration no., exam date, modality)
folder_parts = meta_table['folder'].str.split('_', expand=True)
meta_table[['s순번', 's연구등록번호', 's시행연월', 's모달']] = folder_parts

# Placeholder; overwritten by the apply() call at the end of this cell
meta_table['ViewPosition'] = None

def extract_view_position(row):
    """Return the ViewPosition tag of the DICOM file at row['dcm_fpath'].

    Only the header is read (pixel data is skipped). If reading the file or
    accessing the tag raises, the error is printed and None is returned.
    """
    fpath = row['dcm_fpath']
    try:
        return pydicom.dcmread(fpath, stop_before_pixels=True).ViewPosition
    except Exception as e:
        print(f"Error reading DICOM files: {fpath}, error: {e}")
        return None

# Row-wise extraction over the whole table
meta_table['ViewPosition'] = meta_table.apply(extract_view_position, axis=1)

In [None]:
print(meta_table.shape)
meta_table.head(2)

In [None]:
meta_table.nunique()

In [None]:
meta_table.isnull().sum()

In [None]:
# View 
meta_table['ViewPosition'].value_counts()

In [None]:
# Modality
meta_table['s모달'].value_counts()

In [None]:
# Save
print(home_dir)
#meta_table.to_csv(os.path.join(home_dir, 'workspace/Files/Meta_Table_241116.csv'), index=False)

# dicom file 매칭해보기
- 일단 매칭키가 필요함! 전체 dicom file을 포함하는 meta_table에 "dcm_id" 인덱스를 부여하자

In [None]:
meta_table.head(2)

## dcm_id 부여하기

In [None]:
# uniqueness of each dcm fname 확인 (각 행에 index를 줄 만 한지)
meta_table[meta_table['dcm_fname'].duplicated()] # dcm fname에 중복이 있다는 것 확인 -> Issue 3으로 보류

## Issue #2

In [None]:
# Folders whose files cannot be told apart by ViewPosition (empty, or repeated within the folder): 301 rows
repeats_view_in_folder = meta_table.duplicated(subset=['folder', 'ViewPosition'])
dup_folder_ls = meta_table.loc[repeats_view_in_folder, 'folder'].unique()
dup_folder_df = meta_table.loc[meta_table['folder'].isin(dup_folder_ls)]
print(dup_folder_df.shape)

In [None]:
# 얘네는 그래도 dcm_fname이 다다르긴 함
dup_folder_df[dup_folder_df['dcm_fname'].duplicated()]

### Details
- ViewPosition이 아예 없어서 구별이 안되는 것과,
- 있어도 folder 내에서 구별 안되는 것 (하나의 폴더 안에 ViewPosition 마저 같은 경우)
- 을 구분해야 함! (참고로 여기서 '구별'은, 직접 dcm 파일을 까보지 않고도 Meta-table에서 식별 가능한 정도를 말함: 목표는 complete Meta_table)

In [None]:
dup_folder_df['ViewPosition'].value_counts()

In [None]:
# 1: ViewPosition이 아예 없는 것 (267개) -- 이걸로 뒤에 해결
dup_folder_no_vp = dup_folder_df[dup_folder_df['ViewPosition']=='']
print(dup_folder_no_vp.shape)

In [None]:
# 2: ViewPosition이 한 폴더 안에 중복되는 것 -- 얘는 완전 다른 attribute이 필요함
dup_folder_dup_vp = dup_folder_df[dup_folder_df['ViewPosition']!='']
print(dup_folder_dup_vp.shape)

## Issue # 2-1 (Step 1)

### Example

In [None]:
def check_dcm_full(fpath):
    """Print identifying tags of a DICOM file, show its image, and return the dataset.

    Parameters:
    - fpath: str, path to the DICOM file.

    Returns:
    - The dataset object returned by pydicom.dcmread, so the caller can inspect further tags.
    """
    dataset = pydicom.dcmread(fpath)
    for keyword in ('StudyInstanceUID', 'SeriesInstanceUID', 'AccessionNumber', 'ViewPosition'):
        print(getattr(dataset, keyword))

    pixels = dataset.pixel_array
    plt.figure(figsize=(4,4))
    plt.imshow(pixels, cmap='gray')
    plt.title(f"DICOM Image: {os.path.basename(fpath)}")
    plt.axis('off')
    plt.show()
    return dataset

In [None]:
print(dup_folder_no_vp.iloc[0, 3])
print(dup_folder_no_vp.iloc[1, 3])

In [None]:
dcm_c_0 = check_dcm_full(dup_folder_no_vp.iloc[0, 3])
dcm_c_1 = check_dcm_full(dup_folder_no_vp.iloc[1, 3])

In [None]:
## 다 펼쳐보고, 차이 비교: (0008,103E) Series Description [LO], (0020,0060) Laterality [CS], (0054,0020) View Code Sequence [SQ] 
## 이 중에서 (0054,0020) View Code Sequence가 제일 정확함
# print(dcm_c_0) 
# print(dcm_c_1)

## Solution #2-1 (Step 1): View Code Sequence에서 item 꺼내기
- Sequence는 리스트 형태

In [None]:
# Trial 1 (1T): try extracting (0054,0020) View Code Sequence -- an SQ element has not been extracted so far
def check_ViewCodeSequence(fpath):
    """Print the ViewCodeSequence element of the DICOM file at `fpath`."""
    view_code_seq = pydicom.dcmread(fpath).ViewCodeSequence
    print(view_code_seq)

check_ViewCodeSequence(dup_folder_no_vp.iloc[0, 3])

In [None]:
## Trial 2 (2T): read CodeValue directly from the sequence (without indexing an item)
def check_ViewCodeSequence(fpath):
    """Print CodeValue accessed directly on the ViewCodeSequence of the file at `fpath`."""
    dcm = pydicom.dcmread(fpath)
    print(dcm.ViewCodeSequence.CodeValue)

# A pydicom Sequence generally behaves like a list, so its items have to be reached by
# index; the direct attribute access above is expected to fail. The error is caught and
# shown so that a top-to-bottom run of the notebook is not aborted by this demonstration.
try:
    check_ViewCodeSequence(dup_folder_no_vp.iloc[0, 3])
except AttributeError as err:
    print(f"AttributeError: {err}")

In [None]:
## Trial 3 (3T): index the first item of the sequence
def extract_ViewCodeSeq(fpath):
    """Print the CodeValue of the first ViewCodeSequence item of a DICOM file.

    Prints an explanatory message instead when the sequence is missing/empty or the
    first item carries no CodeValue. Returns None.
    """
    dcm = pydicom.dcmread(fpath)

    # Guard: the sequence must exist and contain at least one item
    if not (hasattr(dcm, 'ViewCodeSequence') and dcm.ViewCodeSequence):
        print("ViewCodeSequence is missing or empty.")
        return

    first_item = dcm.ViewCodeSequence[0]
    if not hasattr(first_item, 'CodeValue'):
        print("CodeValue attribute is missing in the sequence item.")
        return

    print(f"CodeValue: {first_item.CodeValue}")

extract_ViewCodeSeq(dup_folder_no_vp.iloc[0, 3])

In [None]:
## Trial 3, variant 2 (3T_2): also print CodeMeaning
def extract_ViewCodeSeq_2(fpath):
    """Print CodeValue and CodeMeaning of the first ViewCodeSequence item of a DICOM file.

    Each attribute that is present is printed. If either one is absent, a notice is
    printed as well (previously the notice was tied to CodeMeaning only, so a missing
    CodeValue went unreported). A missing or empty sequence prints its own message.
    Returns None.
    """
    dcm = pydicom.dcmread(fpath)
    # Check if the sequence exists and is not empty
    if hasattr(dcm, 'ViewCodeSequence') and dcm.ViewCodeSequence:
        # Access the first item in the sequence
        first_item = dcm.ViewCodeSequence[0]
        has_value = hasattr(first_item, 'CodeValue')
        has_meaning = hasattr(first_item, 'CodeMeaning')
        if has_value:
            print(f"CodeValue: {first_item.CodeValue}")
        if has_meaning:
            print(f"CodeMeaning: {first_item.CodeMeaning}")
        if not (has_value and has_meaning):
            print("Either Code attribution missing in the sequence item.")
    else:
        print("ViewCodeSequence is missing or empty.")

#extract_ViewCodeSeq_2(dup_folder_no_vp.iloc[0, 3])

In [None]:
# 3T-extension: make a dataframe with repetition

def extract_viewcode_sequence(fpath):
    """
    Extract CodeValue, CodeSchemeDesignator, and CodeMeaning from ViewCodeSequence in a DICOM file.

    Only the header is read (stop_before_pixels=True), as the other extractors in this
    notebook do; the pixel data is not needed to read this sequence.

    Parameters:
    - fpath: str, path to the DICOM file.

    Returns:
    - A list of dictionaries (keys: fpath, CodeValue, CodeSchemeDesignator, CodeMeaning),
      one per sequence item. A missing or empty sequence yields a single row filled with
      "N/A"; any exception while reading yields a single row filled with "Error".
    """
    try:
        dcm = pydicom.dcmread(fpath, stop_before_pixels=True)
        if hasattr(dcm, 'ViewCodeSequence') and dcm.ViewCodeSequence:
            # One row per item in the ViewCodeSequence; absent attributes become "N/A"
            return [
                {
                    "fpath": fpath,
                    "CodeValue": getattr(item, "CodeValue", "N/A"),
                    "CodeSchemeDesignator": getattr(item, "CodeSchemeDesignator", "N/A"),
                    "CodeMeaning": getattr(item, "CodeMeaning", "N/A")
                }
                for item in dcm.ViewCodeSequence
            ]
        # Placeholder row when ViewCodeSequence is missing or empty
        return [{"fpath": fpath, "CodeValue": "N/A", "CodeSchemeDesignator": "N/A", "CodeMeaning": "N/A"}]
    except Exception as e:
        # Best effort: report the problem and keep a row so the file stays visible in the result
        print(f"Error reading {fpath}: {e}")
        return [{"fpath": fpath, "CodeValue": "Error", "CodeSchemeDesignator": "Error", "CodeMeaning": "Error"}]

def create_viewcode_dataframe(fpaths):
    """
    Build a DataFrame of ViewCodeSequence data for several DICOM files.

    Parameters:
    - fpaths: iterable of str, paths to DICOM files.

    Returns:
    - A pandas DataFrame made from the rows that extract_viewcode_sequence returns for
      each path, concatenated in input order (a file contributes one row per sequence item).
    """
    rows = [row for fpath in fpaths for row in extract_viewcode_sequence(fpath)]
    return pd.DataFrame(rows)

In [None]:
# 일단 전체에 적용하기 전, [ViewPosition이 None이거나 있어도 folder 안에서 구별 안되는 것] 301 rows에 대해 시험삼아 적용해보기
# 여기서 확실히 구별될 수 있는지 보기! 즉, 이게 unique key로 사용될 수 있는지 보기!
dup_folder_no_vp_fpaths = dup_folder_no_vp['dcm_fpath']
dup_folder_no_vp_viewcode_df = create_viewcode_dataframe(dup_folder_no_vp_fpaths)
print(dup_folder_no_vp_viewcode_df.shape)

In [None]:
dup_folder_no_vp_viewcode_df.head(2)

In [None]:
# 여전히 Null인 것: CodeValue (116)
print(dup_folder_no_vp_viewcode_df['CodeValue'].value_counts())
print(dup_folder_no_vp_viewcode_df['CodeMeaning'].value_counts())

In [None]:
# 추가 확인: CodeValue - CodeMeaning은 얼마나 깔끔히 관리되고 있나 보기 위함
dup_folder_no_vp_viewcode_df.groupby('CodeValue')['CodeMeaning'].value_counts()

## Issue # 2-1 (Step 2)
- (0054,0020) ViewCodeSequence 에도 여전히 Null 존재
- 그래도 지금까지 정리해보면: (한 폴더 안에 정체를 알 수 없는 것 588 중 ViewPosition 없음 267)  > ((0018,5101) ViewPosition으로 정체 식별하고도 안된거) extra 116 > 이제 (0020,0060) Laterality [CS]를 봐보자

In [None]:
dup_folder_no_vp_viewcode_nan = dup_folder_no_vp_viewcode_df[dup_folder_no_vp_viewcode_df['CodeValue']=='N/A']
print(dup_folder_no_vp_viewcode_nan.shape)
dup_folder_no_vp_viewcode_nan.head(2)

### Example
- 아직도 해결안된 116개(ViewCodeSequence의 CodeValue가 'N/A'인 것)에 대해, (0020,0060) Laterality [CS]를 봐보자

In [None]:
# Add: (0020,0060) Laterality [CS]
def check_dcm2(fpath):
    """Print identifying tags, view information and Laterality of a DICOM file, then show its image.

    ViewPosition and Laterality are read with a default of 'N/A', so a file that lacks
    either tag is still displayed instead of raising AttributeError.

    Parameters:
    - fpath: str, path to the DICOM file.
    """
    dcm = pydicom.dcmread(fpath)
    print(f"Study UID: {dcm.StudyInstanceUID}")
    print(f"Series UID: {dcm.SeriesInstanceUID}")
    print(f"Accession No.: {dcm.AccessionNumber}")
    print(f"View Position: {getattr(dcm, 'ViewPosition', 'N/A')}")
    print('View Code Sequence')
    extract_ViewCodeSeq_2(fpath) # ViewCodeSequence - CodeValue / CodeMeaning
    print(f"Laterality: {getattr(dcm, 'Laterality', 'N/A')}") # (0020,0060) Laterality

    img = dcm.pixel_array
    plt.figure(figsize=(4,4))
    plt.imshow(img, cmap='gray')
    plt.title(f"DICOM Image: {os.path.basename(fpath)}")
    plt.axis('off')
    plt.show()

In [None]:
# test
check_dcm2(dup_folder_no_vp_viewcode_nan.iloc[0, 0]) # 가능할 듯? 

## Solution # 2-1 (Step 2): Laterality
- 아직도 해결안된 116개(ViewCodeSequence의 CodeValue가 'N/A'인 것)에 대해, (0020,0060) Laterality [CS]를 추가하자

In [None]:
dup_folder_no_vp_viewcode_nan.head(2)

In [None]:
## The frame was produced by boolean filtering; take an independent copy before adding a
## column so the assignment does not trigger pandas' SettingWithCopyWarning.
dup_folder_no_vp_viewcode_nan = dup_folder_no_vp_viewcode_nan.copy()

def extract_laterality(row):
    """Return the Laterality tag of the DICOM file at row['fpath'] (header only).

    Returns '' when reading the file or accessing the tag raises.
    """
    fpath = row['fpath']
    try:
        dcm = pydicom.dcmread(fpath, stop_before_pixels=True)
        return dcm.Laterality
    except Exception:
        # Best effort: an unreadable file or absent tag is recorded as empty
        return ''

## Fill the 'Laterality' column row by row
dup_folder_no_vp_viewcode_nan['Laterality'] = dup_folder_no_vp_viewcode_nan.apply(extract_laterality, axis=1)
print(dup_folder_no_vp_viewcode_nan.shape)

In [None]:
dup_folder_no_vp_viewcode_nan['Laterality'].value_counts() # 여전히 안되는거 48개 있음

## Issue #2-1 (Step 3)
- 267개 중에서 ViewPosition (Nan: 267), ViewCodeSequence (Nan: 116), Laterality (Nan: 48) 로도 안되는 애들

In [None]:
dup_folder_nvp_nvc_nl = dup_folder_no_vp_viewcode_nan[dup_folder_no_vp_viewcode_nan['Laterality']=='']
print(dup_folder_nvp_nvc_nl.shape)
dup_folder_nvp_nvc_nl.head(2)

### Example

In [None]:
def check_dcm_full(fpath):
    """Print identifying tags of a DICOM file, display its image, and return the dataset.

    NOTE(review): this repeats the definition of the same name given earlier in the
    notebook; one of the two can probably be dropped.

    Parameters:
    - fpath: str, path to the DICOM file.

    Returns:
    - The dataset object returned by pydicom.dcmread.
    """
    dicom_ds = pydicom.dcmread(fpath)
    print(dicom_ds.StudyInstanceUID)
    print(dicom_ds.SeriesInstanceUID)
    print(dicom_ds.AccessionNumber)
    print(dicom_ds.ViewPosition)

    image = dicom_ds.pixel_array
    fig, ax = plt.subplots(figsize=(4,4))
    ax.imshow(image, cmap='gray')
    ax.set_title(f"DICOM Image: {os.path.basename(fpath)}")
    ax.axis('off')
    plt.show()
    return dicom_ds

In [None]:
example = check_dcm_full(dup_folder_nvp_nvc_nl.iloc[0,0])

In [None]:
example2 = check_dcm_full(dup_folder_nvp_nvc_nl.iloc[1,0])

## Solution #2-1 (Step 3): Protocol Name으로 간다
- 아직도 해결 안된 48개에 대해, (0018,1030) Protocol Name [LO]를 추가하자

In [None]:
print(dup_folder_nvp_nvc_nl.shape)
dup_folder_nvp_nvc_nl.head(2)

In [None]:
## The frame was produced by boolean filtering; take an independent copy before adding the
## 'ProtocolName' column so the assignment does not trigger pandas' SettingWithCopyWarning.
dup_folder_nvp_nvc_nl = dup_folder_nvp_nvc_nl.copy()

def extract_protocolname(row):
    """Return the ProtocolName tag of the DICOM file at row['fpath'] (header only).

    Returns '' when reading the file or accessing the tag raises.
    """
    fpath = row['fpath']
    try:
        dcm = pydicom.dcmread(fpath, stop_before_pixels=True)
        return dcm.ProtocolName
    except Exception:
        # Best effort: an unreadable file or absent tag is recorded as empty
        return ''

## Fill the 'ProtocolName' column row by row
dup_folder_nvp_nvc_nl['ProtocolName'] = dup_folder_nvp_nvc_nl.apply(extract_protocolname, axis=1)
print(dup_folder_nvp_nvc_nl.shape)

In [None]:
dup_folder_nvp_nvc_nl['ProtocolName'].value_counts() # 전체 해결 완!

## Solution # 2-1 (Completeness) 
- ViewPosition/ ViewCodeSequence / Laterality / Protocol Name 중에 우선순위 처음부터 정해보기: ViewPosition > ViewCodeSequence > Protocol Name으로 결정

In [None]:
# Placeholder columns; all four are filled by the apply() call at the end of this cell
meta_table['ViewPosition'] = None
meta_table['ViewCodeSeq_Value'] = None
meta_table['Laterality'] = None
meta_table['ProtocolName'] = None

def extract_dicom_attributes(row):
    """Read four view-related attributes from the DICOM header at row['dcm_fpath'].

    Returns:
    - pd.Series with keys ViewPosition, ViewCodeSeq_Value, Laterality, ProtocolName.
      An absent attribute is reported as 'N/A'; ViewCodeSeq_Value is the CodeValue of
      the first ViewCodeSequence item. If anything raises while reading, all four
      values are 'Error'.
    """
    fpath = row['dcm_fpath']
    try:
        # Header only; pixel data is skipped
        dcm = pydicom.dcmread(fpath, stop_before_pixels=True)

        # CodeValue of the first ViewCodeSequence item, when the sequence is present and non-empty
        view_codes = getattr(dcm, 'ViewCodeSequence', None)
        first_code_value = getattr(view_codes[0], 'CodeValue', 'N/A') if view_codes else 'N/A'

        extracted = {
            'ViewPosition': getattr(dcm, 'ViewPosition', 'N/A'),
            'ViewCodeSeq_Value': first_code_value,
            'Laterality': getattr(dcm, 'Laterality', 'N/A'),
            'ProtocolName': getattr(dcm, 'ProtocolName', 'N/A'),
        }
    except Exception:
        extracted = {
            'ViewPosition': 'Error',
            'ViewCodeSeq_Value': 'Error',
            'Laterality': 'Error',
            'ProtocolName': 'Error',
        }
    return pd.Series(extracted)

# Row-wise extraction; the returned Series are expanded into the four columns
meta_table[['ViewPosition', 'ViewCodeSeq_Value', 'Laterality', 'ProtocolName']] = meta_table.apply(extract_dicom_attributes, axis=1)

In [None]:
print(meta_table.shape)
meta_table.head(2)

In [None]:
# Check the values in each columns related to 'View'
assist_cols = ['ViewPosition', 'ViewCodeSeq_Value', 'Laterality', 'ProtocolName']
for col in assist_cols:
    print(f"<{col}> --------------------------------")
    print(meta_table[col].value_counts())
    print("\n")

In [None]:
## Priority chosen: (0018,5101) ViewPosition [CS] > (0054,0020) ViewCodeSequence [SQ] > (0018,1030) Protocol Name [LO]

# Start 'View' from 'ViewPosition'
meta_table['View'] = meta_table['ViewPosition']

# 1st fallback: where 'View' is null, '' or 'N/A', take 'ViewCodeSeq_Value'
unresolved = meta_table['View'].isnull() | meta_table['View'].eq('') | meta_table['View'].eq('N/A')
meta_table['View'] = meta_table['View'].mask(unresolved, meta_table['ViewCodeSeq_Value'])

# 2nd fallback: where 'View' is still null, '' or 'N/A', take 'ProtocolName'
unresolved = meta_table['View'].isnull() | meta_table['View'].eq('') | meta_table['View'].eq('N/A')
meta_table['View'] = meta_table['View'].mask(unresolved, meta_table['ProtocolName'])

print(meta_table['View'].value_counts()) # still 459 ('') and 115 ('N/A') left .. Laterality is needed too

In [None]:
# Where 'View' is null, '' or 'N/A', fall back to 'Laterality'
view_now = meta_table['View']
still_unresolved = view_now.isnull() | view_now.eq('') | view_now.eq('N/A')
meta_table['View'] = view_now.mask(still_unresolved, meta_table['Laterality'])

print(meta_table['View'].value_counts()) # 115 rows with '' still remain

## Issue # 2-1 (Further)
- Laterality 까지 했는데도 아직 115개가 해결이 안됨; 이전에 ''가 제외되어 쌓였나본데

In [None]:
eg_fpath = meta_table[meta_table['View'] == ''].iloc[0,3]
print(eg_fpath)

In [None]:
example = check_dcm_full(eg_fpath)

In [None]:
example.AcquisitionDeviceProcessingDescription

## Solution # 2-1 (Further)
- (0018,1400) Acquisition Device Processing Description [LO] 넣어보자

In [None]:
def extract_AcqDevProcDesc(row):
    """Return (0018,1400) AcquisitionDeviceProcessingDescription of the file at row['dcm_fpath'].

    Only the header is read. Returns '' when reading the file or accessing the tag raises.
    """
    fpath = row['dcm_fpath']
    try:
        return pydicom.dcmread(fpath, stop_before_pixels=True).AcquisitionDeviceProcessingDescription
    except Exception:
        return ''

# Fill the 'AcqDevProcDesc' column row by row
meta_table['AcqDevProcDesc'] = meta_table.apply(extract_AcqDevProcDesc, axis=1)

In [None]:
meta_table['AcqDevProcDesc'].value_counts()

In [None]:
# Where 'View' is null, '' or 'N/A', fall back to 'AcqDevProcDesc'
current_view = meta_table['View']
resolved = ~(current_view.isnull() | current_view.eq('') | current_view.eq('N/A'))
meta_table['View'] = current_view.where(resolved, meta_table['AcqDevProcDesc'])

print(meta_table['View'].value_counts()) #

In [None]:
# 일단 Save!
print(home_dir)
#meta_table.to_csv(os.path.join(home_dir, 'workspace/Files/Meta_Table_241116_withRawView.csv'), index=False)

### Details: 아리까리한거 정리하기
- Clean: PA, AP, LAT (Left, Right 포함)
- 아리까리: R

In [None]:
# In the original notebook R_fpath_ls is only assigned in a later cell, so this cell fails
# on a fresh kernel; build it here from the rows whose View is 'R'.
R_fpath_ls = meta_table[meta_table['View']=='R']['dcm_fpath']
# NOTE(review): R_fpath_ls keeps meta_table's integer index, so [71] selects the row
# labelled 71 rather than the 72nd path -- confirm which one was intended.
example = pydicom.dcmread(R_fpath_ls[71])
example

In [None]:
# Add: ProtocolName (0018,1030), AcquisitionDeviceProcessingDescription (0018,1400)
def check_dcm3(fpath):
    """Print the view-related tags of a DICOM file and show its image.

    Always printed: path, Study/Series UID, Accession No., ViewPosition, the first
    ViewCodeSequence item (via extract_ViewCodeSeq_2) and SeriesDescription.
    ProtocolName, Laterality and AcquisitionDeviceProcessingDescription are printed only
    when present and non-empty. ViewPosition and SeriesDescription fall back to 'N/A'
    when the tag is absent instead of raising.

    Parameters:
    - fpath: str, path to the DICOM file.
    """
    dcm = pydicom.dcmread(fpath)
    print(fpath)
    print(f"Study UID: {dcm.StudyInstanceUID}")
    print(f"Series UID: {dcm.SeriesInstanceUID}")
    print(f"Accession No.: {dcm.AccessionNumber}")
    print(f"View Position: {getattr(dcm, 'ViewPosition', 'N/A')}")
    print('View Code Sequence')
    extract_ViewCodeSeq_2(fpath) # ViewCodeSequence - CodeValue / CodeMeaning

    # Optional tags: shown only when the attribute exists and is non-empty
    for keyword in ('ProtocolName', 'Laterality', 'AcquisitionDeviceProcessingDescription'):
        value = getattr(dcm, keyword, None)
        if value:
            print(f"{keyword}: {value}")
    print(f"SeriesDescription: {getattr(dcm, 'SeriesDescription', 'N/A')}")

    img = dcm.pixel_array
    plt.figure(figsize=(4,4))
    plt.imshow(img, cmap='gray')
    plt.title(f"DICOM Image: {os.path.basename(fpath)}")
    plt.axis('off')
    plt.show()

In [None]:
R_fpath_ls = meta_table[meta_table['View']=='R']['dcm_fpath']

for fpath in R_fpath_ls:
    check_dcm3(fpath)

In [None]:
# Keep a snapshot of the table as it stands now (View filled with Laterality as a fallback).
# The snapshot is taken from meta_table: meta_data is only the raw list of file rows
# collected at the start and carries none of the extracted columns.
meta_data_wlaterality = meta_table.copy()

## Solution # 2-1 (Furthermore)
- Series Description 추가
- (+) Laterality 빼고 순서 바꿔서 다시 해보자: ViewPosition[CS] > ViewCodeSequence[SQ] > ProtocolName[LO] > SeriesDescription[LO]> AcqDevProcDescr[LO]

In [None]:
def extract_SeriesDescription(row):
    """Return the SeriesDescription tag of the DICOM file at row['dcm_fpath'] (header only).

    Returns '' when reading the file or accessing the tag raises.
    """
    fpath = row['dcm_fpath']
    try:
        return pydicom.dcmread(fpath, stop_before_pixels=True).SeriesDescription
    except Exception:
        return ''

# Fill the 'SeriesDescription' column row by row
meta_table['SeriesDescription'] = meta_table.apply(extract_SeriesDescription, axis=1)

In [None]:
meta_table['SeriesDescription'].value_counts() # 확신을 얻엇다... 

In [None]:
## Rebuild 'View' with the priority: ViewPosition[CS] > ViewCodeSequence[SQ] > ProtocolName[LO] > SeriesDescription[LO] > AcqDevProcDescr[LO]

# Start from 'ViewPosition'
meta_table['View'] = meta_table['ViewPosition']

# Walk the fallbacks in priority order; each one only fills rows whose 'View' is
# still null, '' or 'N/A' at that point
for fallback_col in ['ViewCodeSeq_Value', 'ProtocolName', 'SeriesDescription', 'AcqDevProcDesc']:
    current_view = meta_table['View']
    unresolved = current_view.isnull() | (current_view=='') | (current_view=='N/A')
    meta_table.loc[unresolved, 'View'] = meta_table[fallback_col]

print(meta_table['View'].value_counts()) # much better; going with this

In [None]:
# Save
print(home_dir)
#meta_table.to_csv(os.path.join(home_dir, 'workspace/Files/Meta_Table_241116_withRawView_Completed.csv'), index=False)

### Details: Trimming Values (Goal: PA, AP, LT LAT, RT LAT)
- 참고: R-10214 (postero-anterior), R-10206 (antero-posterior), R-10236 (left lateral), R-10232 (right lateral)
- Keep going on Nov 17 2024

In [None]:
meta_table = pd.read_csv(os.path.join(home_dir, 'workspace/Files/Meta_Table_241116_withRawView_Completed.csv'))
print(meta_table.shape)
meta_table.head(2)

In [None]:
raw_view_ls = meta_table['View'].unique()

# Raw 'View' values grouped by a case-insensitive substring match, tested in the
# order 'ap' -> 'pa' -> 'lat'
pa_ls = []
ap_ls = []
lat_ls = []

for val in raw_view_ls:
    # The table was reloaded with read_csv, which turns empty cells (and by default
    # strings such as 'N/A') into NaN; a non-string value has no .lower(), so report
    # it instead of crashing.
    if not isinstance(val, str):
        print(f"Not classified: {val}")
        continue
    lowered = val.lower()
    if 'ap' in lowered:
        ap_ls.append(val)
    elif 'pa' in lowered:
        pa_ls.append(val)
    elif 'lat' in lowered:
        lat_ls.append(val)
    else:
        print(f"Not classified: {val}")
print(f"PA: {pa_ls}")
print(f"AP: {ap_ls}")
print(f"LATERAL: {lat_ls}")

In [None]:
# Add the coded values by hand:
# R-10214 (postero-anterior), R-10206 (antero-posterior), R-10236 (left lateral), R-10232 (right lateral)
pa_ls.append('R-10214')
ap_ls.append('R-10206')
lat_ls.extend(['R-10236', 'R-10232', 'RL', 'LL'])
print(f"PA: {pa_ls}")
print(f"AP: {ap_ls}")
print(f"LATERAL: {lat_ls}")

In [None]:
# Split the lateral values into left / right / unknown side
left_markers = ['(L)', 'L ', 'LT', 'Lt.', 'R-10236', 'LL']
right_markers = ['(R)', 'R ', 'RT', 'Rt.', 'R-10232', 'RL']
# NOTE(review): these are plain substring tests and the left markers are tried first;
# a marker such as 'L ' also matches the end of a longer word followed by a space
# (e.g. 'LATERAL ...') -- confirm the printed lists below.
llat_ls = [] # left
rlat_ls = [] # right
nlat_ls = [] # unknown laterality
for val in lat_ls:
    if any(marker in val for marker in left_markers):
        bucket = llat_ls
    elif any(marker in val for marker in right_markers):
        bucket = rlat_ls
    else:
        bucket = nlat_ls
    bucket.append(val)
print(f"Left Laterality: {llat_ls}")
print(f"Right Laterality: {rlat_ls}")
print(f"Unknown Laterality: {nlat_ls}")

In [None]:
# Check
print(f"PA: {pa_ls}")
print(f"AP: {ap_ls}")
print(f"Left Laterality: {llat_ls}")
print(f"Right Laterality: {rlat_ls}")
print(f"Unknown Laterality: {nlat_ls}")

In [None]:
# Labels are (View Code Sequence CodeValue + '_' + short meaning): R-10214_PA, R-10206_AP, R-10236_LL, R-10232_RL, R-102CD_LAT
def classify_raw_view(view):
    """Map a raw 'View' value to its trimmed label.

    The value is looked up, in this order, in pa_ls, ap_ls, llat_ls, rlat_ls and
    nlat_ls; the label of the first list containing it is returned. A value found in
    none of them is printed and None is returned.
    """
    labelled_groups = (
        (pa_ls, 'R-10214_PA'),
        (ap_ls, 'R-10206_AP'),
        (llat_ls, 'R-10236_LL'),
        (rlat_ls, 'R-10232_RL'),
        (nlat_ls, 'R-102CD_LAT'),
    )
    for members, label in labelled_groups:
        if view in members:
            return label
    print(f"Unclassified view: {view}")

# Apply classification to create 'View_trimmed' column.
# The classifier defined in this notebook is `classify_raw_view`; no `classify_raw_view_v2`
# is defined in the visible cells, so calling it raised NameError.
meta_table['View_trimmed'] = meta_table['View'].apply(classify_raw_view)

# Check
meta_table['View_trimmed'].value_counts()

#### Sub-Issue: Laterality
- To find the laterality for unknown laterality (meta_table['View_trimmed'] == 'R-102CD_LAT' # 259)

In [None]:
df_others = meta_table[meta_table['View_trimmed']=='R-102CD_LAT']
print(df_others.shape)

In [None]:
df_others.head(2)

In [None]:
print(f"NaN   {df_others['Laterality'].isnull().sum()}")
print(df_others['Laterality'].value_counts()) 
# 'L', 'R' -> solved with Laterlaity.
# NaN, 'B' -> Other solution is needed.

Laterality: NaN, 'B'

In [None]:
condition = (df_others['Laterality'].isnull()) | (df_others['Laterality']=='B')
df_nanb = df_others[condition]
print(df_nanb.shape) # NaN 59 + 'B' 5

In [None]:
print(df_nanb['ViewCodeSeq_Value'].isnull().sum())
print(df_nanb['ViewCodeSeq_Value'].value_counts()) # R-102CD: lateral (just lateral)

In [None]:
# Extension: same report as check_dcm3, but the dataset is returned
def check_dcm3_full(fpath):
    """Print the view-related tags of a DICOM file, show its image, and return the dataset.

    Always printed: path, Study/Series UID, Accession No., ViewPosition, the first
    ViewCodeSequence item (via extract_ViewCodeSeq_2) and SeriesDescription.
    ProtocolName, Laterality and AcquisitionDeviceProcessingDescription are printed only
    when present and non-empty. ViewPosition and SeriesDescription fall back to 'N/A'
    when the tag is absent instead of raising.

    Parameters:
    - fpath: str, path to the DICOM file.

    Returns:
    - The dataset object returned by pydicom.dcmread.
    """
    dcm = pydicom.dcmread(fpath)
    print(fpath)
    print(f"Study UID: {dcm.StudyInstanceUID}")
    print(f"Series UID: {dcm.SeriesInstanceUID}")
    print(f"Accession No.: {dcm.AccessionNumber}")
    print(f"View Position: {getattr(dcm, 'ViewPosition', 'N/A')}")
    print('View Code Sequence')
    extract_ViewCodeSeq_2(fpath) # ViewCodeSequence - CodeValue / CodeMeaning

    # Optional tags: shown only when the attribute exists and is non-empty
    for keyword in ('ProtocolName', 'Laterality', 'AcquisitionDeviceProcessingDescription'):
        value = getattr(dcm, keyword, None)
        if value:
            print(f"{keyword}: {value}")
    print(f"SeriesDescription: {getattr(dcm, 'SeriesDescription', 'N/A')}")

    img = dcm.pixel_array
    plt.figure(figsize=(4,4))
    plt.imshow(img, cmap='gray')
    plt.title(f"DICOM Image: {os.path.basename(fpath)}")
    plt.axis('off')
    plt.show()
    return dcm

In [None]:
# Then, I just have to record it as 'Lateral' without any laterality? Let me check finally.
check_dcm3_full(df_nanb.iloc[2,3])

#### Sub-Solution: Laterality
- Record 'Lateral' without right or left as R-102CD_LAT

Laterality: R, L
- REVISE meta_table

In [None]:
# Refine plain-lateral rows ('R-102CD_LAT') using the Laterality column
is_plain_lateral = meta_table['View_trimmed'].eq('R-102CD_LAT')

## Laterality 'L' -> left lateral
meta_table.loc[is_plain_lateral & meta_table['Laterality'].eq('L'), 'View_trimmed'] = 'R-10236_LL'

## Laterality 'R' -> right lateral
meta_table.loc[is_plain_lateral & meta_table['Laterality'].eq('R'), 'View_trimmed'] = 'R-10232_RL'

## Check what is still plain lateral
meta_table[meta_table['View_trimmed']=='R-102CD_LAT'].head()

In [None]:
print(meta_table['View_trimmed'].isnull().sum())
meta_table['View_trimmed'].value_counts()

In [None]:
print(69+74)
print(236+121)
print(259 - 74 - 121)

### Details: Finalize

In [None]:
# Re-order: ViewPosition > ViewCodeSeq > ProtocolName > SeriesDescription > AcqDevProcDesc > Laterality
print(meta_table.columns)
meta_table = meta_table[['dataset', 'folder', 'dcm_fname', 'dcm_fpath', 's순번', 's연구등록번호','s시행연월', 's모달', 'View_trimmed',
                         'ViewPosition', 'ViewCodeSeq_Value', 'ProtocolName', 'SeriesDescription','AcqDevProcDesc', 'View', 'Laterality']]
meta_table.head()

In [None]:
# Save
print(home_dir)
# meta_table.to_csv(os.path.join(home_dir, 'workspace/Files/Meta_Table_241117_withViewTrimmed_5views.csv'), index=False)

In [None]:
print(meta_table.shape)
meta_table.nunique()

---

## Issue #2-2 (Step 1)

In [None]:
# Re-check the files that cannot be told apart within a folder: Issue #2 used 'ViewPosition' only (301 rows); now use 'View_trimmed' (59 rows)
repeats_view_in_folder = meta_table.duplicated(subset=['folder', 'View_trimmed'])
dup_folder_ls = meta_table.loc[repeats_view_in_folder, 'folder'].unique()
dup_folder_df = meta_table.loc[meta_table['folder'].isin(dup_folder_ls)]
print(dup_folder_df.shape)

In [None]:
# 얘네는 그래도 dcm_fname이 다다르긴 함
dup_folder_df[dup_folder_df['dcm_fname'].duplicated()]

In [None]:
# 난 사실 PA가 지금 당장 필요한데 PA도 포함되어 있을 지
dup_folder_df['View_trimmed'].value_counts()

### 뒤적뒤적 1) Lateral: 4 files
- View 관련된 태그로는 구별 불가. 직접 까서 보자

In [None]:
dup_folder_df[dup_folder_df['View_trimmed'] == 'R-102CD_LAT']

In [None]:
dup_lat_fls = dup_folder_df[dup_folder_df['View_trimmed'] == 'R-102CD_LAT']['dcm_fpath'].unique()
print(dup_lat_fls)

In [None]:
def import_dcm(fpath):
    """Read a DICOM file, display its image in grayscale, and return the dataset.

    Parameters:
    - fpath: str, path to the DICOM file.

    Returns:
    - The dataset object returned by pydicom.dcmread.
    """
    dicom_ds = pydicom.dcmread(fpath)

    image = dicom_ds.pixel_array
    fig, ax = plt.subplots(figsize=(4,4))
    ax.imshow(image, cmap='gray')
    ax.set_title(f"DICOM Image: {os.path.basename(fpath)}")
    ax.axis('off')
    plt.show()
    return dicom_ds

In [None]:
# 1 (1)
import_dcm(dup_lat_fls[0])

In [None]:
# 1 (2)
import_dcm(dup_lat_fls[1])

### 뒤적뒤적 2) PA: 55 files

In [None]:
dup_folder_df[dup_folder_df['View_trimmed'] == 'R-10214_PA']

In [None]:
dup_pa_fls = dup_folder_df[dup_folder_df['View_trimmed'] == 'R-10214_PA']['dcm_fpath'].unique()
print(len(dup_pa_fls))
print(dup_pa_fls[:4])

까보자

In [None]:
# 1 (1)
import_dcm(dup_pa_fls[0])

In [None]:
# 1 (2)
import_dcm(dup_pa_fls[1])

In [None]:
# 2 (1)
# NOTE(review): this section inspects the PA duplicates, so the PA path list is used here;
# the original cell indexed dup_lat_fls (the lateral list) -- confirm that was a copy-paste slip.
import_dcm(dup_pa_fls[2])

In [None]:
# 2 (2)
# NOTE(review): this section inspects the PA duplicates, so the PA path list is used here;
# the original cell indexed dup_lat_fls (the lateral list) -- confirm that was a copy-paste slip.
import_dcm(dup_pa_fls[3])

## Solution #2-2 (Step 1)
- ProtocolName에서 Auto Diagnosis는 따로 표시해주기
- 그 외는 구별 불가 (암거나 쓴다)

In [None]:
meta_table['ProtocolName'].value_counts()

In [None]:
# Open one dcm_fpath per kind of Protocol Name
def check_dcm_protocol(fpath):
    """Print the ProtocolName of a DICOM file and show its image.

    Parameters:
    - fpath: str, path to the DICOM file.
    """
    dcm = pydicom.dcmread(fpath)
    # NOTE(review): the original cell had a truncated statement `print(dcm.` here, which
    # made the whole cell a SyntaxError. The intended attribute is unknown, so the
    # fragment was removed -- restore it if a specific tag was meant.
    print(dcm.ProtocolName)

    img = dcm.pixel_array
    plt.figure(figsize=(3,3))
    plt.imshow(img, cmap='gray')
    plt.title(f"DICOM Image: {os.path.basename(fpath)}")
    plt.axis('off')
    plt.show()

protc_ls = meta_table['ProtocolName'].unique() # 29 distinct values

# One example path per protocol: the first row of each non-missing ProtocolName,
# in order of first appearance (a missing ProtocolName matches no row and is skipped)
first_row_per_protocol = (
    meta_table
    .dropna(subset=['ProtocolName'])
    .drop_duplicates(subset='ProtocolName')
)
protc_fpath_ls = first_row_per_protocol['dcm_fpath'].tolist()
print(len(protc_ls))
print(len(protc_fpath_ls)) # one fewer: the nan entry is excluded

In [None]:
for fpath in protc_fpath_ls:
    check_dcm_protocol(fpath)

In [None]:
# CHEST_PA_LAT_LT (6 files)
protc_complex_lt_fls = meta_table[meta_table['ProtocolName']=='CHEST_PA_LAT_LT']['dcm_fpath'].unique()
print(len(protc_complex_lt_fls))

In [None]:
for fpath in protc_complex_lt_fls:
    check_dcm3(fpath) # 다행히 다 ViewCodeSeq CodeValue가 있었음! 휴

In [None]:
# CHEST_PA_LAT_RT (12 files)
protc_complex_rt_fls = meta_table[meta_table['ProtocolName']=='CHEST_PA_LAT_RT']['dcm_fpath'].unique()
print(len(protc_complex_rt_fls))

In [None]:
for fpath in protc_complex_rt_fls:
    check_dcm3(fpath) # 다행히 다 ViewCodeSeq CodeValue가 있었음! 휴

---

## Issue #2-1 Again (Step 4) > NonIssue!
- ProtocolName은 ViewCode로 쓸 수 없다

In [None]:
# protocol로 한 경우?
condition = meta_table['ProtocolName'] == meta_table['View']
df_protc = meta_table[condition]
print(df_protc.shape)
df_protc['ProtocolName'].value_counts() #휴~ NonIssue!

In [None]:
# 그 다음 Series Description으로 한 경우?
condition = meta_table['SeriesDescription'] == meta_table['View']
df_ser = meta_table[condition]
print(df_ser.shape)
df_ser['SeriesDescription'].value_counts() # 혼합 없어보임

In [None]:
# 마지막, AcqDevProcDesc으로 한 경우?
condition = meta_table['AcqDevProcDesc'] == meta_table['View']
df_adpd = meta_table[condition]
print(df_adpd.shape)
df_adpd['AcqDevProcDesc'].value_counts() # 혼합없어보임!

---

# .dcm 메타 테이블과, 환자 명단 엑셀 파일 연계하기

In [None]:
print(meta_table.shape)
print(meta_table.nunique())
meta_table.head()

In [None]:
# folder-level
meta_table.groupby(['dataset'])['folder'].nunique()

In [None]:
# file-level
meta_table.groupby(['dataset'])['dcm_fname'].nunique()

## 환자 명단 뒤적뒤적

In [None]:
for fname in xlsx_fname_ls:
    print(fname)

In [None]:
# .xlsx 명단 파일 예시 (환자군)
print(xlsx_fname_ls[0])
df_care_b60 = pd.read_excel(sdp_dir+xlsx_fname_ls[0])
print(df_care_b60.shape)
print(df_care_b60[['연구등록번호', 'PACS 일련번호']].nunique())
df_care_b60.head()

In [None]:
df_care_b60['처방코드(코드명)'].value_counts()

In [None]:
# .xlsx 명단 파일 (환자군 after 180)
df_care_a180 = pd.read_excel(sdp_dir+xlsx_fname_ls[2])
print(df_care_a180.shape)
print(df_care_a180[['연구등록번호', 'PACS 일련번호']].nunique())
df_care_a180.head()

In [None]:
df_care_a180['처방코드(코드명)'].value_counts()

In [None]:
# .xlsx list file (control group)
# NOTE(review): xlsx_fname_ls comes from os.listdir(), which does not guarantee any order --
# confirm that index 1 really is the control-group file.
df_control = pd.read_excel(sdp_dir+xlsx_fname_ls[1])
print(df_control.shape)
# NOTE(review): this sheet is addressed with 'PACS_일련번호' (underscore) while the patient-group
# sheets are addressed with 'PACS 일련번호' (space) -- presumably the header differs; verify.
print(df_control[['연구등록번호', 'PACS_일련번호']].nunique())
df_control.head()

In [None]:
df_control['처방코드(코드명)'].value_counts()

## 1. 환자군(치료시작 60일 전~ 치료시작) 명단: 3536건 (1228명) 부터

### 1) display the dataframes

In [None]:
# excel
print(df_care_b60.shape)
df_care_b60.head(2)

In [None]:
# meta_table (for .dcm files)
dataset_care_b60 = meta_table[meta_table['dataset']=='전규리(2024100305)_환자군_치료시작60일'].reset_index(drop=True)
print(dataset_care_b60.shape)
dataset_care_b60.head(1)

### 2) make a matching key
- dataset['folder'] 와 매칭될 수 있는 키를, df에서 만들기
- df['순번'] + df['연구등록번호'] + df['시행연월'] # 시행일자를 시행연월로 만들어야 함! 

In [None]:
# Key for the Excel list
def create_key_column(df_excel):
    """Add a 'key' column of the form '<순번>_<연구등록번호>_<YYYYMM>01' and return the frame.

    The year-month part is taken from '시행일자' with '-' removed and cut to six
    characters; the day is always written as '01'. The column is added to `df_excel`
    itself, and that same object is returned.
    """
    seq_no = df_excel['순번'].astype(str)
    reg_no = df_excel['연구등록번호'].astype(str)
    year_month = df_excel['시행일자'].str.replace('-','').str[:6]
    df_excel['key'] = seq_no + '_' + reg_no + '_' + year_month + '01'
    return df_excel

In [None]:
print(df_care_b60.shape)
df_care_b60 = create_key_column(df_care_b60)
print(df_care_b60.shape)

In [None]:
df_care_b60.head()

### 3) matching
- .xlsx patient table: df_care_b60
- .dcm meta table: dataset_care_b60

#### ISSUE: 순번이 안맞는데..?

In [None]:
# NOTE(review): `no` (the 연구등록번호 being looked up) is never assigned in the visible
# notebook; it must be defined before this cell for a fresh-kernel run -- TODO confirm the value.
df_care_b60[df_care_b60['연구등록번호'] == no]

In [None]:
# NOTE(review): `no` is never assigned in the visible notebook. 's연구등록번호' is produced by
# splitting the folder name (strings) and may be re-parsed when the table is reloaded from
# CSV -- confirm `no` has the same type as this column, otherwise nothing will match.
dataset_care_b60[dataset_care_b60['s연구등록번호']== no]

#### TRIAL: New Key without '순번'

In [None]:
# Key for the Excel list, without '순번'
def create_new_key_column(df_excel):
    """Add a 'new_key' column of the form '<연구등록번호>_<YYYYMM>01' and return the frame.

    The year-month part is taken from '시행일자' with '-' removed and cut to six
    characters; the day is always written as '01'. The column is added to `df_excel`
    itself, and that same object is returned.
    """
    reg_no = df_excel['연구등록번호'].astype(str)
    year_month = df_excel['시행일자'].str.replace('-','').str[:6]
    df_excel['new_key'] = reg_no + '_' + year_month + '01'
    return df_excel

In [None]:
print(df_care_b60.shape)
df_care_b60 = create_new_key_column(df_care_b60)
print(df_care_b60.shape)

In [None]:
print(df_care_b60['new_key'].nunique()) # 이걸로는 절반 밖에 매칭이 안됨