In [None]:
import os
import pandas as pd
import numpy as np
import pydicom
from datetime import datetime
import pymssql
from sqlalchemy import inspect, create_engine

In [None]:
import sys
sys.path.append('/home/yuhsuser/workspace/kyulee/Codes')

from dicom_viewer import check_dicom_attributes
from dicom_series_viewer import check_dicom_in_series
from dicom_code_sequence import extract_code_sequence, create_code_sequence_dataframe
from datetime_format import datetime_form, date_form

# Import Data

In [None]:
# Show the working directory; the data directories defined in the next cell are built from os.getcwd().
print(os.getcwd())

In [None]:
# Data locations, resolved relative to the current working directory.
dir_staging = os.path.join(os.getcwd(), '[ChestCT]ETL_Staging/')
dir_source = os.path.join(os.getcwd(), '[ChestCT]SourceTable/')
# CDM table extracts live in a sub-folder of the source-table directory.
dir_cdm = os.path.join(dir_source, 'CDMTables/')

## Staging Tables

In [None]:
# Import the imaging-extension staging tables (three CSV snapshots in dir_staging).
# Each load prints the row count and the number of distinct person_id values.
image_occurrence_staging = pd.read_csv(os.path.join(dir_staging, 'image_occurrence_staging_0804.csv')) # FK: procedure_unique_id (folder name)
print(f" image_occurrence_staging: {image_occurrence_staging.shape[0]} rows of {image_occurrence_staging['person_id'].nunique()} patients")

measurement_staging = pd.read_csv(os.path.join(dir_staging, 'measurement_staging_0918.csv'))
print(f" measurement_staging: {measurement_staging.shape[0]} rows of {measurement_staging['person_id'].nunique()} patients")

image_feature_staging = pd.read_csv(os.path.join(dir_staging, 'image_feature_staging_0918.csv')) # FK: image_occurrence_id, image_feature_event_id (== measurement_id), image_instance_UID (file path)
print(f" image_feature_staging: {image_feature_staging.shape[0]} rows of {image_feature_staging['person_id'].nunique()} patients")

## DICOM File Path
- add local path of dicom files 

In [None]:
# File-level list of selected image instances (one CSV under dir_source/Selected).
df_file = pd.read_csv(os.path.join(dir_source, 'Selected/ImagedInstanceList(225289).csv'))
print(f" df_file: {df_file.shape[0]} rows")
# Preview only the first row.
df_file.head(1)

In [None]:
# Raw DICOM metadata table, including the file path of each DICOM file.
df_dcm = pd.read_csv(os.path.join(dir_source, 'Metadata_v2.csv'))
# Patients are counted by the distinct values of the '연구등록번호' column.
print(f" df_dcm: {df_dcm.shape[0]} rows of {df_dcm['연구등록번호'].nunique()} patients")
# Positional lookup: value in the first row, second column.
# NOTE(review): presumably the file path column — confirm against the CSV layout.
print(df_dcm.iloc[0,1])
df_dcm.head(1)

# Complement Image_occurrence Table with 'local_path'
- 확인해 볼 것: 현재 DICOM 폴더가 Study-level로 나눠진게 맞을지?

In [None]:
# Series-level view of the DICOM metadata: file path, study/series UIDs and study folder name.
# Select-then-rename WITHOUT inplace=True: rename() returns a new frame, so df_series is
# independent of df_dcm and later column assignments on it do not trigger
# SettingWithCopyWarning or depend on view/copy semantics.
df_series = df_dcm[['File Name', 'Study UID', 'Series UID', 'Study Name']].rename(
    columns={'File Name': 'dcm_fpath', 'Study Name': 'folder_name'}
)
df_series.head(1)

In [None]:
# Disabled sanity check, kept as a bare string literal so the code inside is not executed.
# It checks whether 'Study Name' equals the parent folder of each dcm_fpath
# (i.e. whether the DICOM folders are split at study level).
# NOTE(review): the snippet refers to df_series['Study Name'], which the previous cell
# renames to 'folder_name' — adjust before re-enabling.
"""
# Study Name 이 진짜 folder name of the dcm_fpath인지 확인 (즉, 폴더가 study-level로 나눠진게 맞는지?)
def folder_name(dcm_fpath):
    folder_name = dcm_fpath.split('/')[-2]
    return folder_name

df_series['folder_name'] = df_series['dcm_fpath'].apply(folder_name)
df_series[df_series['Study Name'] != df_series['folder_name']] # 맞음!
"""

In [None]:
# Save (the saved file is used by '/kyulee/Codes/organize_dicom.py')
# df_series.to_csv(os.path.join(dir_source, 'DCMFileMetaTable_241121.csv'), index=False)

## Folder 구조 변경
- (before) study-level only
- (after) sub files: series-level

In [None]:
# Check the reorganisation: are there .dcm files that were NOT moved into a series-level folder?
def find_remaining_dcm_files(base_folder):
    """List .dcm files that still sit directly inside a study-level folder.

    Expected layout: ``base_folder/<study>/<series>/<file>.dcm``. Any ``.dcm``
    file found at ``base_folder/<study>/<file>.dcm`` has not been moved into a
    series folder and is reported.

    Parameters
    ----------
    base_folder : str
        Directory whose immediate sub-directories are the study-level folders.

    Returns
    -------
    list of str
        Full paths (``base_folder/<study>/<file>.dcm``) of the remaining files,
        ordered by study folder and then by file name. Empty if none are found
        or if ``base_folder`` cannot be listed.
    """
    remaining_files = []

    # Only the first os.walk() entry is needed at each level; next() with a default
    # keeps os.walk's behaviour of yielding nothing for an unreadable/missing folder
    # and avoids descending into every series folder.
    _, study_dirs, _ = next(os.walk(base_folder), (base_folder, [], []))
    for study_folder in sorted(study_dirs):
        study_path = os.path.join(base_folder, study_folder)
        _, _, direct_files = next(os.walk(study_path), (study_path, [], []))
        # Join with the study folder (not the base folder) so the reported path
        # points at where the file actually is.
        remaining_files.extend(
            os.path.join(study_path, file)
            for file in sorted(direct_files)
            if file.endswith(".dcm")
        )
    return remaining_files

# Root folder that contains the study-level DICOM folders.
base_folder = '/home/yuhsuser/workspace/DICOM/(2023300243)1~200_CT/'

# Report how many .dcm files still sit outside a series-level folder.
remaining_dcm_files = find_remaining_dcm_files(base_folder)
if remaining_dcm_files:
    # Message: "found N remaining .dcm files".
    print(f"남아 있는 .dcm 파일 {len(remaining_dcm_files)}개 발견:")
    # Per-file listing is disabled to keep the output short.
    #for file in remaining_dcm_files:
        #print(file)
else:
    # Message: "all .dcm files have been moved into series-level folders".
    print("모든 .dcm 파일이 Series-level 폴더로 이동되었습니다.")

## MetaTable 업데이트

In [None]:
def update_file_path(row):
    """Derive the series-level location of one DICOM file.

    Parameters
    ----------
    row : mapping
        Must provide ``"dcm_fpath"`` (current file path inside the study
        folder) and ``"Series UID"``.

    Returns
    -------
    tuple of (str, str, str)
        ``(series_path, dcm_fname, updated_path)``: the series folder placed
        under the study folder, the unchanged file name, and the file's full
        path inside that series folder.
    """
    # Split once: head is the study-level folder, tail is the file name.
    study_dir, file_name = os.path.split(row["dcm_fpath"])
    # The series folder is named after the Series UID and sits under the study folder.
    series_dir = os.path.join(study_dir, row["Series UID"])
    return series_dir, file_name, os.path.join(series_dir, file_name)

# Build the new path and file name for every row.
# update_file_path returns a 3-tuple per row; zip(*...) transposes them into three
# columns, assigned in the same order as the tuple (series_path, dcm_fname, updated_fpath).
df_series["series_path"], df_series["dcm_fname"], df_series["updated_fpath"] = zip(*df_series.apply(update_file_path, axis=1))
print(df_series.shape)
df_series.head(1)

In [None]:
# Save (according to changed file path due to '/kyulee/Codes/organize_dicom.py')
#df_series.to_csv(os.path.join(dir_source, 'DCMFileMetaTable_Updated_241121.csv'), index=False)

## Update Image_occurrence_staging Table
- From df_series['series_path'] To image_occurrence_staging['local_path']
- Using: df_series['Series UID'] and image_occurrence_staging['image_series_uid']

In [None]:
# One row per distinct (Series UID, series_path) pair, keyed by the column name
# used in image_occurrence_staging.
df_series_short = (
    df_series[['Series UID', 'series_path']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'Series UID': 'image_series_uid'})
)
print(f" df_series_short: {df_series_short.shape[0]} rows of {df_series_short['image_series_uid'].nunique()} series")

In [None]:
# Attach the series folder path to each image occurrence; the inner join keeps
# only series present in both tables.
image_occurrence_updated = df_series_short.merge(
    image_occurrence_staging,
    how='inner',
    on='image_series_uid',
)
print(f" image_occurrence_updated: {image_occurrence_updated.shape[0]} rows of {image_occurrence_updated['image_series_uid'].nunique()} series")

In [None]:
# Replace the empty staging 'local_path' column with the series folder path.
# Guarded and non-inplace so the cell is safe to re-run: once 'series_path' has been
# renamed, a second run is a no-op instead of dropping the populated 'local_path'.
if 'series_path' in image_occurrence_updated.columns:
    image_occurrence_updated = (
        image_occurrence_updated
        .drop(columns='local_path', errors='ignore')  # remove the empty one
        .rename(columns={'series_path': 'local_path'})
    )

In [None]:
# Spot check by position: row 1405, second column.
# NOTE(review): hard-coded position; raises IndexError if the table has fewer rows,
# and the column it shows depends on the current column order — confirm intent.
print(image_occurrence_updated.iloc[1405, 1])
# Show the row(s) for image_occurrence_id 1.
image_occurrence_updated[image_occurrence_updated['image_occurrence_id'] == 1]

In [None]:
# Save
# image_occurrence_updated.to_csv(os.path.join(dir_staging, 'image_occurrence_staging_1121.csv'), index=False)

In [None]:
# Import: reload the saved table from disk, replacing the in-memory
# image_occurrence_updated built above.
# NOTE(review): requires 'image_occurrence_staging_1121.csv' to exist already; the
# cell that writes it is commented out.
# del image_occurrence_staging
image_occurrence_updated = pd.read_csv(os.path.join(dir_staging, 'image_occurrence_staging_1121.csv'))

# Compare DICOM at Series-level
- Identify series using Image_occurrence_table (같은 image_occurrence_id면 같은 Series)

## Explore kinds of tags

In [None]:
# Measurement: list the distinct attribute names stored in measurement_source_value.
print(measurement_staging['measurement_source_value'].unique()) # SliceThickness, KVP, PatientPosition, PatientOrientation, ImagePositionPatient, ContrastBolusVolume

In [None]:
# Search helper
def search_keyword(keyword):
    """Print how often each value occurs for one measurement attribute.

    Reads the notebook-level ``measurement_staging`` table, keeps the rows
    whose ``measurement_source_value`` equals ``keyword`` and prints the
    ``value_counts()`` of their ``value_source_value``.

    Returns
    -------
    None
        The counts are printed only, not returned.
    """
    is_keyword = measurement_staging['measurement_source_value'] == keyword
    values = measurement_staging.loc[is_keyword, 'value_source_value']
    print(values.value_counts())
    return None

# Attribute names whose value distributions are printed below.
ct_attributes = ['SliceThickness', 'KVP', 'ExposureTime', 'PatientPosition', 'ScanOptions', 'CTDIvol', 'SeriesNumber', 'AcquisitionNumber', 'PhotometricInterpretation', 
                 'ContrastBolusVolume', 'ContrastBolusTotalDose']

# One section per attribute: a header line, the value counts, then a blank separator.
for keyword in ct_attributes:
    print(f"keyword: {keyword} --------------------------------------------------")
    search_keyword(keyword)
    print('\n')

## Compare with viewer

In [None]:
def search_series(keyword, value):
    """Find the series folders whose measurement ``keyword`` has the given ``value``.

    Walks measurement -> image_feature -> image_occurrence using the
    notebook-level tables ``measurement_staging``, ``image_feature_staging``
    and ``image_occurrence_updated``, printing the number of matching
    measurement ids, the number of matching image_occurrence ids, and a
    one-line summary of the selected image occurrences.

    Parameters
    ----------
    keyword : str
        Value to match in ``measurement_source_value``.
    value : str
        Value to match in ``value_source_value`` (exact equality).

    Returns
    -------
    numpy.ndarray
        Unique ``local_path`` values of the matching image occurrences.
    """
    # Measurement: ids of the rows carrying this attribute/value pair.
    is_keyword = measurement_staging['measurement_source_value'] == keyword
    is_value = measurement_staging['value_source_value'] == value
    measurement_ids = measurement_staging.loc[is_keyword & is_value, 'measurement_id'].unique()
    print(len(measurement_ids))

    # Image_feature: rows whose event id is one of those measurements.
    # NOTE(review): 1147330 is presumably the field concept meaning "event id refers
    # to a measurement_id" — confirm against the vocabulary.
    is_measurement_link = image_feature_staging['image_feature_event_field_concept_id'] == 1147330
    is_selected = image_feature_staging['image_feature_event_id'].isin(measurement_ids)
    occurrence_ids = image_feature_staging.loc[is_measurement_link & is_selected, 'image_occurrence_id'].unique()
    print(len(occurrence_ids))

    # Image_occurrence: the series those features belong to.
    in_selection = image_occurrence_updated['image_occurrence_id'].isin(occurrence_ids)
    df_occurrence = image_occurrence_updated[in_selection]
    print(f"df_occurrence: {df_occurrence.shape[0]} rows of {df_occurrence['image_study_uid'].nunique()} studies and {df_occurrence['image_series_uid'].nunique()} series")

    return df_occurrence['local_path'].unique()

In [None]:
# KVP == 120: collect the matching series folders and inspect the first one in the viewer.
kvp_120_ls = search_series('KVP', '120')
check_dicom_in_series(kvp_120_ls[0], ['SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# KVP == 80: collect the matching series folders and inspect the first one in the viewer.
kvp_80_ls = search_series('KVP', '80')
check_dicom_in_series(kvp_80_ls[0], ['Modality', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# PhotometricInterpretation == RGB: collect the matching series folders and inspect the first one.
rgb_ls = search_series('PhotometricInterpretation', 'RGB')
check_dicom_in_series(rgb_ls[0], ['Modality', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# ScanOptions == SCOUT MODE
# The search must run in this cell: scout_ls is not defined anywhere else, so leaving
# it commented out raises NameError on a fresh kernel.
scout_ls = search_series('ScanOptions', 'SCOUT MODE')
# Inspect the series at position 30 of the result (assumes at least 31 matching series).
check_dicom_in_series(scout_ls[30], ['Modality', 'BodyPartExamined', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# ScanOptions == HELICAL MODE
# The search must run in this cell: helic_ls is not defined anywhere else, so leaving
# it commented out raises NameError on a fresh kernel.
helic_ls = search_series('ScanOptions', 'HELICAL MODE')
# Inspect the series at position 15 of the result (assumes at least 16 matching series).
check_dicom_in_series(helic_ls[15], ['Modality', 'BodyPartExamined', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# ScanOptions == SURVIEW: collect the matching series folders and inspect the first one.
surview_ls = search_series('ScanOptions', 'SURVIEW')
check_dicom_in_series(surview_ls[0], ['Modality', 'BodyPartExamined', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# ContrastBolusVolume == 0
# The search must run in this cell: noncontrast_ls is not defined anywhere else, so
# leaving it commented out raises NameError on a fresh kernel.
noncontrast_ls = search_series('ContrastBolusVolume', '0')
check_dicom_in_series(noncontrast_ls[0], ['Modality', 'BodyPartExamined', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# ContrastBolusVolume == 94
# The search must run in this cell: contrast_ls is not defined anywhere else, so
# leaving it commented out raises NameError on a fresh kernel.
contrast_ls = search_series('ContrastBolusVolume', '94')
# Inspect the series at position 100 of the result (assumes at least 101 matching series).
check_dicom_in_series(contrast_ls[100], ['Modality', 'BodyPartExamined', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# PatientOrientation == ['L', 'P']: the value is matched as the literal string "['L', 'P']".
lp_ls = search_series('PatientOrientation', "['L', 'P']")
check_dicom_in_series(lp_ls[0], ['Modality', 'BodyPartExamined', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

## Further Check

In [None]:
def return_image_occurrence(keyword, value_ls, limit=5):
    """Select image occurrences whose measurement ``keyword`` takes one of ``value_ls``.

    Walks measurement -> image_feature -> image_occurrence using the
    notebook-level tables ``measurement_staging``, ``image_feature_staging``
    and ``image_occurrence_updated``, and prints a one-line summary of ALL
    matching image occurrences.

    Parameters
    ----------
    keyword : str
        Value to match in ``measurement_source_value``.
    value_ls : list-like
        Accepted values of ``value_source_value``.
    limit : int or None, default 5
        Number of rows to return. The default keeps the original preview
        behaviour (first 5 rows); pass ``None`` to get every matching row.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_occurrence_id``, ``person_id``,
        ``image_occurrence_date``, ``procedure_unique_id``, ``local_path``.
    """
    # Measurement: ids of the rows carrying this attribute with an accepted value.
    meas_condition = (measurement_staging['measurement_source_value'] == keyword) & (measurement_staging['value_source_value'].isin(value_ls))
    measurement_ids = measurement_staging.loc[meas_condition, 'measurement_id'].unique()

    # Image_feature: rows whose event id is one of those measurements.
    # NOTE(review): 1147330 is presumably the field concept meaning "event id refers
    # to a measurement_id" — confirm against the vocabulary.
    feature_condition = (image_feature_staging['image_feature_event_field_concept_id'] == 1147330) & (image_feature_staging['image_feature_event_id'].isin(measurement_ids))
    occurrence_ids = image_feature_staging.loc[feature_condition, 'image_occurrence_id'].unique()

    # Image_occurrence: the series those features belong to.
    df_occurrence = image_occurrence_updated[image_occurrence_updated['image_occurrence_id'].isin(occurrence_ids)]
    print(f"df_occurrence: {df_occurrence.shape[0]} rows of {df_occurrence['image_study_uid'].nunique()} studies and {df_occurrence['image_series_uid'].nunique()} series")

    selected = df_occurrence[['image_occurrence_id', 'person_id', 'image_occurrence_date', 'procedure_unique_id', 'local_path']]
    return selected if limit is None else selected.head(limit)

def return_corresponding__all_measurement(df_occurrence):
    """Return every measurement linked to the image occurrences in ``df_occurrence``.

    Parameters
    ----------
    df_occurrence : pandas.DataFrame
        Must contain an ``image_occurrence_id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``measurement_id``, ``person_id``, ``measurement_source_value``,
        ``value_source_value`` for all linked measurements.
    """
    occurrence_ids = df_occurrence['image_occurrence_id'].unique()

    # Feature rows of those occurrences; their event ids are used as measurement ids.
    df_feature = image_feature_staging[image_feature_staging['image_occurrence_id'].isin(occurrence_ids)]
    feature_ids = df_feature['image_feature_event_id'].unique()

    df_measurement = measurement_staging[measurement_staging['measurement_id'].isin(feature_ids)]
    return df_measurement[['measurement_id', 'person_id', 'measurement_source_value', 'value_source_value']]

# The two functions above combined into one call
def return_corresponding__all_measurement_atonce(keyword, value_ls):
    """Return all measurements of every series matching ``keyword`` / ``value_ls``.

    Selects the matching image occurrences and then collects every measurement
    linked to them, so distributions computed on the result cover all matching
    series. Previously the selection was truncated to the 5-row preview
    returned by ``return_image_occurrence``, which silently limited the
    analysis to 5 series.

    Returns
    -------
    pandas.DataFrame
        Columns ``measurement_id``, ``person_id``, ``measurement_source_value``,
        ``value_source_value``.
    """
    # limit=None: use every matching occurrence, not just the preview rows.
    df_occurrence = return_image_occurrence(keyword, value_ls, limit=None)
    occurrence_ids = df_occurrence['image_occurrence_id'].unique()

    df_feature = image_feature_staging[image_feature_staging['image_occurrence_id'].isin(occurrence_ids)]
    feature_ids = df_feature['image_feature_event_id'].unique()

    df_measurement = measurement_staging[measurement_staging['measurement_id'].isin(feature_ids)]
    print(f"df_measurement: {df_measurement.shape[0]} rows of {df_measurement['person_id'].nunique()} patients")
    return df_measurement[['measurement_id', 'person_id', 'measurement_source_value', 'value_source_value']]

# When only one specific key/value pair is of interest
def return_corresponding_measurement(df_occurrence, keyword, value):
    """Return the measurements of the given image occurrences that match one key/value pair.

    Parameters
    ----------
    df_occurrence : pandas.DataFrame
        Must contain an ``image_occurrence_id`` column.
    keyword : str
        Value to match in ``measurement_source_value``.
    value : str
        Value to match in ``value_source_value`` (exact equality).

    Returns
    -------
    pandas.DataFrame
        Columns ``measurement_id``, ``person_id``, ``measurement_source_value``,
        ``value_source_value``.
    """
    wanted_occurrences = df_occurrence['image_occurrence_id'].unique()

    # Event ids of the feature rows belonging to those occurrences.
    belongs = image_feature_staging['image_occurrence_id'].isin(wanted_occurrences)
    event_ids = image_feature_staging.loc[belongs, 'image_feature_event_id'].unique()

    # Keep measurements that are linked AND carry the requested key/value.
    is_linked = measurement_staging['measurement_id'].isin(event_ids)
    is_keyword = measurement_staging['measurement_source_value'] == keyword
    is_value = measurement_staging['value_source_value'] == value
    output_columns = ['measurement_id', 'person_id', 'measurement_source_value', 'value_source_value']
    return measurement_staging[is_linked & is_keyword & is_value][output_columns]

### ScanOptions
- 추가로 궁금한거: ScanOptions이 SCOUT일 때랑 HELICAL일 때의 Slice Thickness 비교
- +) SURVIEW, AXIAL, AXIAL MODE, FLUORO MODE, SCANSCOPE

In [None]:
# SCOUT: measurements of series whose ScanOptions is 'SCOUT MODE',
# then the distribution of their SliceThickness values.
scout_val_ls = ['SCOUT MODE']
df_scout = return_corresponding__all_measurement_atonce('ScanOptions', scout_val_ls)
df_scout[df_scout['measurement_source_value']=='SliceThickness']['value_source_value'].value_counts()

In [None]:
# HELIX: measurements of series whose ScanOptions is any of the helical spellings,
# then the distribution of their SliceThickness values.
helix_val_ls = ['HELICAL MODE', 'HELIX', 'HELICAL_CT']
df_helix = return_corresponding__all_measurement_atonce('ScanOptions', helix_val_ls)
df_helix[df_helix['measurement_source_value']=='SliceThickness']['value_source_value'].value_counts()

### In same study and different series
- study: 49_2998071_20210301_CT

In [None]:
#df_series = pd.read_csv(os.path.join(dir_source, 'DCMFileMetaTable_Updated_241121.csv'))

In [None]:
def return_corresponding_measurement_id(image_occurrence_id):
    """Return every measurement linked to a single image occurrence.

    Parameters
    ----------
    image_occurrence_id : scalar
        Id compared for equality with ``image_feature_staging['image_occurrence_id']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``measurement_id``, ``person_id``, ``measurement_source_value``,
        ``value_source_value``.
    """
    # Event ids of the feature rows belonging to this occurrence.
    belongs = image_feature_staging['image_occurrence_id'] == image_occurrence_id
    event_ids = image_feature_staging.loc[belongs, 'image_feature_event_id'].unique()

    is_linked = measurement_staging['measurement_id'].isin(event_ids)
    output_columns = ['measurement_id', 'person_id', 'measurement_source_value', 'value_source_value']
    return measurement_staging[is_linked][output_columns]

In [None]:
# Identifier of the study whose series are compared in the following cells.
procedure_unique_id = '49_2998071_20210301_CT'

In [None]:
# All image occurrences sharing the same 'procedure_unique_id' '49_2998071_20210301_CT' (8 series)
df_io = image_occurrence_updated[image_occurrence_updated['procedure_unique_id'] == procedure_unique_id]
print(df_io.shape)

#### 같은 Study 내에서, Scan Options이 다른 경우

In [None]:
# All measurements of the study's series, then the distribution of ScanOptions values.
df_study = return_corresponding__all_measurement(df_io)
df_study[df_study['measurement_source_value']=='ScanOptions']['value_source_value'].value_counts()

In [None]:
# SCOUT: measurement rows of this study whose value is 'SCOUT MODE'.
df_study[df_study['value_source_value']=='SCOUT MODE']

In [None]:
# Look up the feature row(s) for event id 28325 to find the owning image occurrence.
# NOTE(review): hard-coded id, presumably copied from the output of the previous cell.
image_feature_staging[image_feature_staging['image_feature_event_id'] == 28325] # image_occurrence_id: 550

In [None]:
# Folder of the series with image_occurrence_id 550 (first distinct local_path).
scout_series_path = image_occurrence_updated[image_occurrence_updated['image_occurrence_id'] == 550]['local_path'].unique()[0]
print(scout_series_path)

In [None]:
# HELICAL: first measurement row of this study whose value is 'HELICAL MODE'.
df_study[df_study['value_source_value']=='HELICAL MODE'].head(1)

In [None]:
# Look up the feature row(s) for event id 18840 to find the owning image occurrence.
# NOTE(review): hard-coded id, presumably copied from the output of the previous cell.
image_feature_staging[image_feature_staging['image_feature_event_id'] == 18840] # image_occurrence_id: 544

In [None]:
# Folder of the series with image_occurrence_id 544 (first distinct local_path).
helix_series_path = image_occurrence_updated[image_occurrence_updated['image_occurrence_id'] == 544]['local_path'].unique()[0]
print(helix_series_path)

#### 위 케이스에 대해 이미지 열어보기

In [None]:
# SCOUT: open the scout series in the viewer with the listed attributes.
check_dicom_in_series(scout_series_path, ['Modality', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

In [None]:
# HELICAL: open the helical series in the viewer with the listed attributes.
check_dicom_in_series(helix_series_path, ['Modality', 'SliceThickness', 'KVP', 'ScanOptions', 'ContrastBolusVolume', 'CTDIvol', 'PhotometricInterpretation'])

# Cohort Definition with DICOM Retrieval

## Update MI-CDM Tables to be connected with Original CDM Tables
- 지금 MI-CDM의 'person_id' 컬럼이 모두 사실은 'person_source_value'인데, person_id도 업데이트 해두기

In [None]:
# Person
# The load must run in this cell: `person` is not defined anywhere else in the notebook,
# so leaving it commented out raises NameError on a fresh kernel.
person = pd.read_csv('/home/yuhsuser/workspace/kyulee/Project/MI-CDM/MI-CDM_DB_Table/Person.csv')
# Lookup table used to translate person_source_value into person_id.
df_person = person[['person_id', 'person_source_value']]
print(f"df_person: {df_person.shape[0]} rows of {df_person['person_id'].nunique()} patients")
df_person.head(1)

In [None]:
def match_personid(df, df_person):
    """Replace a mislabelled ``person_id`` column with the real CDM ``person_id``.

    The staging tables store the source identifier under the name
    ``person_id``. When ``df`` has ``person_id`` but no
    ``person_source_value``, that column is renamed to
    ``person_source_value`` and the real ``person_id`` is looked up in
    ``df_person`` with a left join (rows without a match get a missing
    ``person_id``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to convert. It is not modified.
    df_person : pandas.DataFrame
        Lookup with ``person_id`` and ``person_source_value`` columns.

    Returns
    -------
    pandas.DataFrame
        The converted table, or ``df`` itself (unchanged) when it has no
        ``person_id`` column or already has ``person_source_value``. Always
        returning a frame means ``df = match_personid(df, ...)`` never
        replaces the caller's table with ``None``, e.g. when a cell is re-run.
    """
    if ("person_id" not in df.columns) or ("person_source_value" in df.columns):
        print("There's no person_id or already have person_source_value")
        return df

    # Exact duplicate pairs in the lookup would multiply rows in the left join.
    lookup = df_person[["person_id", "person_source_value"]].drop_duplicates()
    df = df.rename(columns={"person_id": "person_source_value"})
    df = df.merge(lookup, on="person_source_value", how="left")
    print(f"{df.shape[0]} rows: transformed person_source_value ({df['person_source_value'].nunique()}) to person_id ({df['person_id'].nunique()})")
    return df

In [None]:
# Update: person_source_value to person_id for the three MI-CDM tables.
# NOTE(review): image_occurrence_updated is reassigned from itself, so re-running this
# cell passes in a table that already has 'person_source_value'.
image_occurrence_updated = match_personid(image_occurrence_updated, df_person)
measurement_updated = match_personid(measurement_staging, df_person)
image_feature_updated = match_personid(image_feature_staging, df_person)

In [None]:
# Save
#image_occurrence_updated.to_csv(os.path.join(dir_staging, 'image_occurrence_staging_1123.csv'), index=False)
#measurement_updated.to_csv(os.path.join(dir_staging, 'measurement_staging_1123.csv'), index=False)
#image_feature_updated.to_csv(os.path.join(dir_staging, 'image_feature_staging_1123.csv'), index=False)

## Import Original CDM Tables Extracted by CSV Files

In [None]:
# Condition_occurrence: load the CDM extract and lower-case its column names
# so the lower-case column names used below resolve.
condition_occurrence = pd.read_csv(os.path.join(dir_cdm, 'condition_occurrence.csv'))
condition_occurrence.columns = condition_occurrence.columns.str.lower()
print(f"condition_occurrence: {condition_occurrence.shape[0]} rows of {condition_occurrence['person_id'].nunique()} patients")
condition_occurrence.head(1)

In [None]:
# Procedure_occurrence: load the CDM extract.
# NOTE(review): unlike condition_occurrence, column names are not lower-cased here —
# presumably this file already uses lower-case headers; confirm.
procedure_occurrence = pd.read_csv(os.path.join(dir_cdm, 'procedure_occurrence_ExtractedByPerson.csv'))
print(f"procedure_occurrence: {procedure_occurrence.shape[0]} rows of {procedure_occurrence['person_id'].nunique()} patients")
procedure_occurrence.head(1)

## Check if Original CDM and MI-CDM are well-connected
- using person_id, visit_occurrence_id

In [None]:
# Spot check: conditions of one person within one visit.
# NOTE(review): 111111 / 222222 look like placeholder ids — replace with real values.
condition_occurrence[(condition_occurrence['person_id'] == 111111)&(condition_occurrence['visit_occurrence_id'] == 222222)].head(3)

In [None]:
# Spot check: procedures recorded for the same visit id.
# NOTE(review): 222222 looks like a placeholder id — replace with a real value.
procedure_occurrence[procedure_occurrence['visit_occurrence_id'] == 222222].head(3)

## Build a Cohort
- 폐암 수술: 4069074, 4172438, 4070879, 4096152, 4070880, 4021362, 4067713
- 폐암 진단: 4311499 (Primary malignant neoplasm of respiratory tract)

### 1. Concept Set

In [None]:
# Lung cancer surgery: procedure concept ids that define the index event.
procedure_ls = [4069074, 4172438, 4070879, 4096152, 4070880, 4021362, 4067713]
# Explicit .copy(): procedure_df gets new columns assigned later, which on a plain
# boolean-mask slice triggers SettingWithCopyWarning and depends on view/copy semantics.
procedure_df = procedure_occurrence[procedure_occurrence['procedure_concept_id'].isin(procedure_ls)].copy()
print(f"procedure_df: {procedure_df.shape[0]} rows of {procedure_df['person_id'].nunique()} patients")

In [None]:
# Lung cancer diagnosis: condition concept id(s).
# Integer, like procedure_ls: isin() does not match the string '4311499' against a
# numeric concept-id column, which would leave condition_df empty.
condition_ls = [4311499]
condition_df = condition_occurrence[condition_occurrence['condition_concept_id'].isin(condition_ls)]
print(f"condition_df: {condition_df.shape[0]} rows of {condition_df['person_id'].nunique()} patients")

### 2. Cohort Definition

#### 1) 폐암 수술일자 기준 index_date 설정

In [None]:
# Year of Procedure Date
# NOTE(review): date_form's return value is discarded, so it presumably converts the
# 'procedure_date' column of procedure_df in place — confirm in datetime_format.py.
date_form(procedure_df, 'procedure_date')
# Extract the calendar year and plot its distribution.
procedure_df['procedure_year'] = pd.DatetimeIndex(procedure_df['procedure_date']).year
procedure_df['procedure_year'].hist()

In [None]:
# Define index_date (surgery date) per patient.
procedure_df = procedure_df.rename(columns={'procedure_date':'index_date'})
# De-duplicate: several procedure rows for the same patient on the same date would
# otherwise multiply every imaging row in the later join on person_id.
# NOTE(review): a patient with surgeries on different dates still has several index
# dates — decide whether only the earliest should be kept.
index_dates = procedure_df[['person_id', 'index_date']].drop_duplicates().reset_index(drop=True)

#### 2) Chest CT 영상 검사 조건 확인

In [None]:
# Chest CT only: modality must be CT and the anatomic site CHEST.
imaging_1c = image_occurrence_updated['modality_source_value']=='CT'
imaging_2c = image_occurrence_updated['anatomic_site_source_value']=='CHEST'
# Explicit .copy(): the filtered frame is handed to date_form for date conversion;
# working on an independent copy avoids SettingWithCopyWarning and keeps
# image_occurrence_updated untouched.
image_occurrence_df = image_occurrence_updated[imaging_1c & imaging_2c].copy()
# NOTE(review): date_form's return value is discarded, so it presumably converts the
# column in place — confirm in datetime_format.py.
date_form(image_occurrence_df, 'image_occurrence_date')
print(f"image_occurrence_df: {image_occurrence_df.shape[0]} rows of {image_occurrence_df['person_id'].nunique()} patients")

#### 3) Join image_occurrence data with index_date (30 days before index_date)

In [None]:
# Pair every chest-CT image occurrence with the index date(s) of the same patient;
# the inner join drops patients without an index date.
imaging_before_30d = pd.merge(
    image_occurrence_df,
    index_dates,
    how="inner",
    on="person_id",
)
print(f"imaging_before_30d: {imaging_before_30d.shape[0]} rows of {imaging_before_30d['person_id'].nunique()} patients")

In [None]:
# Keep imaging performed within the 30 days up to and including index_date
# (both ends of the window are inclusive).
imaging_before_30d = imaging_before_30d[
    imaging_before_30d['image_occurrence_date'].between(
        imaging_before_30d['index_date'] - pd.Timedelta(days=30),
        imaging_before_30d['index_date'],
    )
]
print(f"imaging_before_30d: {imaging_before_30d.shape[0]} rows of {imaging_before_30d['person_id'].nunique()} patients")

#### 4) Check

In [None]:
# Number of selected rows (series) per study folder.
imaging_before_30d['procedure_unique_id'].value_counts()

In [None]:
# Series folders of two studies picked from the cohort.
# NOTE(review): 'study_1' / 'study_2' look like placeholder ids — replace with real
# procedure_unique_id values from the previous cell's output.
paths_study_1 = imaging_before_30d[imaging_before_30d['procedure_unique_id']=='study_1']['local_path'].unique()
paths_study_2 = imaging_before_30d[imaging_before_30d['procedure_unique_id']=='study_2']['local_path'].unique()
print(len(paths_study_1), len(paths_study_2))

In [None]:
# DICOM attributes passed to the viewer for each series in the cells below.
check_attributes = ['SeriesDescription', 'ProtocolName', 'ScanOptions', 'PatientAge', 'PatientSex', 'SliceThickness', 'KVP']

In [None]:
# paths_study_1: for each series folder, print its position and view it with
# max_files = 5, followed by a separator line.
for i, path in enumerate(paths_study_1):
    print(i)
    check_dicom_in_series(path, check_attributes, max_files = 5)
    print('-----------'*9)

In [None]:
# paths_study_2: for each series folder, print its position and view it with
# max_files = 5, followed by a separator line.
for i, path in enumerate(paths_study_2):
    print(i)
    check_dicom_in_series(path, check_attributes, max_files = 5)
    print('-----------'*9)