In [1]:
import json
import numpy as np
import pandas as pd
import os
import re
import glob
import shutil

def search_object(sentence,organ):
    """Check whether one structured observation mentions a given organ.

    Args:
        sentence: Mapping describing an imaging observation. The keys read
            here are 'tokens', 'modifiers', 'certainty_score' and 'type';
            each modifier is a mapping with 'type' and 'tokens'.
        organ: Token to look for. It must equal one whole token (list
            membership); a substring of a longer token does not match.

    Returns:
        A 5-tuple ``(all_tokens, io_tokens, ae_tokens, certainty, type)``:
        the concatenated anatomical-entity + observation tokens, the
        concatenated observation tokens, the concatenated anatomical-entity
        tokens, the observation's certainty score and the observation's
        type. When ``organ`` is absent, all five entries are ``None``.
    """
    # Tokens of every modifier tagged as an anatomical entity, in order.
    anatomical_tokens = [
        token
        for modifier in sentence['modifiers']
        if modifier['type'] == 'Anatomical_entity'
        for token in modifier['tokens']
    ]
    observation_tokens = sentence['tokens']
    combined_tokens = anatomical_tokens + observation_tokens

    if organ not in combined_tokens:
        return None,None,None,None,None

    return (
        ''.join(combined_tokens),
        ''.join(observation_tokens),
        ''.join(anatomical_tokens),
        sentence['certainty_score'],
        sentence['type'],
    )

def search_clinicalfinding(modifiers):
    """Return the first modifier tagged 'Clinical_finding', if any.

    Args:
        modifiers: Sequence of modifier mappings, each with a 'type' key;
            clinical-finding modifiers must also carry 'certainty_score'
            and 'tokens'.

    Returns:
        ``(certainty_score, text)`` of the first clinical-finding modifier,
        where ``text`` is its tokens concatenated, or
        ``(None, 'not_found')`` when no modifier has that type.
    """
    first_finding = next(
        (entry for entry in modifiers if entry['type'] == 'Clinical_finding'),
        None,
    )
    if first_finding is None:
        return None,'not_found'
    return first_finding['certainty_score'],''.join(first_finding['tokens'])

def to_multi_label(df,abnormal_list):
    """Expand the 'label' column into one 0/1 indicator column per label.

    ``df`` is modified in place and also returned.

    Args:
        df: DataFrame with a 'label' column. Each cell is tested with
            ``label in cell``, so a list cell is matched by membership and
            a string cell by substring.
        abnormal_list: Label names to turn into indicator columns. A
            'nofinding' entry is tolerated but ignored here: that column is
            always derived from the other indicators.

    Returns:
        ``df`` with one int column per label in ``abnormal_list`` (except
        'nofinding') plus a boolean 'nofinding' column that is True exactly
        when none of those indicator columns is set.
    """
    # 'nofinding' must not count towards its own "no abnormal label" sum;
    # otherwise a row labelled only 'nofinding' would come out as False.
    finding_columns = [label for label in abnormal_list if label != 'nofinding']
    for abnormal_label in finding_columns:
        df[abnormal_label] = df['label'].apply(lambda x: abnormal_label in x).astype(int)
    df['nofinding'] = df[finding_columns].sum(axis=1) == 0
    return df

def rename_files(file_series):
    """Zero-pad the numeric fields of underscore-separated file names.

    For every name in ``file_series`` the third '_'-separated field is
    left-padded with zeros to width 4. If the fourth field looks like
    ``<digits><lowercase letters><.extension>``, it is replaced by the
    digits padded to width 3 followed by the extension (the letters are
    dropped); otherwise it is left as is.

    Args:
        file_series: pandas Series of file-name strings with at least four
            '_'-separated fields.

    Returns:
        A Series of the rewritten names.
    """
    def _pad_fields(name):
        fields = name.split('_')
        fields[2] = fields[2].zfill(4)
        found = re.match(r'(\d+)([a-z]*)(\..+)', fields[3])
        if found is not None:
            # group(2), the letter suffix, is intentionally discarded.
            fields[3] = found.group(1).zfill(3) + found.group(3)
        return '_'.join(fields)

    return file_series.apply(_pad_fields)

# Placeholder locations: `path` is the CSV loaded into `df` just below;
# `category_path` is the CSV read later for its 'category' and 'keyword' columns.
path = 'path/to/the/file'
category_path = 'path/to/the/category/file'
df = pd.read_csv(path)
# File names are zipped with the '所見_JSON' column in the labelling loop.
file_names = df['file_name']



In [13]:
# Create the labels for the whole dataset: one record per observation that
# mentions one of the target organs.
organs = ['肝']
os.makedirs('../output',exist_ok=True)

change_word_list = []
clinical_word_list = []
data = []
counter = 0  # number of matching observations collected
for file_name,json_load in zip(file_names,df['所見_JSON'].values):
    # pd.read_csv yields this column as raw text, so decode it into the list
    # of observation dicts that search_object expects. Values that are
    # already decoded are passed through untouched.
    # NOTE(review): assumes the text is valid JSON, as the column name suggests.
    if isinstance(json_load,str):
        json_load = json.loads(json_load)

    for organ in organs:
        for sentence in json_load:
            all_keyword,io_keyword,ae_keyword,certainty,img_type = search_object(sentence,organ)
            if all_keyword is None:
                continue
            # Only look up the clinical finding for observations we keep.
            clinical_scale,clinical_word = search_clinicalfinding(sentence['modifiers'])
            counter+=1
            data.append([file_name,organ,all_keyword,io_keyword,ae_keyword,certainty,img_type,clinical_word,clinical_scale])

data_liver = pd.DataFrame(data,columns=['file_name','organ','all_tokens','io_tokens','ae_tokens','obs_certainty','obs_name','clin_findings','clin_certainty'])
# Keep only the base name of each file path.
data_liver['file_name'] = data_liver['file_name'].apply(lambda x:x.split('/')[-1])

In [18]:
# Observation strings seen at least once with certainty 0, most frequent first.
certainty_zero_rows = data_liver['obs_certainty'] == 0
normal_obs_finding_list = data_liver.loc[certainty_zero_rows,'io_tokens'].value_counts().index.tolist()

# Keyword catalogue: 'keyword' is the observation string and 'category' is
# used as a position in abnormal_obs_finding_list_renamed.
category_table = pd.read_csv(category_path)
abnormal_obs_finding_id = category_table['category'].to_list()
abnormal_obs_finding_list = category_table['keyword'].to_list()

# Label names, indexed by category id.
abnormal_obs_finding_list_renamed = [
    '嚢胞',
    '脂肪肝',
    '胆管拡張',
    'SOL',
    '変形',
    '石灰化',
    'pneumobilia',
    'other_abnormality',
    'nofinding',
]

In [20]:
# A row is a normal example when its observation is reported with certainty 0
# and an abnormal example when a non-zero-certainty observation matches a
# catalogued keyword. Series.isin does the membership test in one vectorised
# pass instead of scanning the Python list once per row.
use_normalfindings = data_liver['io_tokens'].isin(normal_obs_finding_list) & (data_liver['obs_certainty'] == 0)
use_abnormalfindings = data_liver['io_tokens'].isin(abnormal_obs_finding_list) & (data_liver['obs_certainty'] != 0)

# keyword -> category id, built once. setdefault keeps the first occurrence
# of a repeated keyword, which is what list.index() resolved to.
keyword_to_category_id = {}
for keyword,category_id in zip(abnormal_obs_finding_list,abnormal_obs_finding_id):
    keyword_to_category_id.setdefault(keyword,category_id)

data_liver['use_training'] = (use_normalfindings|use_abnormalfindings)
data_liver['label'] = 'not used'
data_liver.loc[use_abnormalfindings,'label'] = data_liver.loc[use_abnormalfindings,'io_tokens'].map(
    lambda keyword: abnormal_obs_finding_list_renamed[keyword_to_category_id[keyword]]
)
data_liver.loc[use_normalfindings,'label'] = 'nofinding'

# Gather every row label of a file into one list and attach that list to
# each of the file's rows, replacing the per-row label.
labels_per_file = data_liver.groupby('file_name')['label'].apply(lambda labels: labels.tolist())
data_liver = data_liver.drop('label',axis=1).merge(labels_per_file,how='left',on='file_name')


In [None]:
# Turn the per-file label lists into indicator columns.
data_liver = to_multi_label(data_liver,abnormal_obs_finding_list_renamed)

# Collapse to one row per file (the first row of each file is kept).
data_liver = data_liver.drop_duplicates(subset='file_name')
print(data_liver.shape)

# NOTE(review): 'use_training' was computed per observation row, so after the
# de-duplication above a file is kept only if its *first* row was usable,
# while its labels were aggregated over all rows. Confirm this is intended.
data_liver_use = data_liver[data_liver['use_training']].reset_index(drop=True)
data_liver_use['file'] = data_liver_use['file_name']
# Replace the last '_'-separated field of the name with '0000.nii.gz'.
data_liver_use['file_name'] = [
    '_'.join(name.split('_')[:-1])+'_0000.nii.gz'
    for name in data_liver_use['file_name']
]
print(data_liver_use.shape)

# Join with the original findings table.
data_liver_use = pd.merge(data_liver_use,df,on='file_name',how='left').drop_duplicates(subset='file_name')
print(data_liver_use.shape)

# Number of files carrying each label.
data_liver_use[abnormal_obs_finding_list_renamed].sum()

In [14]:
# Display the label indicator columns of the per-file training table.
# NOTE(review): the recorded output under this cell has no 'nofinding' column
# although the selected column list includes it, and the execution count is
# lower than that of the preceding cells - the output looks stale; re-run to refresh.
data_liver_use[abnormal_obs_finding_list_renamed]

Unnamed: 0,嚢胞,脂肪肝,胆管拡張,SOL,変形,石灰化,pneumobilia,other_abnormality
0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
180576,1,0,0,0,0,0,0,0
180577,1,0,0,0,0,0,0,0
180578,0,0,0,0,0,0,0,0
180579,1,0,0,0,0,0,0,0
