In [8]:
from data import all_identifiers, all_features, all_files
from config import config
import pandas
from data import data
import os

In [9]:
# Root of the liver-ultrasound data; DATA_DIR must be set in the environment
# (os.environ[...] raises KeyError when it is missing).
DIRECTORY = os.path.join(os.environ["DATA_DIR"], "liver-ultrasound")
# The two dataset variants used below, each a subdirectory of DIRECTORY.
FREE, UNCERTAIN = [
    os.path.join(DIRECTORY, variant) for variant in ("free", "c3-c4-free")
]

def train_dir(base):
    """Return the path of the ``train`` subdirectory inside ``base``."""
    subdirectory = "train"
    return os.path.join(base, subdirectory)
def validation_dir(base):
    """Return the path of the ``validation`` subdirectory inside ``base``."""
    subdirectory = "validation"
    return os.path.join(base, subdirectory)
def test_dir(base): 
    return os.path.join(base, "test")

In [15]:
# Load the three splits of the FREE dataset.
# NOTE(review): shuffle_train=False presumably keeps the training file order
# stable so the filenames can be listed later - confirm in data().
train, validation, test = data(
    shuffle_train=False,
    train_dir=train_dir(FREE),
    validation_dir=validation_dir(FREE),
    test_dir=test_dir(FREE),
)

Found 660 images belonging to 2 classes.
Found 660 images belonging to 2 classes.
Found 172 images belonging to 2 classes.
Found 79 images belonging to 2 classes.


In [16]:
# Load the three splits of the UNCERTAIN ("c3-c4-free") dataset with the same
# settings as the main dataset.
uncertain_train, uncertain_validation, uncertain_test = data(
    shuffle_train=False,
    train_dir=train_dir(UNCERTAIN),
    validation_dir=validation_dir(UNCERTAIN),
    test_dir=test_dir(UNCERTAIN),
)

Found 314 images belonging to 2 classes.
Found 314 images belonging to 2 classes.
Found 80 images belonging to 2 classes.
Found 82 images belonging to 2 classes.


In [17]:
def identifier_from_filenames(filenames):
    """Extract the identifier from each ``<directory>/<name>`` file path.

    For every entry the second "/"-separated component is taken, every
    occurrence of "free-" and ".jpeg" is removed from it, and whatever
    precedes the first "-" of the remainder is kept.

    Returns a list with one identifier string per input entry, in input
    order. Raises IndexError for an entry that contains no "/".
    """
    identifiers = []
    for filename in filenames:
        name = filename.split("/")[1]
        name = name.replace("free-", "")
        name = name.replace(".jpeg", "")
        identifiers.append(name.split("-")[0])
    return identifiers

In [21]:
# Look up the raw "free" files and the per-identifier features.
# NOTE(review): files/evidence/malignant/diagnosis/code are indexed by identifier
# below, so they are presumably dict-like mappings - confirm in the data module.
files = all_identifiers(all_files("free", config.RAW_DIR))
evidence, malignant, diagnosis, code = all_features(features = config.FEATURES, fieldnames=["a", "b", "c", "d"])

# Report only identifiers that occur in one of the three loaded splits.
# sorted() makes the row order of df reproducible across kernel restarts;
# iterating a bare set of strings has no stable order.
identifiers = sorted(set(
    identifier_from_filenames(train.filenames)
    + identifier_from_filenames(validation.filenames)
    + identifier_from_filenames(test.filenames)
))

identifier_list = list()

for i in identifiers:
    try:
        identifier_list.append({
            "identifier": i,
            "files": len(files[i]),
            "diagnosis": diagnosis[i],
            "evidence": evidence[i],
            "malignant": malignant[i],
            "code": code[i],
        })
    except KeyError:
        # The identifier is missing from one of the lookups: print it and leave
        # it out of the table. Any other exception is a real error and propagates.
        print(i)

# One row per identifier, indexed by the identifier.
df = pandas.DataFrame(identifier_list).set_index("identifier")

In [23]:
# Show only the rows whose code is C6.
df.loc[df["code"] == "C6"]

Unnamed: 0_level_0,files,diagnosis,evidence,malignant,code
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
27896604,1,ccc,biopsy,malignant,C6
26987467,1,metastases,biopsy,malignant,C6
26360243,1,metastases,biopsy,malignant,C6
26799352,2,ccc,biopsy,malignant,C6
28376394,3,metastases,biopsy,malignant,C6
31150775,5,metastases,biopsy,malignant,C6
30077448,2,metastases,biopsy,malignant,C6
28415489,3,metastases,biopsy,malignant,C6
7802051,4,metastases,biopsy,malignant,C6
26797566,2,metastases,biopsy,malignant,C6


In [22]:
# Manuscript paragraph templates; the placeholders are filled from df and the
# loaded splits further down in this cell.
TEXT = """
Among the {patients} patients who met the inclusion criteria, there were {images} images of individual lesions, of which {malignant} were malignant and {benign} were benign based on MRI or histopathology. The diagnosis of benign versus malignant was established by histopathology in {biopsy} patients and MRI in {mri} patients [@strauss2015diagnosis; @anderson2009benign; @qian2016mri; @semelka2001focal; @albiin2012mri; @fowler2011magnetic; @itai1985noninvasive]. Every patient with benign or malignant lesion definitely diagnosed on MRI has typical imaging features of a benign or malignant solid liver lesion, as interpreted in the original radiology report and subsequently reviewed and confirmed by a radiologist (JW). Malignant lesions were diagnosed in MRI based on clearly defined criteria as described in the guidelines. Benign lesions had a reasonable follow-up period to ensure that they were benign.

There were {c2} images in Code Abdomen liver category 2, {c3} in category 3, {c4} in category 4, {c5} in category 5. C2 and C3 lesions were more likely to have been confirmed by MRI, while C4 and C5 lesions were more likely to have been confirmed by biopsy (Table 1). The complete set was divided into a training set of {train} lesions with {train_augmented} augmented images, validation set of {validation} lesions, and a test set of {test} lesions. The detailed clinical characteristics of the patient cohort is shown in Supplementary Table S1. The uncertain diagnosis set was divided by patient into a training set of {uncertain_train} lesions with {uncertain_train_augmented} augmented images, a validation set of {uncertain_validation} lesions, and a test set of {uncertain_test} lesions.

"""

ABSTRACT_TEXT = """Among the {patients} patients who met the inclusion criteria, there were {images} images of individual liver lesions, of which {malignant} were malignant and {benign} were benign. Our training set contained {train} lesions augmented dynamically during training for a total of {train_augmented} images; our test set contained {test} images."""

# Multiplier used for every "augmented images" figure in the text.
# NOTE(review): 500 is taken as-is from the original cell - confirm it matches
# the augmentation actually applied during training.
AUGMENTATION_FACTOR = 500

# Both templates say "{images} images ... of which {malignant} were malignant",
# so both must be filled with image counts (sum of files), not patient counts.
total_images = df.files.sum()
malignant_images = df[df.malignant=="malignant"].files.sum()
benign_images = df[df.malignant=="benign"].files.sum()

print(TEXT.format(
    patients=len(df),
    images=total_images,
    malignant=malignant_images,
    benign=benign_images,
    biopsy=len(df[df.evidence=="biopsy"]),
    # Every row whose evidence is not "biopsy" is counted as MRI.
    mri=len(df[df.evidence!="biopsy"]),
    c2=df[df.code=='C2'].files.sum(),
    c3=df[df.code=='C3'].files.sum(),
    c4=df[df.code=='C4'].files.sum(),
    c5=df[df.code=='C5'].files.sum(),
    train=train.n,
    train_augmented=train.n*AUGMENTATION_FACTOR,
    validation=validation.n,
    test=test.n,
    uncertain_train=uncertain_train.n,
    uncertain_train_augmented=uncertain_train.n*AUGMENTATION_FACTOR,
    uncertain_validation=uncertain_validation.n,
    uncertain_test=uncertain_test.n,
))

print(ABSTRACT_TEXT.format(
    patients=len(df),
    images=total_images,
    malignant=malignant_images,
    benign=benign_images,
    train=train.n,
    train_augmented=train.n*AUGMENTATION_FACTOR,
    test=test.n,
))



Among the 596 patients who met the inclusion criteria, there were 911 images of individual lesions, of which 535 were malignant and 376 were benign based on MRI or histopathology. The diagnosis of benign versus malignant was established by histopathology in 265 patients and MRI in 331 patients [@strauss2015diagnosis; @anderson2009benign; @qian2016mri; @semelka2001focal; @albiin2012mri; @fowler2011magnetic; @itai1985noninvasive]. Every patient with benign or malignant lesion definitely diagnosed on MRI has typical imaging features of a benign or malignant solid liver lesion, as interpreted in the original radiology report and subsequently reviewed and confirmed by a radiologist (JW). Malignant lesions were diagnosed in MRI based on clearly defined criteria as described in the guidelines. Benign lesions had a reasonable follow-up period to ensure that they were benign.

There were 158 images in Code Abdomen liver category 2, 238 in category 3, 217 in category 4, 256 in category 5. C2 an

In [20]:
# Display the per-identifier summary DataFrame.
df

Unnamed: 0_level_0,files,diagnosis,evidence,malignant,code
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14194543,2,Inflammatory myofibroblastic tumor,biopsy,benign,C3
14194543,2,Inflammatory myofibroblastic tumor,biopsy,benign,C3
14499072,1,hemangioma,mri,benign,C2
15154878,1,hemangioma,mri,benign,C2
15265143,1,hemangioma,mri,benign,C2
...,...,...,...,...,...
28357978,4,hcc,biopsy,malignant,C3
28357978,4,hcc,biopsy,malignant,C3
28758395,1,hcc,biopsy,malignant,C4
30134176,1,hcc,biopsy,malignant,C4


In [31]:
# Table 1 template: one row per code with benign:malignant and biopsy:MRI
# counts of df rows, followed by the row total for that code.
table_1 = """C2 | Benign | {c2_benign}:{c2_malignant} | {c2_biopsy}:{c2_mri} | {c2}
C3 | Indeterminate  | {c3_benign}:{c3_malignant} | {c3_biopsy}:{c3_mri}| {c3}
C4 | Suspicious | {c4_benign}:{c4_malignant} | {c4_biopsy}:{c4_mri} | {c4}
C5 | Highly suspicious | {c5_benign}:{c5_malignant} | {c5_biopsy}:{c5_mri} | {c5}
C6 | Known cancer | {c6_benign}:{c6_malignant} | {c6_biopsy}:{c6_mri} | {c6}
"""

# Build the placeholder values in one loop over the codes. The keys are
# "<code>", "<code>_benign", "<code>_malignant", "<code>_biopsy" and
# "<code>_mri", with the code in lower case to match the template.
table_1_counts = {}
for code_label in ("C2", "C3", "C4", "C5", "C6"):
    prefix = code_label.lower()
    code_rows = df[df.code == code_label]
    table_1_counts[prefix] = len(code_rows)
    table_1_counts[prefix + "_benign"] = len(code_rows[code_rows.malignant == "benign"])
    table_1_counts[prefix + "_malignant"] = len(code_rows[code_rows.malignant == "malignant"])
    table_1_counts[prefix + "_biopsy"] = len(code_rows[code_rows.evidence == "biopsy"])
    # Every row whose evidence is not "biopsy" is counted in the MRI column.
    table_1_counts[prefix + "_mri"] = len(code_rows[code_rows.evidence != "biopsy"])

print(table_1.format(**table_1_counts))



C2 | Benign | 156:1 | 4:153 | 157
C3 | Indeterminate  | 127:28 | 38:117| 155
C4 | Suspicious | 14:116 | 107:23 | 130
C5 | Highly suspicious | 2:136 | 100:38 | 138
C6 | Known cancer | 0:15 | 15:0 | 15



In [33]:
def printf(f):
    """Print the second "/"-separated component of ``f`` with every
    ".jpeg" and "free-" occurrence removed.

    Raises IndexError when ``f`` contains no "/".
    """
    name = f.split("/")[1]
    for fragment in (".jpeg", "free-"):
        name = name.replace(fragment, "")
    print(name)

In [34]:
# List the image names of each main-dataset split under its own header.
for header, split in (
    ("# TRAIN", train),
    ("# VALIDATION", validation),
    ("# TEST", test),
):
    print(header)
    for f in split.filenames:
        printf(f)

# TRAIN
14194543-1
14194543-2
14499072
15154878
15265143
15427145
15447071
15478991
15549574
15573577
15579240
15588939
15626731
15634000
15694564-2
15694564
15705324
15705343-1
15705343-2
15709568
15714472-1
15714472-2
15764983
15779715
15889706
15899132
15946996
15987441
16013023
26002248
26025036
26051811-1
26051811
26124887
26126607
26184437
26211237
26211275
26213342-1
26213342-2
26231700-1
26231700-2
26252663-1
26255625
26339265
26356026
26373027
26391888
26401332
26420142
26420409
26424318
26438501-1
26438501-2
26438501-3
26438501-4
26438501-5
26438501-6
26453970
26630992
26631175
26765562
26788792
26827945
26864409
26935222
26998610
27058428
27068562
27073745-2
27073745
27115175-1
27115175-2
27115175-3
27115175-4
27115175
27164560
27177540-2
27177540-3
27177540-4
27177540-5
27177540
27182550
27217389-2
27217389
27217906
27238696
27247600
27257835
27265310
27279444
27299036
27340452
27369141-2
27369141-3
27369141
27371526
27382661-1
27382661-2
27403964-2
27403964
27418203
274634

In [35]:
# List the image names of each uncertain-dataset split under its own header.
for header, split in (
    ("# TRAIN", uncertain_train),
    ("# VALIDATION", uncertain_validation),
    ("# TEST", uncertain_test),
):
    print(header)
    for f in split.filenames:
        printf(f)

# TRAIN
14194543-1
14194543-2
15291498
15478991
15551563-2
15551563
15588939
15648344
15754818-1
15889706
26023246
26025036
26051811-1
26051811
26087647-1
26114650
26124887
26211237
26224129-1
26252663-1
26339265
26394881-2
26394881
26438501-1
26438501-2
26438501-3
26438501-4
26438501-5
26438501-6
26453970
26616867
26765562
26805873
26827945
26864409
26887085
26894020
26914423
27068562
27163050
27177540-2
27177540-3
27177540-4
27177540-5
27177540
27217389-2
27217389
27247600
27257835
27279444
27369141-2
27369141-3
27369141
27371526
27382661-1
27382661-2
27403964-2
27403964
27459339
27463469
27463705
27544156
27545537
27550808
27553542
27577353
27600935-2
27600935-3
27600935
27624711
27625116-1
27625116-2
27662039-2
27662039-3
27662039
27677681
27741572
27811439
27835860
27848627-2
27848627
27932241-2
27932241
27952285-2
27952285
28058797-2
28058797
28100188-2
28100188
28119827
28129989-2
28129989-3
28129989
28142611
28144446
28164768
28188413
28235562-2
28235562-3
28235562-4
28235562
2