In [9]:
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
import datasets
from datasets import Dataset
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

In [2]:
# Build the classifier from the "facebook/bart-large-mnli" checkpoint on the
# PyTorch backend and place it on device index 0.
# Pipeline reference:
# https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline
pipe = pipeline(model="facebook/bart-large-mnli", framework="pt", device=0)

# Stop right here if the pipeline did not end up on a CUDA device.
assert pipe.device.type == "cuda"

In [3]:
# Quick sanity check on a single sentence: with multi_label=True every
# candidate label is returned with its own score.
pipe(
    sequences="I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=[
        "urgent",
        "not urgent",
        "phone",
        "tablet",
        "computer",
    ],
    multi_label=True,
)

{'sequence': 'I have a problem with my iphone that needs to be resolved asap!',
 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
 'scores': [0.9987171292304993,
  0.9945850968360901,
  0.18989984691143036,
  0.000767416029702872,
  0.0003826073370873928]}

In [4]:
from datasets import load_dataset

# Load the CSV through the generic "csv" builder, then show the DatasetDict
# summary followed by the first ten rows of its "train" split.
dataset = load_dataset("csv", data_files="/root/data/chexbert_results.csv")
print(dataset, dataset["train"][:10], sep="\n")

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-94444637865139fd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture', 'Support Devices', 'No Finding', 'Report Impression'],
        num_rows: 146149
    })
})
{'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'Enlarged Cardiomediastinum': [None, None, None, None, None, None, None, None, None, None], 'Cardiomegaly': [None, None, None, None, None, None, None, None, None, None], 'Lung Opacity': [1.0, None, None, None, None, None, None, None, None, None], 'Lung Lesion': [None, None, 1.0, None, None, None, None, 1.0, None, -1.0], 'Edema': [1.0, None, None, None, None, None, None, None, None, None], 'Consolidation': [0.0, None, None, 1.0, None, None, None, None, 0.0, -1.0], 'Pneumonia': [None, None, None, -1.0, None, -1.0, None, None, None, None], 'Atelectasis': [None, None, None, -1.0, None, None, None, None, No

In [5]:
# Candidate labels handed to the zero-shot classifier. Each string is passed to
# the model verbatim, so spelling and stray whitespace matter.
# Earlier, shorter list kept for reference:
# labels = ["Fracture", "Edema", "Cardiomegaly", "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion"]
# TODO: use all labels
labels = [
    "Pneumonia",  # XR CHEST and CT CHEST
    "Pneumothorax",  # XR CHEST and CT CHEST
    "Pleural Effusion",  # XR CHEST and CT CHEST
    "Edema",  # "Pulmonary edema", "Cerebral edema"; XR CHEST and CT CHEST
    "Fracture",  # "Rib fracture", "Skull fracture"; XR CHEST and CT CHEST
    "Infection",
    "Aspiration",
    "Cardiomegaly",
    "Opacities",
    "Atelectasis",
    "Intracranial hemorrhage",
    "Subarachnoid hemorrhage",
    "Subdural hemorrhage",
    "Epidural hemorrhage",
    "Intraparenchymal hemorrhage",
    "Intraventricular hemorrhage",
    "Stroke",
    "Diffuse axonal injury",
    "Appendicitis",  # fixed: previously had a trailing space
    "Cholecystitis",
    "Abdominal Aortic Aneurysm",
    "Small bowel obstruction",
    "Pancreatitis",
    "Splenic laceration",
    "Liver laceration",
    "Colitis",
    "Pyelonephritis",
    "Nephrolithiasis",
    "Malignancy",
    "Pericardial effusion",  # fixed: previously misspelled "Pericaridial"
    "Aortic dissection",
]
print(len(labels))

31


In [6]:
# Run the zero-shot classifier over every "Report Impression" and write
# periodic checkpoints so a crash or interrupt does not lose finished work.
CHECKPOINT_EVERY = 1000  # save whenever the row index is a multiple of this
CHECKPOINT_DIR = "/root/data/zero_shot_predictions"

# For a quick smoke test, classify only a few rows instead:
# key_dataset = KeyDataset(dataset["train"].select(range(10)), "Report Impression")
key_dataset = KeyDataset(dataset["train"], "Report Impression")

pred_all = pipe(
    sequences=key_dataset,
    candidate_labels=labels,
    multi_label=True,
    batch_size=16,
)


def _save_checkpoint(rows, index):
    """Write `rows` (a list of prediction dicts) to CHECKPOINT_DIR/checkpoint_<index>."""
    print(f"saving partial results {index}")
    Dataset.from_list(rows).save_to_disk(f"{CHECKPOINT_DIR}/checkpoint_{index}")


results = []  # every prediction, kept in memory for the final save
partial_results = []  # predictions accumulated since the last checkpoint
last_index = -1
for i, out in enumerate(tqdm(pred_all, total=len(key_dataset))):
    results.append(out)
    partial_results.append(out)
    last_index = i

    # Same naming scheme as before: checkpoint_<i> holds the rows accumulated
    # up to and including index i (so checkpoint_0 contains only the first row).
    if i % CHECKPOINT_EVERY == 0:
        _save_checkpoint(partial_results, i)
        partial_results = []

# Flush the tail: rows after the last multiple of CHECKPOINT_EVERY were
# previously never written to any checkpoint.
if partial_results:
    _save_checkpoint(partial_results, last_index)
    partial_results = []

print(len(results))

  0%|          | 0/146149 [00:00<?, ?it/s]

saving partial results 0


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

saving partial results 10


Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

saving partial results 20


Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

saving partial results 30


Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
# Gather all collected outputs into one Dataset; print its summary and first row.
predictions = Dataset.from_list(results)
print(predictions, predictions[0], sep="\n")

In [None]:
# predictions[0]

In [None]:
# Persist the complete prediction set next to the partial checkpoints.
all_predictions_path = "/root/data/zero_shot_predictions/all"
predictions.save_to_disk(all_predictions_path)

In [None]:
# Reset the current CUDA device through numba.
# NOTE(review): presumably meant to release GPU memory held by `pipe` once
# inference is finished — confirm.
# NOTE(review): `pipe` may no longer be usable on the GPU after this without
# being rebuilt — verify before re-running earlier cells.
from numba import cuda
device = cuda.get_current_device()
device.reset()

In [11]:
# Reload one partial checkpoint from disk and display its summary.
# NOTE(review): "checkpoint_10" matches the logged run that saved every 10
# items; the inference loop as written saves every 1000, so a fresh run would
# not create this directory — confirm the path before Restart & Run All.
d = datasets.load_from_disk("/root/data/zero_shot_predictions/checkpoint_10")
d

Dataset({
    features: ['sequence', 'labels', 'scores'],
    num_rows: 10
})

In [12]:
# Peek at the first two rows of the reloaded checkpoint.
d[0:2]

{'sequence': ['1.  No acute intracranial abnormality. 2.  Mild right forehead scalp fat stranding, correlate with exam for possible mild soft tissue contusion versus cutaneous lesion. No fracture of the skull or imaged facial bones.',
  '1.  Multiple loops of dilated small bowel with fecalized material, similar to the prior CT. Multiple regions of matted small bowel in the pelvis and relative caliber change. Findings are compatible with malignant partial small bowel obstruction related to peritoneal/serosal implants from known metastatic ovarian cancer. 2.  Diffuse metastatic disease with malignant ascites, peritoneal carcinomatosis, pleural metastases, and lymphadenopathy. While the volume of ascites has decreased, the peritoneal soft tissue nodules and pleural/pulmonary nodular opacities at the lung bases have mildly progressed when compared to the prior CT. 3.  Slightly decreased size of the cystic and solid pelvic masses at the bilateral adnexa. 4.  Mild hydroureteronephrosis.'],
 