# Generation of Annotations with BERT

In [1]:
import pandas as pd
from simpletransformers.classification import ClassificationModel, MultiLabelClassificationModel

In [5]:
# Keyword arguments handed, via `args=`, to every simpletransformers model
# constructed in this notebook. Insertion order matches the original literal.
args = dict(
    output_dir="outputs/",
    cache_dir="cache_dir/",
    fp16=False,
    no_cache=True,
    fp16_opt_level="O1",
    max_seq_length=512,
    train_batch_size=8,
    gradient_accumulation_steps=10,
    eval_batch_size=12,
    num_train_epochs=10,
    weight_decay=0,
    learning_rate=4e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0.06,
    warmup_steps=0,
    max_grad_norm=1.0,
    logging_steps=50,
    save_steps=2000,
    evaluate_during_training=True,
    overwrite_output_dir=True,
    reprocess_input_data=True,
    n_gpu=2,
    use_multiprocessing=True,
    silent=False,
    threshold=0.5,
    # for long texts
    sliding_window=True,
    tie_value=1,
)

# Directory the input CSVs are read from and the annotation CSVs are written to.
DATADIR = "../data/"

In [None]:
# Load the table of texts to annotate from DATADIR; the first CSV row is the header.
data_to_annotate = pd.read_csv(DATADIR + 'to-annotate.csv', header=0)

In [None]:
# Load the classifier stored under outputs/final/radbert-binary/ with the shared `args`
# and run it on the `text` column of the loaded table.
model = ClassificationModel('bert', 'outputs/final/radbert-binary/', args=args)
# `predict` returns two values; presumably the predicted labels (`pred`) and the
# raw model outputs (`raw`) -- confirm against the simpletransformers documentation.
pred, raw =  model.predict(data_to_annotate.text)   

In [None]:
# Keep only the rows the binary model labelled 1.
# `pred` is wrapped in a Series aligned to the frame's index so the comparison is
# always element-wise: with a plain Python list, `pred == 1` would be the scalar
# False and indexing the frame with it would raise KeyError. Building the Series
# also raises ValueError if `pred` and the frame differ in length, instead of
# silently mis-selecting rows.
preselected_data = data_to_annotate[pd.Series(pred, index=data_to_annotate.index) == 1]

In [None]:
# Load the multi-label model stored under outputs/final/radbert/ and run it on the
# preselected texts. This rebinds `model`, `pred` and `raw` from the binary step.
model = MultiLabelClassificationModel('bert', 'outputs/final/radbert/', args=args)
pred, raw =  model.predict(preselected_data.text)   

In [112]:
def write_out_annotations(raw, out_name, source_data=None):
    """Average the model outputs of each text and write them, with metadata, to a CSV.

    Parameters
    ----------
    raw : sequence
        One entry per row of ``source_data``. Each entry must provide
        ``.mean(0)``; presumably an array of shape (windows, labels) coming from
        the sliding-window prediction -- TODO confirm against the model output.
    out_name : str
        File name without extension; the file is written to
        ``DATADIR + out_name + ".csv"``.
    source_data : pandas.DataFrame, optional
        Frame providing the ``NameTextFile``, ``text`` and ``DicomPath`` columns,
        row-aligned with ``raw``. Defaults to the notebook-level
        ``preselected_data``, which is the frame this function always used.

    Raises
    ------
    ValueError
        If ``raw`` and ``source_data`` do not contain the same number of rows.

    Notes
    -----
    The CSV index column now numbers the rows 0..n-1. The previous
    implementation appended one-row frames that all carried index 0, so every
    row was written with index 0.
    """
    if source_data is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        source_data = preselected_data

    if len(raw) != source_data.shape[0]:
        raise ValueError(
            "raw has %d entries but source_data has %d rows"
            % (len(raw), source_data.shape[0])
        )

    label_names = ['Stauung', 'Verschattung', 'Erguss', 'Pneumothorax',
                   'Thoraxdrainage', 'ZVK', 'Magensonde', 'Tubus',
                   'Materialfehllage']

    # Build all rows first and create the frame once; DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    averaged_rows = [outputs.mean(0) for outputs in raw]
    annotated_data = pd.DataFrame(averaged_rows)

    # Positional output columns 0..8 get their label names; as before, only the
    # columns that actually exist are renamed.
    annotated_data = annotated_data.rename(columns=dict(enumerate(label_names)))

    # Copy the metadata positionally (via list) so the index of source_data,
    # which may be non-contiguous after filtering, does not matter.
    for column in ('NameTextFile', 'text', 'DicomPath'):
        annotated_data[column] = list(source_data[column])

    annotated_data.to_csv(DATADIR + out_name + ".csv")

In [84]:
# Write the averaged outputs currently held in `raw` to DATADIR + 'radbert-annotations.csv'.
write_out_annotations(raw, 'radbert-annotations')

In [None]:
# Same steps with the model stored under outputs/final/gerbert/: load it, predict on
# the preselected texts (rebinding `model`, `pred`, `raw`) and write the averaged
# outputs to DATADIR + 'gerbert-annotations.csv'.
model = MultiLabelClassificationModel('bert', 'outputs/final/gerbert/', args=args)
pred, raw =  model.predict(preselected_data.text)   
write_out_annotations(raw, 'gerbert-annotations')

In [None]:
# Same steps with the model stored under outputs/final/multibert/: load it, predict on
# the preselected texts (rebinding `model`, `pred`, `raw`) and write the averaged
# outputs to DATADIR + 'multibert-annotations.csv'.
model = MultiLabelClassificationModel('bert', 'outputs/final/multibert/', args=args)
pred, raw =  model.predict(preselected_data.text)   
write_out_annotations(raw, 'multibert-annotations')

In [3]:
# Switch to the 'ct-to-annotate.csv' table. This rebinds `data_to_annotate`, so the
# frame loaded from 'to-annotate.csv' is no longer reachable under that name.
data_to_annotate = pd.read_csv(DATADIR + 'ct-to-annotate.csv', header=0)
# No binary filtering for this table. `preselected_data` is an alias of the same
# DataFrame object (not a copy); write_out_annotations reads it as a global.
preselected_data = data_to_annotate # no preselection
# Last expression of the cell: displays the frame.
data_to_annotate

Unnamed: 0,NameTextFile,DicomPath,Stauung,Verschattung,Erguss,Pneumothorax,Thoraxdrainage,ZVK,Magensonde,Tubus,Materialfehllage,text
0,0001000799_0312830972_12133511_22683201.txt,12133511/0000/0000,,,,,,,,,,"llung, Rechtfertigende Indikation: Pat. mit ko..."
1,0001002171_0310228483_10470316_20031561.txt,10470316/0000/0000,,,,,,,,,,"llung, Rechtfertigende Indikation: Bauchschmer..."
2,0001002268_0310321531_10540364_20147483.txt,10540364/0000/0000,,,,,,,,,,"llung, Rechtfertigende Indikation: Zustand nac..."
3,0001002595_0312428520_11864839_22251134.txt,11864839/0000/0000,,,,,,,,,,tigende Indikation: Z.n. Reanimation. Frage na...
4,0001002735_0304289475_7127835_14187329.txt,7127835/0000/0000,,,,,,,,,,Ausdehnung? Technik: HR-CT des Thorax nativ. K...
...,...,...,...,...,...,...,...,...,...,...,...,...
18657,0082690096_0305867914_7998082_15787002.txt,7998082/0000/0000,,,,,,,,,,dchen mit V.a. offene TBC. Lobärpneumonie rech...
18658,0082756286_0312340482_11801188_22144392.txt,11801188/0000/0000,,,,,,,,,,"llung, rechtfertigende Indikation: CT zur Foku..."
18659,0082756286_0312341470_11836507_22203468.txt,11836507/0000/0000,,,,,,,,,,"gestellung, rechtfertigende Indikation: Septis..."
18660,0082758356_0305348219_7750807_15344192.txt,7750807/0000/0000,,,,,,,,,,e vom 28.12.2010: Klinik: Z.n. NTx 2000. Aktue...


In [6]:
# Run the model stored under outputs/final/radbert/ on the CT texts.
# NOTE(review): earlier in this notebook the same directory is loaded with
# MultiLabelClassificationModel; here it is loaded with ClassificationModel.
# Confirm that the single-label class is intended, since write_out_annotations
# later averages and writes whatever `raw` contains.
model = ClassificationModel('bert', 'outputs/final/radbert/', args=args)
pred, raw =  model.predict(data_to_annotate.text)   

Converting to features started. Cache is not used.
sliding_window enabled


HBox(children=(FloatProgress(value=0.0, max=18662.0), HTML(value='')))


18662 features created from 18662 samples.


HBox(children=(FloatProgress(value=0.0, max=1845.0), HTML(value='')))




In [85]:
# Write the averaged outputs currently held in `raw` to DATADIR + 'radbert-ct-annotations.csv'.
# NOTE(review): the other CT outputs are named 'ct-<model>-annotations'; this one
# uses '<model>-ct-annotations' -- confirm which naming downstream code expects.
write_out_annotations(raw, 'radbert-ct-annotations')

In [None]:
# Run the model stored under outputs/final/gerbert/ on the CT texts (rebinds
# `model`, `pred`, `raw`).
# NOTE(review): earlier in this notebook the same directory is loaded with
# MultiLabelClassificationModel; confirm that ClassificationModel is intended here.
model = ClassificationModel('bert', 'outputs/final/gerbert/', args=args)
pred, raw =  model.predict(data_to_annotate.text)   

In [113]:
# Write the averaged outputs currently held in `raw` to DATADIR + 'ct-gerbert-annotations.csv'.
write_out_annotations(raw, 'ct-gerbert-annotations')

In [None]:
# Run the model stored under outputs/final/multibert/ on the CT texts. Unlike the
# cells above, the results are bound to `pred_multi` / `raw_multi`, so `pred` and
# `raw` keep their previous values.
# NOTE(review): earlier in this notebook the same directory is loaded with
# MultiLabelClassificationModel; confirm that ClassificationModel is intended here.
model = ClassificationModel('bert', 'outputs/final/multibert/', args=args)
pred_multi, raw_multi =  model.predict(data_to_annotate.text)   


In [114]:
# Write the averaged outputs held in `raw_multi` to DATADIR + 'ct-multibert-annotations.csv'.
write_out_annotations(raw_multi, 'ct-multibert-annotations')