[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1I_Yc0puEBRGgPdVd8noZuBOtEoYh5clP?usp=sharing)


# Install and import packages 



In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
!pip install sentence_transformers
!pip install datasets
!pip install transformers
!pip install scikit-learn
!pip install unzip

In [None]:
from sentence_transformers import SentenceTransformer
import datasets
from datasets import load_metric
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from tqdm.auto import tqdm

Upload the zip from this link manually, since it is too large to download with wget: "https://drive.google.com/uc?export=download&id=14nA64QrUsuUJUxB_Gatgtiobo6S9vdam"

In [None]:
#!wget -O csv.zip --no-check-certificate "https://drive.google.com/uc?export=download&id=14nA64QrUsuUJUxB_Gatgtiobo6S9vdam"


In [None]:
#Upload csv.zip and text.zip from the drop box folder CogAI/synthea_colab

!unzip csv.zip


Archive:  csv.zip
   creating: 2022_04_04T07_13_58Z/
  inflating: 2022_04_04T07_13_58Z/claims.csv  
  inflating: 2022_04_04T07_13_58Z/procedures.csv  
  inflating: 2022_04_04T07_13_58Z/allergies.csv  
  inflating: 2022_04_04T07_13_58Z/supplies.csv  
  inflating: 2022_04_04T07_13_58Z/devices.csv  
  inflating: 2022_04_04T07_13_58Z/encounters.csv  
  inflating: 2022_04_04T07_13_58Z/payers.csv  
  inflating: 2022_04_04T07_13_58Z/providers.csv  
  inflating: 2022_04_04T07_13_58Z/payer_transitions.csv  
  inflating: 2022_04_04T07_13_58Z/conditions.csv  
  inflating: 2022_04_04T07_13_58Z/claims_transactions.csv  
  inflating: 2022_04_04T07_13_58Z/patients.csv  
  inflating: 2022_04_04T07_13_58Z/organizations.csv  
  inflating: 2022_04_04T07_13_58Z/immunizations.csv  
  inflating: 2022_04_04T07_13_58Z/imaging_studies.csv  
  inflating: 2022_04_04T07_13_58Z/careplans.csv  
  inflating: 2022_04_04T07_13_58Z/medications.csv  
  inflating: 2022_04_04T07_13_58Z/observations.csv  


# Load the model and encode any text

In [None]:
# Fetch the pretrained Bio_ClinicalBERT checkpoint, write a copy under
# <working dir>/models/BioBERT, then reload the model from that saved copy.
cwd = os.getcwd()
modelPath = f"{cwd}/models/BioBERT"
bioBERT = SentenceTransformer('emilyalsentzer/Bio_ClinicalBERT')
bioBERT.save(modelPath)
# From here on the notebook works with the model reloaded from disk.
local_bioBERT = SentenceTransformer(modelPath)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/emilyalsentzer_Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Use the model that was reloaded from the local save directory.
#model = SentenceTransformer(bioBERT)
model = local_bioBERT
# Smoke test: embed one arbitrary sentence to confirm that encoding works.
example_embeddings = model.encode("we can embed anything")

# Format patient info 

## Create a dictionary of the medical-record CSVs and a list of patients

In [None]:
# Read every CSV of the extracted export into `csvs`, a dict of DataFrames
# keyed by table name (the file name without its extension). `csv_names`
# holds the same table names in load order.
csvs = {}
synthea = os.getcwd()
csv_loc = synthea+'/2022_04_04T07_13_58Z/'
csv_names = []
# Sorting makes the load order (and therefore the dict order) reproducible;
# os.listdir returns entries in arbitrary order.
for name in sorted(os.listdir(csv_loc)):
    table_name, extension = os.path.splitext(name)
    # Skip anything that is not a CSV (checkpoint folders, stray files)
    # instead of blindly chopping four characters off the name and handing
    # it to read_csv.
    if extension.lower() != '.csv':
        continue
    csv_names.append(table_name)
    csvs[table_name] = pd.read_csv(os.path.join(csv_loc, name))


In [None]:
# Collect all the unique patient IDs that appear in any record table.

info_keys = ['allergies', 'medications', 'conditions', 'careplans',
             'observations', 'procedures', 'immunizations', 'imaging_studies']

# Start from the full PATIENT column of the procedures table, then add the
# distinct IDs of every table listed in `info_keys`.
id_chunks = [np.array(csvs['procedures']['PATIENT'])]
for info_key in info_keys:
    csv = csvs[info_key]
    # A table names its patient column either PATIENTID or PATIENT.
    key = "PATIENTID" if 'PATIENTID' in csv.keys() else "PATIENT"
    uniq_ids = csv[key].unique()
    id_chunks.append(np.array(uniq_ids))

# Concatenate once and de-duplicate (np.unique also sorts the result).
ids = np.unique(np.concatenate(id_chunks, axis=0))
print('Number of patients: ', ids.shape[0])

Number of patients:  1140


Create CSVs with just codes and descriptions (labels and text)

In [None]:
# Build `lean_csvs`: for every table that has both a CODE and a DESCRIPTION
# column, keep only those two columns. Tables lacking either column are
# reported and left out of `lean_csvs`.
lean_csvs = {}
for key in csvs:
    csv = csvs[key]
    # Explicit membership test instead of a bare `except:` around
    # list.remove, which also hid unrelated errors.
    if 'CODE' in csv.columns and 'DESCRIPTION' in csv.columns:
        # Select in a fixed order (CODE first) so code that renames the
        # columns by position sees the same layout for every table.
        lean_csvs[key] = csv[['CODE', 'DESCRIPTION']].copy()
    else:
        print(key + ' was unchanged')
print(lean_csvs.keys())


claims_transactions was unchanged
payers was unchanged
organizations was unchanged
claims was unchanged
patients was unchanged
payer_transitions was unchanged
imaging_studies was unchanged
providers was unchanged
dict_keys(['procedures', 'immunizations', 'encounters', 'observations', 'allergies', 'supplies', 'careplans', 'medications', 'conditions', 'devices'])


Turn a CSV DataFrame into a Hugging Face dataset

In [None]:
# Pick the type of codes / csv name
csv_name = 'procedures'
# Hugging Face requires the target to be named 'labels'.
# Rename by column name rather than by position so the result does not depend
# on column order and the cell can be re-run safely (a second run is a no-op).
lean_csvs[csv_name] = lean_csvs[csv_name].rename(columns={'CODE': 'labels'})
mod_code_procedures = lean_csvs[csv_name].copy()
# We are translating the codes to their index in a list of the unique codes.
# This is needed to ensure the number of labels in the model is equal to the
# maximum number that can be given as a label.
# `code_translation[i]` is the original code for class index i; codes are
# listed in order of first appearance, which is deterministic.
code_translation = lean_csvs[csv_name]['labels'].drop_duplicates().tolist()
# Dict lookup replaces a list.index() call per row (linear scan each time).
code_to_index = {code: index for index, code in enumerate(code_translation)}
mod_code_procedures['labels'] = mod_code_procedures['labels'].map(code_to_index)
# Turn the dataframe into a dataset
procedures_ds = datasets.Dataset.from_pandas(mod_code_procedures)
# A fixed seed makes the 90/10 split reproducible across kernel restarts;
# the splits are read by name instead of relying on the order of .values().
split = procedures_ds.train_test_split(test_size=0.1, seed=42)
train_dataset, test_dataset = split["train"], split["test"]
#train_dataset, validation_dataset= train_dataset.train_test_split(test_size=0.1).values()
dataset = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})
# Set the number of labels to the number of unique labels
num_lab = len(code_translation)
# Displayed as a sanity check: the largest class index should be num_lab - 1.
mod_code_procedures['labels'].max()

178

# Fine Tuning

In [None]:
# Load your model locally or from a Hugging Face hub location.
# (AutoTokenizer and AutoModelForSequenceClassification are already imported
# in the import cell at the top of the notebook.)

#bioBERT = 'emilyalsentzer/Bio_ClinicalBERT'
bioBERT = modelPath
tokenizer = AutoTokenizer.from_pretrained(bioBERT)
model = AutoModelForSequenceClassification.from_pretrained(bioBERT, num_labels=num_lab)


# Tokenize the dataset and change "DESCRIPTION" to the name of your text feature if it is not "DESCRIPTION"
def tokenize_function(examples):
    """Tokenize the "DESCRIPTION" text of a batch of examples.

    The tokenizer loaded above reports no predefined maximum length, so the
    previous `padding="max_length", truncation=True` request was ignored
    (see the warning this cell used to emit). Truncation is now given an
    explicit limit taken from the model's position-embedding size. Padding is
    left out here because the DataCollatorWithPadding passed to the Trainer
    pads each batch.

    Args:
        examples: a batch from `dataset.map(..., batched=True)`; must contain
            a "DESCRIPTION" entry holding the texts.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["DESCRIPTION"],
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/models/BioBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/70 [00:00<?, ?ba/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/8 [00:00<?, ?ba/s]

In [None]:
train_dataset

Dataset({
    features: ['labels', 'DESCRIPTION'],
    num_rows: 69711
})

In [None]:
def _shuffled_subset(split, size, seed):
    """Return up to `size` examples of `split` after a seeded shuffle.

    The size is clamped to the length of the split so that a split smaller
    than `size` yields the whole split instead of an index error.
    """
    shuffled = split.shuffle(seed=seed)
    return shuffled.select(range(min(size, len(shuffled))))


# Small subsets used for quick checks and for the prediction cell.
small_train_dataset = _shuffled_subset(tokenized_datasets["train"], 1000, seed=42)
small_eval_dataset = _shuffled_subset(tokenized_datasets["test"], 1000, seed=42)
# Subsets actually handed to the Trainer.
train_dataset = _shuffled_subset(tokenized_datasets["train"], 5000, seed=1)
# Number of training examples really selected (used for the timing report).
num_examples = len(train_dataset)
eval_dataset = _shuffled_subset(tokenized_datasets["test"], 1000, seed=1)
# Check that the max label does not exceed the number of labels, for every
# subset that reaches the model - not only the small training subset.
for subset in (small_train_dataset, small_eval_dataset, train_dataset, eval_dataset):
    assert num_lab > max(subset['labels'])

In [None]:
# Remaining Trainer ingredients: the accuracy metric and the callback that
# scores an evaluation pass with it.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    `eval_pred` unpacks into (logits, labels); the predicted class of each
    example is the index of its largest logit along the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [None]:
# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Arguments passed to the Trainer: checkpoints go to "test_trainer", training
# runs for 5 epochs and evaluation happens at the end of every epoch.
# Optional knobs left at their defaults: learning_rate=2e-5,
# per_device_train_batch_size=16, per_device_eval_batch_size=16,
# weight_decay=0.01.
training_args = TrainingArguments(
    evaluation_strategy="epoch",
    num_train_epochs=5,
    output_dir="test_trainer",
)

In [None]:
from datetime import datetime

# Wall-clock timing covers both building the Trainer and the training run.
start = datetime.now()

# Build the Trainer from the pieces prepared in the previous cells and train.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()

stop = datetime.now()
# `seconds` is the elapsed time as a timedelta; `rate` is that elapsed time
# divided by the number of selected training examples.
seconds = stop - start
rate = seconds / num_examples
print("This file took: ", seconds)
print(f"At a rate of {rate} per line")

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: DESCRIPTION. If DESCRIPTION are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3125


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3238,0.340953,0.966
2,0.3005,0.147776,0.985
3,0.1495,0.09417,0.992
4,0.0589,0.070109,0.995
5,0.0499,0.061205,0.996


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: DESCRIPTION. If DESCRIPTION are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test_tr

This file took:  0:05:01.127691
At a rate of 0:00:00.060226 per line


In [None]:
import torch
torch.cuda.is_available()

True

In [None]:
# Run the trained model over the small evaluation subset. This needs the
# `trainer` from the training cell, so that cell must have been run first.
predictions = trainer.predict(small_eval_dataset)

NameError: ignored

In [None]:

# `predictions` is the output of trainer.predict: element 0 / `.predictions`
# holds the logits and element 1 / `.label_ids` holds the true labels.
# Passing element 1 as the predictions compared the labels with themselves
# and always reported perfect accuracy, so take the argmax of the logits.
predicted_classes = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=predicted_classes, references=small_eval_dataset['labels'])

In [None]:
print(model)

# Load Other Data

In [None]:
!wget -O text.zip --no-check-certificate "https://drive.google.com/uc?export=download&id=1_B6loMS--piFuaffoAVDMYEgEPVgCupZ"
!unzip text.zip

Next we will build a list containing one dictionary per patient (`patients_info`). Each dictionary contains keys for the patient name, each record CSV, and `info`. `info` contains the full text of all the record CSVs for this patient.

`gt_patients_info` is a list where each index corresponds to the same index in `patients_info` and contains the true insurance codes corresponding to that patient.

Lastly, `gt_dict` is a dictionary containing a key for each patient found in `ids`, whose value is all the insurance codes associated with that patient.



In [None]:
# Index every record table by patient once, up front. The previous version
# re-ran groupby twice per patient per table and relied on a bare
# `except: pass` to skip missing patients and unusable tables, which also hid
# any unrelated error.
records_by_patient = {}
for info_key in info_keys:
    csv = csvs[info_key]
    if 'PATIENTID' in csv.keys():
        key = "PATIENTID"
    else:
        key = "PATIENT"
    # Tables without a patient column or without DESCRIPTION/CODE contribute
    # nothing (they used to fail inside the try block and be skipped).
    if not {key, 'DESCRIPTION', 'CODE'}.issubset(csv.columns):
        continue
    records_by_patient[info_key] = {
        patient_id: (np.array(rows['DESCRIPTION']), np.array(rows['CODE']))
        for patient_id, rows in csv.groupby(key)
    }

# patients_info: one dict per patient with 'name', one entry per record table
#   in which the patient appears (array of descriptions), and 'info' (all of
#   those descriptions concatenated in `info_keys` order).
# gt_patients_info: the codes for each patient, index-aligned with patients_info.
# gt_dict: the same codes keyed by patient ID.
patients_info = []
gt_patients_info = []
gt_dict = {}
for patient in ids:
    patient_dict = {}
    patient_dict['name'] = patient
    info_list = []
    gt_list = []
    for info_key in info_keys:
        patient_records = records_by_patient.get(info_key, {}).get(patient)
        if patient_records is None:
            # This patient has no rows in this table.
            continue
        data, gt = patient_records
        patient_dict[info_key] = data
        # NOTE(review): both accumulators start as empty lists, so the first
        # concatenation goes through an empty float array and integer codes
        # may be promoted to float - confirm downstream code expects that.
        info_list = np.concatenate((info_list, data))
        gt_list = np.concatenate((gt_list, gt))
    patient_dict['info'] = info_list
    patients_info.append(patient_dict)
    gt_patients_info.append(gt_list)
    gt_dict[patient] = gt_list

In [None]:
patients_info[0].keys()


dict_keys(['name', 'medications', 'conditions', 'careplans', 'observations', 'procedures', 'immunizations', 'info'])

## Load the text files as lists of strings

`texts_dict` simply contains all of the patients' record names as keys and each file's text content as the value.

In [None]:
# Read every text record into memory.
# texts_dict: record name (file name without extension) -> list of lines.
# texts / text_names: the same contents and names as index-aligned lists.
texts_dict = {}
texts = []
text_loc = synthea+'/text/text/'
text_names = []
# Sorted so the order of `texts` / `text_names` is reproducible; os.listdir
# returns entries in arbitrary order.
for name in sorted(os.listdir(text_loc)):
    path = os.path.join(text_loc, name)
    # Skip sub-directories (e.g. checkpoint folders); open() would fail on them.
    if not os.path.isfile(path):
        continue
    # splitext removes whatever extension is present instead of assuming
    # that it is exactly four characters long.
    record_name = os.path.splitext(name)[0]
    # The context manager closes each handle; previously every file opened
    # in this loop stayed open.
    with open(path, encoding="utf8") as file:
        text = file.readlines()
    text_names.append(record_name)
    texts_dict[record_name] = text
    texts.append(text)