# **Overview:**
This notebook handles the second-to-last part of the multimodal pipeline.
It takes the images and text reports as input and encodes them by extracting the [CLS] tokens from the BERT and ViT models.

#**Import Libraries**

In [None]:
import pickle
import cv2
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch
import seaborn as sns
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from google.colab import drive
from transformers import AutoImageProcessor, AutoModel, AutoConfig, AutoModelForImageClassification, BertForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score

### **Load CSV files and merge into a single dataframe**

In [None]:
# Mount Google Drive so the files under /content/drive can be read.
drive.mount('/content/drive')

# Image metadata table and report table.
image_df = pd.read_csv('/content/drive/My Drive/Dissertation/Images/pneumonia_1519_PAAP.csv')
report_df = pd.read_csv('/content/drive/My Drive/Dissertation/pneumonia_full.csv')

# Image list stored as a pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('/content/drive/My Drive/Dissertation/Images/image_list_1519_384.pkl', 'rb') as image_file:
    images = pickle.load(image_file)

# Sanity check: the image list and the metadata table must have the same length.
assert len(image_df) == len(images)

Mounted at /content/drive


In [None]:
# Store each image array in a new 'image' column of the metadata table.
image_df['image'] = images
first_image = image_df.loc[0, 'image']
print(first_image.shape)
display(image_df)

(384, 384)


Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,img_path,pneumonia,split,image
0,043f2b1c-1b8b0a20-c9e5ec5d-02ac7d4a-35000b4c,15000170,56450978,PA,files/p15/p15000170/s56450978/043f2b1c-1b8b0a2...,0.0,train,"[[2, 1, 4, 2, 3, 3, 5, 5, 4, 3, 5, 2, 3, 2, 4,..."
1,39ee0432-150f8ee9-e65abf9a-15bc5beb-80fbf3f6,15000393,51634677,PA,files/p15/p15000393/s51634677/39ee0432-150f8ee...,0.0,train,"[[228, 197, 183, 149, 135, 136, 139, 166, 122,..."
2,80eeb158-92ef7719-b43ae606-fb2745cf-99680d44,15000393,51634677,PA,files/p15/p15000393/s51634677/80eeb158-92ef771...,0.0,train,"[[129, 96, 75, 51, 29, 17, 13, 11, 10, 9, 8, 7..."
3,8a2da5f5-09ea301d-768e059c-5f053a34-2d3b3057,15000393,52929930,PA,files/p15/p15000393/s52929930/8a2da5f5-09ea301...,1.0,train,"[[12, 6, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2..."
4,b08efb71-38c915e9-3d9d7df0-d783d4d6-1317bf59,15000393,54674484,PA,files/p15/p15000393/s54674484/b08efb71-38c915e...,0.0,train,"[[251, 248, 246, 237, 226, 209, 207, 198, 202,..."
...,...,...,...,...,...,...,...,...
30679,14c4f70b-51110089-a731e968-fc1e017e-dd4c536b,19997473,57809462,AP,files/p19/p19997473/s57809462/14c4f70b-5111008...,-1.0,train,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
30680,a29987d8-abd13298-7a067b12-620f9fdb-103ecf53,19998330,54053771,AP,files/p19/p19998330/s54053771/a29987d8-abd1329...,1.0,train,"[[80, 78, 79, 77, 76, 75, 73, 72, 76, 75, 76, ..."
30681,518011e2-346dbd44-3e738335-c5006bf8-d69f6b68,19998770,51149538,AP,files/p19/p19998770/s51149538/518011e2-346dbd4...,0.0,train,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
30682,1427ad57-5bf4f3e6-90be02f3-d1760987-99d7f2ce,19998843,56350227,AP,files/p19/p19998843/s56350227/1427ad57-5bf4f3e...,1.0,train,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# Join reports and images on the shared study_id; keep the label/split columns
# that come from the image table (the *_y columns) under their plain names.
df = pd.merge(report_df, image_df, on='study_id')
df = df.drop(['pneumonia_x', 'split_x'], axis=1)
df = df.rename(columns={"pneumonia_y": "pneumonia", "split_y": 'split'})

# format labels: -1 is remapped to class 2, labels are cast to int, and class 2 is then removed
df['pneumonia'] = df['pneumonia'].replace(-1, 2).astype(int)
# .copy() makes the filtered frame independent, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df = df[df['pneumonia'] != 2].copy()
labels = df['pneumonia'].tolist()

display(df.head())

Unnamed: 0,study_id,text,dicom_id,subject_id,ViewPosition,img_path,pneumonia,split,image
0,56450978,FINAL REPORT\...,043f2b1c-1b8b0a20-c9e5ec5d-02ac7d4a-35000b4c,15000170,PA,files/p15/p15000170/s56450978/043f2b1c-1b8b0a2...,0,train,"[[2, 1, 4, 2, 3, 3, 5, 5, 4, 3, 5, 2, 3, 2, 4,..."
1,51634677,FINAL REPORT\...,39ee0432-150f8ee9-e65abf9a-15bc5beb-80fbf3f6,15000393,PA,files/p15/p15000393/s51634677/39ee0432-150f8ee...,0,train,"[[228, 197, 183, 149, 135, 136, 139, 166, 122,..."
2,51634677,FINAL REPORT\...,80eeb158-92ef7719-b43ae606-fb2745cf-99680d44,15000393,PA,files/p15/p15000393/s51634677/80eeb158-92ef771...,0,train,"[[129, 96, 75, 51, 29, 17, 13, 11, 10, 9, 8, 7..."
3,52929930,FINAL REPORT\...,8a2da5f5-09ea301d-768e059c-5f053a34-2d3b3057,15000393,PA,files/p15/p15000393/s52929930/8a2da5f5-09ea301...,1,train,"[[12, 6, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2..."
4,54674484,FINAL REPORT\...,b08efb71-38c915e9-3d9d7df0-d783d4d6-1317bf59,15000393,PA,files/p15/p15000393/s54674484/b08efb71-38c915e...,0,train,"[[251, 248, 246, 237, 226, 209, 207, 198, 202,..."


In [None]:
# Inspect the merged frame; as the last expression of the cell it is rendered by the notebook.
df

Unnamed: 0,study_id,text,dicom_id,subject_id,ViewPosition,img_path,pneumonia,split,image
0,56450978,FINAL REPORT\...,043f2b1c-1b8b0a20-c9e5ec5d-02ac7d4a-35000b4c,15000170,PA,files/p15/p15000170/s56450978/043f2b1c-1b8b0a2...,0,train,"[[2, 1, 4, 2, 3, 3, 5, 5, 4, 3, 5, 2, 3, 2, 4,..."
1,51634677,FINAL REPORT\...,39ee0432-150f8ee9-e65abf9a-15bc5beb-80fbf3f6,15000393,PA,files/p15/p15000393/s51634677/39ee0432-150f8ee...,0,train,"[[228, 197, 183, 149, 135, 136, 139, 166, 122,..."
2,51634677,FINAL REPORT\...,80eeb158-92ef7719-b43ae606-fb2745cf-99680d44,15000393,PA,files/p15/p15000393/s51634677/80eeb158-92ef771...,0,train,"[[129, 96, 75, 51, 29, 17, 13, 11, 10, 9, 8, 7..."
3,52929930,FINAL REPORT\...,8a2da5f5-09ea301d-768e059c-5f053a34-2d3b3057,15000393,PA,files/p15/p15000393/s52929930/8a2da5f5-09ea301...,1,train,"[[12, 6, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2..."
4,54674484,FINAL REPORT\...,b08efb71-38c915e9-3d9d7df0-d783d4d6-1317bf59,15000393,PA,files/p15/p15000393/s54674484/b08efb71-38c915e...,0,train,"[[251, 248, 246, 237, 226, 209, 207, 198, 202,..."
...,...,...,...,...,...,...,...,...,...
30677,56428935,FINAL REPORT\...,0802e3d3-5c5c09e8-eddfff18-c451289a-1fb33127,19997367,AP,files/p19/p19997367/s56428935/0802e3d3-5c5c09e...,0,train,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
30680,54053771,FINAL REPORT\...,a29987d8-abd13298-7a067b12-620f9fdb-103ecf53,19998330,AP,files/p19/p19998330/s54053771/a29987d8-abd1329...,1,train,"[[80, 78, 79, 77, 76, 75, 73, 72, 76, 75, 76, ..."
30681,51149538,FINAL REPORT\...,518011e2-346dbd44-3e738335-c5006bf8-d69f6b68,19998770,AP,files/p19/p19998770/s51149538/518011e2-346dbd4...,0,train,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
30682,56350227,FINAL REPORT\...,1427ad57-5bf4f3e6-90be02f3-d1760987-99d7f2ce,19998843,AP,files/p19/p19998843/s56350227/1427ad57-5bf4f3e...,1,train,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# #save filtered x-rays to file for separate loading
# images = df['image'].tolist()
# with open('/content/drive/My Drive/Dissertation/Multimodal/filtered_images_384.pkl', 'wb') as f:
#   pickle.dump(images, f)

#**Load the NLP and VLM models**

In [None]:
# Set up the device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#####load the ViT model#####
model_name = "google/vit-base-patch16-384"
processor = AutoImageProcessor.from_pretrained(model_name)

# Configure the model: 2-class classification head and dropout on the hidden layers.
# Only the classification variant is instantiated; loading a bare AutoModel first
# would be discarded immediately and just costs an extra model load.
config = AutoConfig.from_pretrained(model_name)
config.num_labels = 2
config.hidden_dropout_prob = 0.3
config.attention_probs_dropout_prob = 0.1
vision_model = AutoModelForImageClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)

# Move the ViT model to the device and make inference mode explicit so the
# configured dropout is never active while extracting embeddings.
vision_model.to(device).eval()

# load model's finetuned state
# NOTE(review): ViT_epoch is 5 but the checkpoint loaded below is the epoch-10 file — confirm which is intended.
ViT_model_name = "google/vit-base-patch16-384"
ViT_epoch=5
# vision_model.load_state_dict(torch.load(f'/content/drive/MyDrive/Dissertation/Images/Models/{ViT_model_name}_epoch_{ViT_epoch}_wd.pth',weights_only=True))
# map_location lets a checkpoint saved on GPU load when only the CPU is available.
vision_model.load_state_dict(torch.load('/content/drive/MyDrive/Dissertation/Images/Models/google/vit-base-patch16-384_epoch_10.pth', map_location=device, weights_only=True))


#####load the BERT model#####

# load the model and the tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
text_model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)

# move the BERT model to the device and make inference mode explicit
text_model.to(device).eval()

#load model's fine tuned state
BERT_model_name = "Bio_ClinicalBERT"
BERT_epoch = 2
# text_model.load_state_dict(torch.load(f'/content/drive/MyDrive/Dissertation/reports/Models/{BERT_model_name}_epoch_{BERT_epoch}.pth',weights_only=True))
text_model.load_state_dict(torch.load('/content/drive/MyDrive/Dissertation/reports/Models/Bio_ClinicalBERT_full.pth', map_location=device, weights_only=True)) # highest performing model on the text data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


model.safetensors:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-384 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
# class MultimodalModel(nn.Module):
#     def __init__(self,output_dim):
#         super(MultimodalModel, self).__init__()
#         # Define layers
#         self.fc1 = nn.Linear(768*2, 512)
#         self.fc2 = nn.Linear(512, 256)
#         self.fc3 = nn.Linear(256, output_dim)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, text_emb, image_emb):
#         # Forward pass
#         x = self.relu(self.fc1(combined))
#         x = self.dropout(x)
#         x = self.relu(self.fc2(x))
#         x = self.dropout(x)
#         output = self.fc3(x)
#         return output

#**Text Preprocessing**

In [None]:
# Define a function to extract [CLS] token for text data
def bert_cls(text):
    """Return the last-layer [CLS] hidden state of ``text_model`` for ``text``.

    Args:
        text: Input handed directly to ``tokenizer``; it is padded and
            truncated to at most 512 tokens.

    Returns:
        A NumPy array on the CPU holding the final hidden layer at sequence
        position 0, with size-1 dimensions squeezed out.
    """
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = text_model(**inputs, output_hidden_states=True)
    # Get the hidden states from the last layer ([-1]), and extract [CLS] token (index 0)
    cls_token = outputs.hidden_states[-1][:, 0, :].squeeze().detach().cpu().numpy()
    return cls_token

#test cls token extraction
# Positional lookup (.iloc) is used because df keeps its original index labels
# after filtering, so a row labelled 0 is not guaranteed to exist.
cls = bert_cls(df['text'].iloc[0])
print(cls.shape)
print(cls[:10])
print("...")
print(cls[-10:])


(768,)
[ 0.9977652  -0.8374106   0.33930928 -0.35599285 -0.64674264 -0.87577796
 -0.46077308 -1.5376433   0.22735122 -1.670899  ]
...
[ 8.3757085e-01 -1.1313006e+00  4.3713462e-01  1.7433886e-04
  5.9566092e-01 -7.6082027e-01  2.2048359e-01  1.1273799e+00
  7.9452218e-03  9.6675014e-01]


#**Image Preprocessing**

In [None]:

size = (384,384) # expected image shape as (height, width), matched against image.shape
def vit_cls(image,size):
    """Return the last-layer [CLS] hidden state of ``vision_model`` for ``image``.

    Args:
        image: Array whose ``shape`` is compared against ``size``; it is
            replicated three times along a new leading axis before being
            passed to ``processor``.
        size: Expected ``(height, width)`` of ``image``. Images with a
            different shape are resized to it first.

    Returns:
        A NumPy array on the CPU holding the final hidden layer at sequence
        position 0, with size-1 dimensions squeezed out.
    """
    # Normalise to a tuple so a list-valued size still compares equal to image.shape.
    target_shape = tuple(size)
    if image.shape != target_shape:
        # cv2.resize expects dsize as (width, height), the reverse of the
        # (rows, cols) order used by image.shape.
        image = cv2.resize(image, (target_shape[1], target_shape[0]))
        print("the image has been scaled")
    # Replicate the image into three identical channels on a leading axis.
    image = np.stack((image,)*3, axis=0)
    inputs = processor(images=image, return_tensors="pt").to(device)
    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = vision_model(**inputs, output_hidden_states=True)
    # Get the hidden states from the last layer ([-1]), and extract [CLS] token (index 0)
    cls_token = outputs.hidden_states[-1][:, 0, :].squeeze().detach().cpu().numpy()
    return cls_token

# Smoke test: embed the first image and show its shape plus the first and last ten values.
cls = vit_cls(df['image'].iloc[0], size)
print(cls.shape, cls[:10], "...", cls[-10:], sep="\n")

(768,)
[21.161226  -1.730015  16.236362   2.1023936 -6.074485   4.5679812
 -1.9561322 16.080692   8.677797  10.699538 ]
...
[  -3.5132236 -285.43204    -10.578303     9.118515   -15.865076
    5.312481    12.301923     2.2242506   36.227203   -18.453682 ]


### **Extract [CLS] tokens from both models and concatenate them into one vector as the full representation**

In [None]:
# Per-row text embedding, image embedding, and their concatenation (text first, image second).
text_cls, img_cls, concat = [], [], []
row_pairs = zip(df['text'], df['image'])
for report_text, xray in tqdm(row_pairs, total=len(df)):
  text_vec = bert_cls(report_text)
  image_vec = vit_cls(xray, size)
  text_cls.append(text_vec)
  img_cls.append(image_vec)
  concat.append(np.concatenate((text_vec, image_vec), axis=0))

100%|██████████| 20843/20843 [12:03<00:00, 28.80it/s]


In [None]:
# Store the concatenated text+image vector of every row in a new column.
df['embedding'] = concat
# Print the vector length: a bare expression that is not the last line of a cell
# is evaluated but never shown. .iloc is used because df keeps its pre-filter
# index labels, so label 0 is not guaranteed to exist.
print(len(df['embedding'].iloc[0]))
df.head()

Unnamed: 0,study_id,text,dicom_id,subject_id,ViewPosition,img_path,pneumonia,split,image,embedding
0,56450978,FINAL REPORT\...,043f2b1c-1b8b0a20-c9e5ec5d-02ac7d4a-35000b4c,15000170,PA,files/p15/p15000170/s56450978/043f2b1c-1b8b0a2...,0,train,"[[2, 1, 4, 2, 3, 3, 5, 5, 4, 3, 5, 2, 3, 2, 4,...","[0.9977652, -0.8374106, 0.33930928, -0.3559928..."
1,51634677,FINAL REPORT\...,39ee0432-150f8ee9-e65abf9a-15bc5beb-80fbf3f6,15000393,PA,files/p15/p15000393/s51634677/39ee0432-150f8ee...,0,train,"[[228, 197, 183, 149, 135, 136, 139, 166, 122,...","[0.8906752, -0.8320705, 0.34006056, -0.3635144..."
2,51634677,FINAL REPORT\...,80eeb158-92ef7719-b43ae606-fb2745cf-99680d44,15000393,PA,files/p15/p15000393/s51634677/80eeb158-92ef771...,0,train,"[[129, 96, 75, 51, 29, 17, 13, 11, 10, 9, 8, 7...","[0.8906752, -0.8320705, 0.34006056, -0.3635144..."
3,52929930,FINAL REPORT\...,8a2da5f5-09ea301d-768e059c-5f053a34-2d3b3057,15000393,PA,files/p15/p15000393/s52929930/8a2da5f5-09ea301...,1,train,"[[12, 6, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2...","[-0.8936431, 0.53670514, -0.76670307, -0.12457..."
4,54674484,FINAL REPORT\...,b08efb71-38c915e9-3d9d7df0-d783d4d6-1317bf59,15000393,PA,files/p15/p15000393/s54674484/b08efb71-38c915e...,0,train,"[[251, 248, 246, 237, 226, 209, 207, 198, 202,...","[0.4692915, -0.7610968, 0.10469077, 1.0525774,..."


In [None]:
#drop some columns
df = df.drop(['img_path','image'],axis=1)
df.head()

Unnamed: 0,study_id,text,dicom_id,subject_id,ViewPosition,pneumonia,split,embedding
0,56450978,FINAL REPORT\...,043f2b1c-1b8b0a20-c9e5ec5d-02ac7d4a-35000b4c,15000170,PA,0,train,"[0.9977652, -0.8374106, 0.33930928, -0.3559928..."
1,51634677,FINAL REPORT\...,39ee0432-150f8ee9-e65abf9a-15bc5beb-80fbf3f6,15000393,PA,0,train,"[0.8906752, -0.8320705, 0.34006056, -0.3635144..."
2,51634677,FINAL REPORT\...,80eeb158-92ef7719-b43ae606-fb2745cf-99680d44,15000393,PA,0,train,"[0.8906752, -0.8320705, 0.34006056, -0.3635144..."
3,52929930,FINAL REPORT\...,8a2da5f5-09ea301d-768e059c-5f053a34-2d3b3057,15000393,PA,1,train,"[-0.8936431, 0.53670514, -0.76670307, -0.12457..."
4,54674484,FINAL REPORT\...,b08efb71-38c915e9-3d9d7df0-d783d4d6-1317bf59,15000393,PA,0,train,"[0.4692915, -0.7610968, 0.10469077, 1.0525774,..."


In [None]:
# Write the embedding vectors to their own pickle file.
embeddings = list(df['embedding'])
with open('/content/drive/My Drive/Dissertation/Multimodal/pneumonia_embeddings3.pkl', 'wb') as pkl_out:
  pickle.dump(embeddings, pkl_out)
# The CSV gets every other column; the embeddings are left out because the
# numpy arrays are not meant to be stored in the CSV.
df_save = df.drop(columns='embedding')
df_save.to_csv('/content/drive/My Drive/Dissertation/Multimodal/pneumonia_embeddings3.csv', index=False)

In [None]:
import os

# Disconnect the Colab GPU by sending signal 9 (SIGKILL) to this kernel process.
# The process dies immediately, so no statement placed after this call would run.
os.kill(os.getpid(), 9)

In [None]:
# Ends the interpreter session.
# NOTE(review): likely redundant when the previous cell has already killed the kernel — confirm.
exit()