In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [3]:
df = pd.read_excel('/content/medical diagnosis.xlsx', engine='openpyxl')

df.head()

Unnamed: 0,Symptoms,Disease
0,Increased thirst and frequent urination,Type 2 Diabetes
1,Headaches and dizziness,Hypertension
2,Persistent cough and shortness of breath,Chronic Obstructive Pulmonary Disease (COPD)
3,Wheezing and chest tightness,Asthma
4,Joint pain and swelling,Rheumatoid Arthritis


In [4]:

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [5]:

token = tokenizer.encode_plus(
    df['Symptoms'].iloc[0],
    max_length=256,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf'
)

In [6]:
df['Disease'].value_counts()

Unnamed: 0_level_0,count
Disease,Unnamed: 1_level_1
Polycythemia Vera,4
Basal Cell Carcinoma,4
Tuberculosis,4
Endometriosis,3
Rheumatoid Arthritis,3
...,...
Dupuytren's Contracture,1
Epiglottitis,1
Glomerulonephritis,1
Goodpasture Syndrome,1


In [7]:

token.input_ids

<tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,  3561, 20934, 26190,  1105,  6539,   190,  9324,  2116,
          102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [8]:

X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [9]:

def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['Symptoms'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [10]:

X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [11]:

labels = np.zeros((len(df),327))
labels.shape

(500, 327)

In [12]:
disease_mapping = {disease: idx for idx, disease in enumerate(df['Disease'].unique())}
print("Disease to integer mapping:", disease_mapping)

# Map the 'Disease' column to integer labels
df['Disease'] = df['Disease'].map(disease_mapping)
print(df)


Disease to integer mapping: {'Type 2 Diabetes': 0, 'Hypertension': 1, 'Chronic Obstructive Pulmonary Disease (COPD)': 2, 'Asthma': 3, 'Rheumatoid Arthritis': 4, 'Influenza': 5, 'Tuberculosis': 6, 'Hypothyroidism': 7, 'Multiple Sclerosis': 8, 'Eczema': 9, 'Crohn’s Disease': 10, 'Gout': 11, 'Psoriasis': 12, 'Parkinson’s Disease': 13, 'Celiac Disease': 14, 'Herpes Simplex Virus': 15, 'Hepatitis B': 16, 'Generalized Anxiety Disorder': 17, 'Bipolar Disorder': 18, 'Major Depressive Disorder': 19, 'Schizophrenia': 20, 'Obsessive-Compulsive Disorder': 21, 'Post-Traumatic Stress Disorder': 22, 'Peptic Ulcer': 23, 'Endometriosis': 24, 'Polycythemia Vera': 25, 'Chronic Kidney Disease': 26, 'Liver Cirrhosis': 27, 'Sinusitis': 28, 'Bronchitis': 29, 'Urinary Tract Infection (UTI)': 30, 'Bacterial Pneumonia': 31, 'Lymphoma': 32, 'Leukemia': 33, 'Melanoma': 34, 'Non-Hodgkin’s Lymphoma': 35, 'Basal Cell Carcinoma': 36, 'Prostate Cancer': 37, 'Ovarian Cancer': 38, 'Gastritis': 39, 'Colitis': 40, 'Atrial

In [13]:
# Ensure 'Disease' column contains integers
df['Disease'] = df['Disease'].astype(int)

# Create one-hot encoded labels
labels = np.zeros((len(df), 327))
for i, disease_idx in enumerate(df['Disease']):
    labels[i, disease_idx] = 1 # Now disease_idx should be an integer

In [14]:

# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data


<_TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(327,), dtype=tf.float64, name=None))>

In [15]:
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [16]:

def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels


In [17]:
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset

In [18]:
print(dataset)

<_MapDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(327,), dtype=tf.float64, name=None))>


In [19]:

dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [20]:

p = 0.8
train_size = int((len(df)//16)*p) # f

In [21]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [22]:

from transformers import TFBertModel

In [23]:

model = TFBertModel.from_pretrained('bert-base-cased') # b

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [29]:
import tensorflow as tf # make sure to import tensorflow

input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name='input_ids')
attn_masks = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name='attention_mask')


bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1]
intermediate_layer = tf.keras.layers.Dense(256, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(327, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes


sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1083102   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   72         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                     

In [30]:
# Use a learning rate scheduler instead of the deprecated 'decay' argument
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-5,
    decay_steps=10000,  # Adjust this value based on your dataset and training process
    decay_rate=0.9
)
optim = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [31]:
import tensorflow as tf

# List available GPUs
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))



Num GPUs Available:  1


In [33]:
# Compile the model
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

# Now you can fit the model
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [34]:
# Save model weights after training

sentiment_model.save('sentiment_model')


In [36]:
model=tf.keras.models.load_model('/content/sentiment_model')

In [35]:
def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.int64), # Cast to tf.int64 instead of tf.float64
        'attention_mask': tf.cast(token.attention_mask, tf.int64) # Cast to tf.int64 instead of tf.float64
    }

In [37]:
text=prepare_data("coughing and sneezing",tokenizer)

In [38]:
prob=model.predict(text)




In [40]:
import numpy as np

# Assuming the output is a probability distribution
predicted_class_index = np.argmax(prob, axis=1)[0]

# Map the index to the disease name (if you have a mapping)
disease_mapping = {idx: disease for disease, idx in disease_mapping.items()}
predicted_disease = disease_mapping[predicted_class_index]

print(f"Predicted disease: {predicted_disease}")


Predicted disease: Temporomandibular Joint Disorder (TMJ)


AttributeError: 'TFSMLayer' object has no attribute 'predict'