In [34]:
import os
import pickle

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.utils import set_random_seed

In [25]:
# Load data
# Read the diseases CSV into the working DataFrame `df` used by the following cells.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm Drive is mounted first.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/capstone/diseases.csv')

In [26]:
# Drop the two category columns; the cells shown below only read 'symptoms'
# and 'disease_name'.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['category', 'category_type'], errors='ignore')

# Drop rows with NaN in either column
df = df.dropna(subset=['symptoms', 'disease_name'])

# Clean symptoms - cast to string, turn the ';' separators into spaces and trim.
# regex=False makes this a literal replacement regardless of the pandas version.
df['symptoms_clean'] = (
    df['symptoms']
    .astype(str)
    .str.replace(';', ' ', regex=False)
    .str.strip()
)

# Remove any rows that became the 'nan' string or ended up empty, then renumber
# the index so it is contiguous again.
has_usable_text = ~df['symptoms_clean'].isin(['nan', ''])
df = df[has_usable_text].reset_index(drop=True)

print(f"Clean data: {len(df)} rows")

Clean data: 9332 rows


In [27]:
# ClinicalBERT embeddings
# NOTE(review): the load report in this cell's output shows the checkpoint being
# loaded as a plain BertModel; presumably sentence-transformers adds its default
# pooling layer on top — confirm that this is the intended sentence embedding.
clinicalbert = SentenceTransformer('emilyalsentzer/Bio_ClinicalBERT')

# Reuse `symptoms_clean` exactly as the cleaning cell built it (string-cast,
# ';' replaced by spaces, stripped, 'nan'/empty rows removed, index reset).
# Rebuilding it here from the raw `symptoms` column would skip the string cast
# and the strip, and would duplicate the cleaning rules in two places.
symptom_texts = df['symptoms_clean'].tolist()
assert all(isinstance(text, str) and text.strip() for text in symptom_texts), \
    "symptoms_clean must contain only non-empty strings; re-run the cleaning cell"

# One embedding row per symptom description.
X = clinicalbert.encode(symptom_texts, show_progress_bar=True, batch_size=64)

# Map each disease name to an integer class id.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['disease_name'])
num_classes = len(label_encoder.classes_)

# Features and labels come from the same frame, so they must line up row for row.
assert X.shape[0] == len(y), "embeddings and labels are misaligned"



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/146 [00:00<?, ?it/s]

In [32]:
# Seed the random number generators so weight initialisation and batch
# shuffling are repeatable between runs of this cell.
SEED = 42
set_random_seed(SEED)

# Feed-forward classifier on top of the sentence embeddings.
# The input width is read from X rather than hard-coded to 768, so the cell
# keeps working if the encoder (and therefore the embedding size) changes.
embedding_dim = X.shape[1]

model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    # Sparse variant: y holds integer class ids from LabelEncoder, not one-hot rows.
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train on all of (X, y); no validation data is held out in this cell.
history = model.fit(X, y, epochs=100, batch_size=32, verbose=1)

Epoch 1/100
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 9.1618
Epoch 2/100
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 3.0503e-05 - loss: 9.1439
Epoch 3/100
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 4.1311e-04 - loss: 8.9754
Epoch 4/100
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0021 - loss: 8.3592
Epoch 5/100
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0083 - loss: 7.4589
Epoch 6/100
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0372 - loss: 6.2339
Epoch 7/100
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0911 - loss: 4.8688
Epoch 8/100
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.2024 - loss: 3.6933
Epoch 9/100
[1m292/

In [33]:
# Evaluate on training data
# The model is scored on the same (X, y) it was fitted on, so these numbers
# describe the fit to the training set rather than held-out performance.
y_pred = np.argmax(model.predict(X), axis=1)

# Share of rows whose predicted class id matches the encoded label.
accuracy = (y_pred == y).mean()
# Macro average: every class contributes equally to the F1 score.
f1 = f1_score(y_true=y, y_pred=y_pred, average='macro')

print(f"\nAccuracy: {accuracy:.3f}")
print(f"F1-Score: {f1:.3f}")

[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step

Accuracy: 0.651
F1-Score: 0.611


In [35]:
# Save the trained classifier and the fitted label encoder side by side.
model_dir = '/content/drive/MyDrive/capstone/model'

# Create the target folder first: open(..., 'wb') raises FileNotFoundError
# when the parent directory does not exist.
os.makedirs(model_dir, exist_ok=True)

model.save(os.path.join(model_dir, 'disease_classifier.h5'))

# Persist the encoder so its class-id <-> disease-name mapping can be restored
# together with the model.
# NOTE: pickle files execute code when loaded — only unpickle this file from a
# trusted location.
with open(os.path.join(model_dir, 'label_encoder.pkl'), 'wb') as f:
    pickle.dump(label_encoder, f)

