In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
import torch.nn as nn


In [2]:
#reading the csv file
df=pd.read_csv(r"c:\Users\kwaky\Downloads\patient_records_1000.csv")

In [3]:
df.head()


Unnamed: 0,text,label,age,gender,temperature,blood_pressure,heart_rate
0,"Patient has swelling, fatigue",arthritis,79.0,Male,98.8,150/77,86.0
1,"Patient has fatigue, dizziness, chest pain, ir...",heart_disease,50.0,Female,102.9,125/113,90.0
2,"Patient has shortness of breath, chest pain, i...",,27.0,,101.5,97/97,95.0
3,"Patient has dry cough, loss of taste, tiredness",covid-19,71.0,Female,100.8,152/95,
4,"Patient has fatigue, runny nose, cough",flu,79.0,Other,102.0,138/89,91.0


In [4]:
df.tail() #to see the last 10 rows of the dataset

Unnamed: 0,text,label,age,gender,temperature,blood_pressure,heart_rate
995,"Patient has feelings of worthlessness, difficu...",depression,62.0,Male,,113/105,
996,"Patient has vomiting, severe headache, sensiti...",migraine,,Other,99.4,97/80,104.0
997,"Patient has crackles in lungs, shortness of br...",pneumonia,71.0,,,164/87,108.0
998,"Patient has high blood sugar, unexplained weig...",diabetes,34.0,Male,102.5,171/91,73.0
999,"Patient has feelings of worthlessness, insomni...",depression,21.0,Female,98.4,166/84,113.0


In [5]:
#seeing the unber of rows and column
df.shape

(1000, 7)

In [6]:
#getting the info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   text            913 non-null    object 
 1   label           942 non-null    object 
 2   age             900 non-null    float64
 3   gender          893 non-null    object 
 4   temperature     891 non-null    float64
 5   blood_pressure  904 non-null    object 
 6   heart_rate      877 non-null    float64
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


In [7]:
#see the column
df.columns


Index(['text', 'label', 'age', 'gender', 'temperature', 'blood_pressure',
       'heart_rate'],
      dtype='object')

In [8]:
#seeing the null values 
df.isnull()

Unnamed: 0,text,label,age,gender,temperature,blood_pressure,heart_rate
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,True,False,True,False,False,False
3,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
995,False,False,False,False,True,False,True
996,False,False,True,False,False,False,False
997,False,False,False,True,True,False,False
998,False,False,False,False,False,False,False


In [9]:
#get statistical summary
df.describe()

Unnamed: 0,age,temperature,heart_rate
count,900.0,891.0,877.0
mean,54.247778,100.530079,90.166477
std,21.340237,1.99267,18.289956
min,18.0,97.0,60.0
25%,35.0,98.9,74.0
50%,54.0,100.5,91.0
75%,73.0,102.1,106.0
max,90.0,104.0,120.0


In [10]:
df.isnull().sum()

text               87
label              58
age               100
gender            107
temperature       109
blood_pressure     96
heart_rate        123
dtype: int64

In [11]:
mode_value = df['text'].mode()
if not mode_value.empty:
    mode_value = mode_value[0]  # Extract the first mode value
else:
    mode_value = "Unknown"  # Fallback in case the column is empty


In [12]:
mode_value = df['text'].mode()[0]  # Get the most frequent value
df['text'] = df['text'].fillna(mode_value)

print(df['text'].head())

0                        Patient has swelling, fatigue
1    Patient has fatigue, dizziness, chest pain, ir...
2    Patient has shortness of breath, chest pain, i...
3      Patient has dry cough, loss of taste, tiredness
4               Patient has fatigue, runny nose, cough
Name: text, dtype: object


In [13]:
df.reset_index(drop=True, inplace=True)
print(df.head())


                                                text          label   age  \
0                      Patient has swelling, fatigue      arthritis  79.0   
1  Patient has fatigue, dizziness, chest pain, ir...  heart_disease  50.0   
2  Patient has shortness of breath, chest pain, i...            NaN  27.0   
3    Patient has dry cough, loss of taste, tiredness       covid-19  71.0   
4             Patient has fatigue, runny nose, cough            flu  79.0   

   gender  temperature blood_pressure  heart_rate  
0    Male         98.8         150/77        86.0  
1  Female        102.9        125/113        90.0  
2     NaN        101.5          97/97        95.0  
3  Female        100.8         152/95         NaN  
4   Other        102.0         138/89        91.0  


In [14]:
mode_value = df['label'].mode()
if not mode_value.empty:
    mode_value = mode_value[0]  # Extract the first mode value
else:
    mode_value = "Unknown"  # Fallback in case the column is empty


In [15]:
mode_value = df['label'].mode()[0]  # Get the most frequent value
df['label'] = df['label'].fillna(mode_value)

print(df['text'].head())

0                        Patient has swelling, fatigue
1    Patient has fatigue, dizziness, chest pain, ir...
2    Patient has shortness of breath, chest pain, i...
3      Patient has dry cough, loss of taste, tiredness
4               Patient has fatigue, runny nose, cough
Name: text, dtype: object


In [16]:
mode_value = df['gender'].mode()
if not mode_value.empty:
    mode_value = mode_value[0]  # Extract the first mode value
else:
    mode_value = "Unknown"  # Fallback in case the column is empty
    


In [17]:
mode_value = df['gender'].mode()[0]  # Get the most frequent value
df['gender'] = df['gender'].fillna(mode_value)

print(df['gender'].head())

0      Male
1    Female
2     Other
3    Female
4     Other
Name: gender, dtype: object


In [18]:
mode_value = df['blood_pressure'].mode()
if not mode_value.empty:
    mode_value = mode_value[0]  # Extract the first mode value
else:
    mode_value = "Unknown"  # Fallback in case the column is empty
    

In [19]:
mode_value = df['blood_pressure'].mode()[0]  # Get the most frequent value
df['blood_pressure'] = df['blood_pressure'].fillna(mode_value)

print(df['blood_pressure'].head())

0     150/77
1    125/113
2      97/97
3     152/95
4     138/89
Name: blood_pressure, dtype: object


In [20]:
mode_value = df['temperature'].mode()
if not mode_value.empty:
    mode_value = mode_value[0]  # Extract the first mode value
else:
    mode_value = "Unknown"  # Fallback in case the column is empty
    

In [21]:
mode_value = df['temperature'].mode()[0]  # Get the most frequent value
df['temperature'] = df['temperature'].fillna(mode_value)

print(df['temperature'].head())

0     98.8
1    102.9
2    101.5
3    100.8
4    102.0
Name: temperature, dtype: float64


In [22]:
mode_value = df['heart_rate'].mode()
if not mode_value.empty:
    mode_value = mode_value[0]  # Extract the first mode value
else:
    mode_value = "Unknown"  # Fallback in case the column is empty
    

In [23]:
mode_value = df['heart_rate'].mode()[0]  # Get the most frequent value
df['heart_rate'] = df['heart_rate'].fillna(mode_value)

print(df['heart_rate'].head())

0    86.0
1    90.0
2    95.0
3    60.0
4    91.0
Name: heart_rate, dtype: float64


In [24]:
mode_value = df['age'].mode()
if not mode_value.empty:
    mode_value = mode_value[0]  # Extract the first mode value
else:
    mode_value = "Unknown"  # Fallback in case the column is empty
    

In [25]:
mode_value = df['age'].mode()[0]  # Get the most frequent value
df['age'] = df['age'].fillna(mode_value)

print(df['age'].head())

0    79.0
1    50.0
2    27.0
3    71.0
4    79.0
Name: age, dtype: float64


In [26]:
# Save the DataFrame to a CSV file
df.to_csv("updated_dataset.csv", index=False)

# Inform the user where the file is saved
print("The updated dataset has been saved as 'updated_patient_records_1000.csv' in the current directory.")


The updated dataset has been saved as 'updated_patient_records_1000.csv' in the current directory.


In [27]:
updated_patient_records = df.copy()

In [28]:
df.isnull().sum()

text              0
label             0
age               0
gender            0
temperature       0
blood_pressure    0
heart_rate        0
dtype: int64

In [29]:
from sklearn.preprocessing import LabelEncoder

In [30]:
le = LabelEncoder()
df["label_encoded"] = le.fit_transform(df["label"])

In [31]:
label_map = dict(zip(le.classes_, le.transform(le.classes_)))

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"], df["label_encoded"], test_size=0.2, random_state=42
)

In [34]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [35]:
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

In [36]:
import torch

class PatientDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = PatientDataset(train_encodings, list(train_labels))
val_dataset = PatientDataset(val_encodings, list(val_labels))


In [37]:
from transformers import AutoModelForSequenceClassification

In [38]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(le.classes_)
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    do_eval=True, 
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)


In [40]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Step,Training Loss
10,2.4124
20,2.2451
30,2.0873
40,1.8995
50,1.4629
60,1.4313
70,1.175
80,1.0162
90,0.9681
100,0.7271


TrainOutput(global_step=300, training_loss=0.8547341744105021, metrics={'train_runtime': 366.2082, 'train_samples_per_second': 6.554, 'train_steps_per_second': 0.819, 'total_flos': 32068963180800.0, 'train_loss': 0.8547341744105021, 'epoch': 3.0})

In [41]:
preds = trainer.predict(val_dataset)
predicted_classes = preds.predictions.argmax(axis=1)

from sklearn.metrics import classification_report
print(classification_report(val_labels, predicted_classes, target_names=le.classes_))


               precision    recall  f1-score   support

    arthritis       0.93      0.88      0.90        16
       asthma       0.90      0.90      0.90        21
     covid-19       0.81      0.72      0.76        18
   depression       1.00      0.62      0.77        37
     diabetes       0.85      0.85      0.85        13
          flu       0.40      1.00      0.57        14
heart_disease       1.00      0.88      0.93        16
 hypertension       0.77      0.77      0.77        13
     migraine       0.93      0.96      0.94        26
    pneumonia       0.96      0.85      0.90        26

     accuracy                           0.82       200
    macro avg       0.85      0.84      0.83       200
 weighted avg       0.89      0.82      0.84       200



In [42]:
model.save_pretrained("disease_bert_model")
tokenizer.save_pretrained("disease_bert_model")


('disease_bert_model\\tokenizer_config.json',
 'disease_bert_model\\special_tokens_map.json',
 'disease_bert_model\\vocab.txt',
 'disease_bert_model\\added_tokens.json',
 'disease_bert_model\\tokenizer.json')