##Imports


In [33]:
pip install transformers



In [34]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report


##Data Load and dataframe

In [35]:
csv_file_path = '/content/mtsamples.csv'
df = pd.read_csv(csv_file_path)


##Data Cleaning and processing


In [36]:
import nltk
import string
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
df = df.dropna(subset=['medical_specialty'], axis=0)

def preprocess_text(text):
    words = text.split()
    words = [word.lower() for word in words]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in string.punctuation and word not in stop_words]
    processed_text = ' '.join(words)
    return processed_text

df['description'] = df['description'].apply(preprocess_text)

##Data split 80-20

In [38]:
X = df['description']
y = df['medical_specialty']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##Using pretrained Bert for classification


In [39]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(df['medical_specialty'].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##Tokenizing and creating labels

In [40]:
max_seq_length = 1024
train_texts = X_train.tolist()
test_texts = X_test.tolist()

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_seq_length, return_tensors='pt')
val_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_seq_length, return_tensors='pt')

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()



train_labels = label_encoder.fit_transform(y_train)
val_labels = label_encoder.transform(y_test)

##PyTorch data loaders


In [41]:
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels))
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

##Training params

In [42]:
epochs = 100
learning_rate = 2e-5
adam_epsilon = 1e-8

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)




#Training loop

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    average_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Average Training Loss: {average_train_loss}")


Epoch 1 - Average Training Loss: 3.028245277404785
Epoch 2 - Average Training Loss: 2.5115823097229004
Epoch 3 - Average Training Loss: 2.2120892362594606
Epoch 4 - Average Training Loss: 1.996423505783081
Epoch 5 - Average Training Loss: 1.8281959981918334
Epoch 6 - Average Training Loss: 1.6948720455169677
Epoch 7 - Average Training Loss: 1.5901589574813844
Epoch 8 - Average Training Loss: 1.4918160123825073
Epoch 9 - Average Training Loss: 1.4178487005233764
Epoch 10 - Average Training Loss: 1.3614891738891601
Epoch 11 - Average Training Loss: 1.2940166158676147
Epoch 12 - Average Training Loss: 1.2430125331878663
Epoch 13 - Average Training Loss: 1.2018304438591003
Epoch 14 - Average Training Loss: 1.156374162197113
Epoch 15 - Average Training Loss: 1.1214402017593383
Epoch 16 - Average Training Loss: 1.0961581621170045
Epoch 17 - Average Training Loss: 1.0617342200279236
Epoch 18 - Average Training Loss: 1.0423839845657348
Epoch 19 - Average Training Loss: 1.0161161584854126
Epoch

##Evaluation

In [44]:
model.eval()
val_predictions = []
val_true_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        val_predictions.extend(logits.argmax(dim=1).tolist())
        val_true_labels.extend(labels.tolist())

##Classification Metrics

In [45]:
accuracy = accuracy_score(val_true_labels, val_predictions)
print(f"Validation Accuracy: {accuracy}")

class_names = df['medical_specialty'].unique()
class_names = [str(class_name) for class_name in class_names]
report = classification_report(val_true_labels, val_predictions, target_names=class_names)
print(report)

Validation Accuracy: 0.094
                                precision    recall  f1-score   support

          Allergy / Immunology       0.00      0.00      0.00         1
                    Bariatrics       1.00      0.50      0.67         2
    Cardiovascular / Pulmonary       0.00      0.00      0.00         3
                     Neurology       0.13      0.16      0.15        69
                     Dentistry       0.00      0.00      0.00         1
                       Urology       0.06      0.06      0.06       107
              General Medicine       0.00      0.00      0.00         4
                       Surgery       0.14      0.12      0.13         8
             Speech - Language       0.00      0.00      0.00         3
 SOAP / Chart / Progress Notes       0.00      0.00      0.00         1
                Sleep Medicine       0.08      0.10      0.09        21
                  Rheumatology       0.24      0.20      0.22        25
                     Radiology      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Saving the model


In [46]:
model.save_pretrained("medical_specialty_classifier")