## Use expanded dataset

In [1]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [2]:
import pandas as pd
data = pd.read_excel("User complain input_top11.xlsx")

In [3]:
data.head()

Unnamed: 0,common complains,specialities
0,Chest pain or discomfort (angina)\nShortness o...,Cardiology
1,Abnormal vaginal bleeding or discharge\nMenstr...,Obstetrics and Gynecology (OBGYN)
2,Pregnancy-related issues such as morning sickn...,Obstetrics
3,Atopic dermatitis or eczema\nAcne\nPsoriasis\n...,Pediatric Dermatology
4,"Acne, rosacea, and other skin conditions\nEcze...",Dermatology


In [4]:
data["common complains"] = data["common complains"].map(lambda x: x.split("\n"))

### Expand the data

In [9]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

#### Substitute word by spelling mistake words dictionary

In [10]:
def spelling_aug(complains):
    res = complains.copy()
    for c in complains:
        aug = naw.SpellingAug()
        augmented_texts = aug.augment(c, n=5)
        res+=augmented_texts
    return list(set(res))

In [11]:
for i in range(len(data)):
    data["common complains"][i] = spelling_aug(data["common complains"][i])

#### Substitute word by word2vec similarity

In [12]:
import gensim

In [13]:
def word_embedding_aug(complains):
    res = complains.copy()
    for c in complains:
        aug = naw.WordEmbsAug(model_type='word2vec', model_path='GoogleNews-vectors-negative300.bin',
        action="substitute")
        augmented_text = aug.augment(c)
        res+=augmented_text
    return list(set(res))

In [14]:
for i in range(len(data)):
    data["common complains"][i] = word_embedding_aug(data["common complains"][i])

In [15]:
data = data.explode("common complains")

In [16]:
data.head()

Unnamed: 0,common complains,specialities
0,Chest aches visit_www.ncfta.org discomfort (re...,Cardiology
0,Atrial fibrillation,Cardiology
0,Cardiac Nathan_Helburn,Cardiology
0,Angevine_Middle blodd presssure (hypertension ),Cardiology
0,Heart attach,Cardiology


In [17]:
len(data)

1317

In [20]:
data.to_excel("User complain input_top11_expanded.xlsx")

In [21]:
specialities = list(set([i for i in data["specialities"]]))

In [22]:
len(specialities)

11

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
import transformers
# BERT Related Libraries
from sklearn import metrics
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Python
import pandas as pd
import numpy as np
import os
import time

In [11]:
#X,y = data["common complains split"],data["specialities"]

In [12]:
#from sklearn.model_selection import train_test_split

In [13]:
#x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
new_data_dummy = pd.get_dummies(data.specialities)

In [27]:
data["list"] = new_data_dummy[new_data_dummy.columns[0:]].values.tolist()

In [29]:
data = data.drop("specialities",axis=1)

In [30]:
data.head()

Unnamed: 0,common complains,list
0,Chest aches visit_www.ncfta.org discomfort (re...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
0,Atrial fibrillation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
0,Cardiac Nathan_Helburn,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
0,Angevine_Middle blodd presssure (hypertension ),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
0,Heart attach,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [31]:
len(data)

1317

In [None]:
### skip data preprocssing for now

from nltk.corpus import stopwords
import nltk as nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))
ps = nltk.wordnet.WordNetLemmatizer()
def pre_process(text):
    
    if type(text)!=str:
        text = ""
    #remove all urls andand hashtags
    # actually rarely happen
    text = re.sub(r'(?:\@|http?\://|https?\://|www)\S+','',text)
    
    #remove new_lines
    text = re.sub(r'(?:\n)',' ',text)
    
    # remove hashtags
    text = re.sub(r'#(?=\w+)','',text)

    # replace all number bullet points uses ('1.', '2.', etc)
    text = re.sub('\d+. ', ' ', text)
    
    # remove special characters
    text = re.sub('[^a-zA-Z]', ' ',text)
    
    text = text.lower()
    
    text = text.split()
    text = [ps.lemmatize(word) for word in text if not word in stopwords]
    
    text = " ".join(text)

    return text

In [31]:
# data["common complains split"] = data["common complains split"].apply(pre_process)

### Preparing the Dataset and Dataloader

In [32]:
from transformers import AutoTokenizer, AutoModel

In [39]:
MAX_LEN = 30
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 1e-05
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v1')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [34]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.complain_text = self.data["common complains"]
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.complain_text)

    def __getitem__(self, index):
        complain_text = str(self.complain_text[index])
        complain_text = " ".join(complain_text.split())

        inputs = self.tokenizer.encode_plus(
            complain_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [35]:
train_size = 0.8
data = data.reset_index(drop=True)
train_dataset=data.sample(frac=train_size,random_state=42)
test_dataset=data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (1317, 2)
TRAIN Dataset: (1054, 2)
TEST Dataset: (263, 2)


In [53]:
#train_params = {'batch_size': TRAIN_BATCH_SIZE,'shuffle': True,'num_workers': 0}

#test_params = {'batch_size': VALID_BATCH_SIZE,'shuffle': True,'num_workers': 0}
training_loader = DataLoader(training_set)
testing_loader = DataLoader(testing_set)
#training_loader = DataLoader(training_set, **train_params)
#testing_loader = DataLoader(testing_set, **test_params)

In [54]:
len(training_loader)

1054

### Creating the Neural Network for Fine Tuning

In [68]:
!pip install -q transformers

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip[0m


In [38]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v1')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(384, 11)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    

In [43]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [44]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [45]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

In [46]:
import shutil, sys   
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [64]:
#to use as global variables
val_targets=[]
val_outputs=[] 

In [65]:
def train_model(start_epochs,  n_epochs, valid_loss_min_input, 
                training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
   
  # initialize tracker for minimum validation loss
    valid_loss_min = valid_loss_min_input 
   
 
    for epoch in range(start_epochs, n_epochs+1):
        train_loss = 0
        valid_loss = 0
        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            #if batch_idx%5000==0:
             #   print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print('before loss data in training', loss.item(), train_loss)
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
            #print('after loss data in training', loss.item(), train_loss)
    
    print('############# Epoch {}: Training End     #############'.format(epoch))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################    
    # validate the model #
    ######################
 
    model.eval()
   
    with torch.no_grad():
        for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
          # calculate average losses
          #print('before cal avg train loss', train_loss)
        train_loss = train_loss/len(training_loader)
        valid_loss = valid_loss/len(validation_loader)
          # print training/validation statistics 
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
                epoch, 
                train_loss,
                valid_loss
                ))
      
          # create checkpoint variable and add important data
        checkpoint = {
                'epoch': epoch + 1,
                'valid_loss_min': valid_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
          }
        
        # save checkpoint
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
          ## save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            # save checkpoint as best model
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))

    return model

In [66]:
checkpoint_path = '/Users/ziyuewang/Desktop/Inference Analytics/checkpoint-expanded/current_checkpoint.pt'
best_model = '/Users/ziyuewang/Desktop/Inference Analytics/best_model-expanded/best_model.pt'
trained_model = train_model(1, 4, np.Inf, training_loader, testing_loader, model, 
                      optimizer,checkpoint_path,best_model)

############# Epoch 1: Training Start   #############
############# Epoch 2: Training Start   #############
############# Epoch 3: Training Start   #############
############# Epoch 4: Training Start   #############
############# Epoch 4: Training End     #############
############# Epoch 4: Validation Start   #############
############# Epoch 4: Validation End     #############
Epoch: 4 	Avgerage Training Loss: 0.000033 	Average Validation Loss: 0.000271
Validation loss decreased (inf --> 0.000271).  Saving model ...
############# Epoch 4  Done   #############



In [50]:
test_dataset

Unnamed: 0,common complains,list
0,Atrial fibrillation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Urogynecology hypertension,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Congenital pacemaker_fitted disiase,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Rapid or irregular heartbeats (arrhythmias),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Smile failfure,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
258,Crossed e tern eays (strabismus ),"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
259,Difficulty seing at nights or halos around lihgts,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
260,"Dry, itchy, ar rea yeys","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
261,Troubles With deepply fundamentally ow Wii_Nun...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"


In [67]:
val_outputs

[[0.029588667675852776,
  0.00874759629368782,
  0.00737781124189496,
  0.9526032209396362,
  0.006261111702769995,
  0.01210431382060051,
  0.009192438796162605,
  0.005405518226325512,
  0.007138160988688469,
  0.008265126496553421,
  0.010952235199511051],
 [0.9331015348434448,
  0.013875902630388737,
  0.023291273042559624,
  0.014129419811069965,
  0.004612415097653866,
  0.04867428541183472,
  0.008322812616825104,
  0.007691602688282728,
  0.012530763633549213,
  0.066799096763134,
  0.006596517749130726],
 [0.942841112613678,
  0.006639376748353243,
  0.01734035834670067,
  0.017376087605953217,
  0.006594824139028788,
  0.025100959464907646,
  0.009959610179066658,
  0.006673907861113548,
  0.03270119056105614,
  0.015456636436283588,
  0.007017927244305611],
 [0.9533045291900635,
  0.00999363698065281,
  0.021244609728455544,
  0.014017644338309765,
  0.007005675695836544,
  0.010546039789915085,
  0.014273169450461864,
  0.007864382117986679,
  0.023984627798199654,
  0.0195

In [76]:
val_preds = (np.array(val_outputs) > 0.5).astype(int)
val_preds

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [70]:
accuracy = metrics.accuracy_score(val_targets, val_preds)
f1_score_micro = metrics.f1_score(val_targets, val_preds, average='micro')
f1_score_macro = metrics.f1_score(val_targets, val_preds, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.8669201520912547
F1 Score (Micro) = 0.8795411089866156
F1 Score (Macro) = 0.8478133595313565


In [71]:
from sklearn.metrics import multilabel_confusion_matrix as mcm, classification_report

In [72]:
cm = mcm(val_targets, val_preds)

In [74]:
print(classification_report(val_targets, val_preds))

              precision    recall  f1-score   support

           0       1.00      0.84      0.91        19
           1       0.94      0.83      0.88        35
           2       0.85      0.71      0.77        24
           3       0.94      0.89      0.91        35
           4       0.89      1.00      0.94        17
           5       0.88      0.91      0.89        23
           6       0.96      0.87      0.91        30
           7       0.96      1.00      0.98        25
           8       0.92      0.96      0.94        25
           9       0.27      0.50      0.35         6
          10       0.78      0.88      0.82        24

   micro avg       0.88      0.87      0.88       263
   macro avg       0.85      0.85      0.85       263
weighted avg       0.90      0.87      0.88       263
 samples avg       0.87      0.87      0.87       263



  _warn_prf(average, modifier, msg_start, len(result))
