In [1]:
! pip install transformers==3.0.2

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting transformers==3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.0/769.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1.tar.gz (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━

In [2]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

### Original dataset

In [178]:
data = pd.read_excel("User complain input_top11_explode.xlsx")

In [179]:
data.head()

Unnamed: 0.1,Unnamed: 0,common complains,specialities
0,0,Chest pain or discomfort (angina),Cardiology
1,0,Shortness of breath,Cardiology
2,0,Rapid or irregular heartbeats (arrhythmias),Cardiology
3,0,Heart failure,Cardiology
4,0,High blood pressure (hypertension),Cardiology


In [180]:
data.drop(columns=["Unnamed: 0"],inplace=True)

In [181]:
new_data_dummy = pd.get_dummies(data.specialities)

In [182]:
data["list"] = new_data_dummy[new_data_dummy.columns[0:]].values.tolist()

In [183]:
data = data.drop("specialities",axis=1)

In [184]:
data.head()

Unnamed: 0,common complains,list
0,Chest pain or discomfort (angina),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Shortness of breath,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Rapid or irregular heartbeats (arrhythmias),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Heart failure,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,High blood pressure (hypertension),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [185]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 40
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
     

In [186]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe["common complains"]
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [187]:
train_size = 0.8
train_data=data.sample(frac=train_size,random_state=200)
test_data=data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (159, 2)
TRAIN Dataset: (127, 2)
TEST Dataset: (32, 2)


In [188]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [189]:
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 11)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [190]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [191]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [192]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()

In [193]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.6818211674690247


32it [01:40,  3.15s/it]
0it [00:00, ?it/s]

Epoch: 1, Loss:  0.3841126561164856


32it [01:46,  3.32s/it]
0it [00:00, ?it/s]

Epoch: 2, Loss:  0.32754847407341003


32it [01:51,  3.48s/it]
0it [00:00, ?it/s]

Epoch: 3, Loss:  0.2817577123641968


32it [01:24,  2.63s/it]
0it [00:00, ?it/s]

Epoch: 4, Loss:  0.31751811504364014


32it [01:52,  3.51s/it]
0it [00:00, ?it/s]

Epoch: 5, Loss:  0.293416291475296


32it [01:51,  3.49s/it]
0it [00:00, ?it/s]

Epoch: 6, Loss:  0.2588900327682495


32it [01:50,  3.45s/it]
0it [00:00, ?it/s]

Epoch: 7, Loss:  0.26266583800315857


32it [02:05,  3.94s/it]
0it [00:00, ?it/s]

Epoch: 8, Loss:  0.2286326289176941


32it [02:20,  4.40s/it]
0it [00:00, ?it/s]

Epoch: 9, Loss:  0.21947216987609863


32it [01:39,  3.09s/it]
0it [00:00, ?it/s]

Epoch: 10, Loss:  0.18686968088150024


32it [01:36,  3.02s/it]
0it [00:00, ?it/s]

Epoch: 11, Loss:  0.1854100078344345


32it [01:39,  3.11s/it]
0it [00:00, ?it/s]

Epoch: 12, Loss:  0.16824181377887726


32it [01:52,  3.51s/it]
0it [00:00, ?it/s]

Epoch: 13, Loss:  0.14253562688827515


32it [02:01,  3.81s/it]
0it [00:00, ?it/s]

Epoch: 14, Loss:  0.12529468536376953


32it [01:49,  3.41s/it]
0it [00:00, ?it/s]

Epoch: 15, Loss:  0.09120436012744904


32it [01:47,  3.35s/it]
0it [00:00, ?it/s]

Epoch: 16, Loss:  0.0975562110543251


32it [01:36,  3.00s/it]
0it [00:00, ?it/s]

Epoch: 17, Loss:  0.13571734726428986


32it [01:29,  2.81s/it]
0it [00:00, ?it/s]

Epoch: 18, Loss:  0.08806805312633514


32it [03:02,  5.70s/it]
0it [00:00, ?it/s]

Epoch: 19, Loss:  0.07218506187200546


32it [01:43,  3.24s/it]
0it [00:00, ?it/s]

Epoch: 20, Loss:  0.09419343620538712


32it [01:33,  2.91s/it]
0it [00:00, ?it/s]

Epoch: 21, Loss:  0.07149429619312286


32it [01:29,  2.80s/it]
0it [00:00, ?it/s]

Epoch: 22, Loss:  0.061674121767282486


32it [01:39,  3.10s/it]
0it [00:00, ?it/s]

Epoch: 23, Loss:  0.05508360639214516


32it [02:19,  4.36s/it]
0it [00:00, ?it/s]

Epoch: 24, Loss:  0.06544960290193558


32it [02:13,  4.16s/it]
0it [00:00, ?it/s]

Epoch: 25, Loss:  0.0888465866446495


32it [01:49,  3.42s/it]
0it [00:00, ?it/s]

Epoch: 26, Loss:  0.06191366910934448


32it [01:43,  3.25s/it]
0it [00:00, ?it/s]

Epoch: 27, Loss:  0.055416204035282135


32it [01:46,  3.31s/it]
0it [00:00, ?it/s]

Epoch: 28, Loss:  0.052084218710660934


32it [01:48,  3.39s/it]
0it [00:00, ?it/s]

Epoch: 29, Loss:  0.07040351629257202


32it [01:45,  3.30s/it]
0it [00:00, ?it/s]

Epoch: 30, Loss:  0.04454401507973671


32it [01:39,  3.10s/it]
0it [00:00, ?it/s]

Epoch: 31, Loss:  0.038650479167699814


32it [02:11,  4.11s/it]
0it [00:00, ?it/s]

Epoch: 32, Loss:  0.03569065406918526


32it [01:36,  3.01s/it]
0it [00:00, ?it/s]

Epoch: 33, Loss:  0.03136682137846947


32it [01:45,  3.31s/it]
0it [00:00, ?it/s]

Epoch: 34, Loss:  0.029208488762378693


32it [01:07,  2.11s/it]
0it [00:00, ?it/s]

Epoch: 35, Loss:  0.03319096565246582


32it [01:38,  3.06s/it]
0it [00:00, ?it/s]

Epoch: 36, Loss:  0.07026325911283493


32it [01:14,  2.33s/it]
0it [00:00, ?it/s]

Epoch: 37, Loss:  0.03385933116078377


32it [01:12,  2.26s/it]
0it [00:00, ?it/s]

Epoch: 38, Loss:  0.03912796825170517


32it [01:18,  2.44s/it]
0it [00:00, ?it/s]

Epoch: 39, Loss:  0.029892582446336746


32it [01:19,  2.48s/it]


In [194]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            if _%5000==0:
                print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            
    return fin_outputs, fin_targets

In [195]:
outputs, targets = validation(testing_loader)

1it [00:00,  1.57it/s]

Epoch: 39, Loss:  0.2509323060512543


8it [00:04,  1.62it/s]


In [90]:
def range_with_floats(start, stop, step):
    while stop > start:
        yield start
        start += step

In [196]:
outputs

[[0.02967296913266182,
  0.02961972914636135,
  0.014226323924958706,
  0.07588532567024231,
  0.02723970264196396,
  0.01687607169151306,
  0.02587633579969406,
  0.06535188853740692,
  0.6742538809776306,
  0.016136035323143005,
  0.025569645687937737],
 [0.018758021295070648,
  0.7552823424339294,
  0.01302087027579546,
  0.0332782119512558,
  0.026653112843632698,
  0.01479150727391243,
  0.014577707275748253,
  0.018366757780313492,
  0.014068931341171265,
  0.013364764861762524,
  0.021176205947995186],
 [0.7698275446891785,
  0.008709144778549671,
  0.025341909378767014,
  0.016520481556653976,
  0.011799785308539867,
  0.014695705845952034,
  0.015824249014258385,
  0.007609314750880003,
  0.00934426486492157,
  0.020456969738006592,
  0.015009932219982147],
 [0.8662148118019104,
  0.017134979367256165,
  0.013025546446442604,
  0.06902726739645004,
  0.01264997385442257,
  0.013328583911061287,
  0.015114621259272099,
  0.010961401276290417,
  0.012771531008183956,
  0.0107013

In [42]:
#threshold = []
#for i in range_with_floats(0.2,0.35,0.02):
#    tmp_preds = (np.array(outputs) > i).astype(int)
#    tmp_accuracy = metrics.accuracy_score(val_targets, tmp_preds)
#    threshold.append((i,tmp_accuracy))
    

8it [00:04,  1.87it/s]


In [197]:
final_outputs = np.array(outputs) >=0.5
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.40625
Hamming Loss = 0.07954545454545454


In [198]:
final_outputs

array([[False, False, False, False, False, False, False, False,  True,
        False, False],
       [False,  True, False, False, False, False, False, False, False,
        False, False],
       [ True, False, False, False, False, False, False, False, False,
        False, False],
       [ True, False, False, False, False, False, False, False, False,
        False, False],
       [ True, False, False, False, False, False, False, False, False,
        False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False],
       [False,  True, False, False, False, False, False, False, False,
        False, False],
       [False,  True, False, False, False, False, False, False, False,
        False, False],
       [False, False, False,  True, False, False, False, False, False,
        False, False],
       [False,  True, False, False, False, False, False, False, False,
        False, False],
       [False, False, False, False, False, False, False, Fal

In [199]:
from sklearn.metrics import classification_report
for i in range(len(final_outputs[0])):
    print(new_data_dummy.columns[i]+"\n",classification_report([t[i] for t in targets], [p[i] for p in final_outputs]))

Cardiology
               precision    recall  f1-score   support

         0.0       0.89      0.96      0.93        26
         1.0       0.75      0.50      0.60         6

    accuracy                           0.88        32
   macro avg       0.82      0.73      0.76        32
weighted avg       0.87      0.88      0.86        32

Dermatology
               precision    recall  f1-score   support

         0.0       1.00      0.87      0.93        30
         1.0       0.33      1.00      0.50         2

    accuracy                           0.88        32
   macro avg       0.67      0.93      0.71        32
weighted avg       0.96      0.88      0.90        32

Internal Medicine
               precision    recall  f1-score   support

         0.0       0.94      0.97      0.95        30
         1.0       0.00      0.00      0.00         2

    accuracy                           0.91        32
   macro avg       0.47      0.48      0.48        32
weighted avg       0.88      0

### expanded dataset

In [200]:
expand_data = pd.read_excel("/Users/ziyuewang/Desktop/Inference Analytics/complain data/User complain input_top11_expanded.xlsx")

In [201]:
expand_data.drop(columns = ["Unnamed: 0"],axis=1,inplace = True)

In [202]:
expand_data

Unnamed: 0,common complains,specialities
0,Chest aches visit_www.ncfta.org discomfort (re...,Cardiology
1,Atrial fibrillation,Cardiology
2,Cardiac Nathan_Helburn,Cardiology
3,Angevine_Middle blodd presssure (hypertension ),Cardiology
4,Heart attach,Cardiology
...,...,...
1312,"Eye strain, headaches, e fatigue whih reading ...",Optometry
1313,Eye injuries home@timesdispatch.com spokespers...,Optometry
1314,Dizzying changhings Yandarbiyev_Chechnya_actin...,Optometry
1315,Sudden change ein vision ar loss of vision,Optometry


In [203]:
data = pd.read_excel("User complain input_0202_top11_explode.xlsx")

In [204]:
new_data_dummy = pd.get_dummies(data.specialities)

In [205]:
data["list"] = new_data_dummy[new_data_dummy.columns[0:]].values.tolist()

In [206]:
data.head()

Unnamed: 0.1,Unnamed: 0,common complains,specialities,list
0,0,Chest pain or discomfort (angina),Cardiology,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,0,Shortness of breath,Cardiology,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,0,Rapid or irregular heartbeats (arrhythmias),Cardiology,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,0,Heart failure,Cardiology,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,0,High blood pressure (hypertension),Cardiology,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [207]:
expanded_data_dummy = pd.get_dummies(expand_data.specialities)

In [208]:
expand_data["list"] = expanded_data_dummy[expanded_data_dummy.columns[0:]].values.tolist()

In [209]:
expand_data = expand_data.drop("specialities",axis=1)

In [210]:
expand_data.head()

Unnamed: 0,common complains,list
0,Chest aches visit_www.ncfta.org discomfort (re...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Atrial fibrillation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Cardiac Nathan_Helburn,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Angevine_Middle blodd presssure (hypertension ),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Heart attach,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [211]:
X,y = data["common complains"], data.list

In [212]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [213]:
train_data = expand_data[~expand_data['common complains'].isin(x_test)]

In [214]:
train_data.shape

(1236, 2)

In [215]:
train_data.reset_index(inplace=True)

In [216]:
test_data = pd.DataFrame()

In [217]:
test_data["common complains"] = x_test

In [218]:
test_data["list"] = y_test

In [219]:
test_data.reset_index(inplace=True)

In [220]:
training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

In [221]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [222]:
model = DistilBERTClass()
model.to(device)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [123]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.6804958581924438


295it [14:31,  2.95s/it]


KeyboardInterrupt: 

In [None]:
outputs, targets = validation(testing_loader)

In [None]:
outputs

In [None]:
final_outputs = np.array(outputs) >=0.2
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

In [None]:
from sklearn.metrics import classification_report
for i in range(len(final_outputs[0])):
    print(new_data_dummy.columns[i]+"\n",classification_report([t[i] for t in targets], [p[i] for p in final_outputs]))

### Kaggle dataset

In [251]:
import pandas as pd
data = pd.read_csv("/Users/ziyuewang/Desktop/Inference Analytics/mtsamples.csv")

In [252]:
data.drop(columns=["Unnamed: 0"],inplace = True)

In [253]:
data.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [254]:
from nltk.corpus import stopwords
import nltk as nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))
ps = nltk.wordnet.WordNetLemmatizer()
def pre_process(text):
    
    if type(text)!=str:
        text = ""
    #remove all urls andand hashtags
    # actually rarely happen
    text = re.sub(r'(?:\@|http?\://|https?\://|www)\S+','',text)
    
    #remove new_lines
    text = re.sub(r'(?:\n)',' ',text)
    
    # remove hashtags
    text = re.sub(r'#(?=\w+)','',text)

    # replace all number bullet points uses ('1.', '2.', etc)
    text = re.sub('\d+. ', ' ', text)
    
    # remove special characters
    text = re.sub('[^a-zA-Z]', ' ',text)
    
    text = text.lower()
    
    text = text.split()
    text = [ps.lemmatize(word) for word in text if not word in stopwords]
    
    text = " ".join(text)

    return text

[nltk_data] Downloading package punkt to /Users/ziyuewang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ziyuewang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [255]:
data.transcription = data.transcription.apply(pre_process)

In [256]:
data.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,subjective year old white female present compl...,"allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,past medical history difficulty climbing stair...,"bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,history present illness seen abc today pleasan...,"bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,mode left atrial enlargement left atrial diame...,"cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,left ventricular cavity size wall thickness ap...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [257]:
data.medical_specialty.unique()

array([' Allergy / Immunology', ' Bariatrics',
       ' Cardiovascular / Pulmonary', ' Neurology', ' Dentistry',
       ' Urology', ' General Medicine', ' Surgery', ' Speech - Language',
       ' SOAP / Chart / Progress Notes', ' Sleep Medicine',
       ' Rheumatology', ' Radiology', ' Psychiatry / Psychology',
       ' Podiatry', ' Physical Medicine - Rehab',
       ' Pediatrics - Neonatal', ' Pain Management', ' Orthopedic',
       ' Ophthalmology', ' Office Notes', ' Obstetrics / Gynecology',
       ' Neurosurgery', ' Nephrology', ' Letters',
       ' Lab Medicine - Pathology', ' IME-QME-Work Comp etc.',
       ' Hospice - Palliative Care', ' Hematology - Oncology',
       ' Gastroenterology', ' ENT - Otolaryngology', ' Endocrinology',
       ' Emergency Room Reports', ' Discharge Summary',
       ' Diets and Nutritions', ' Dermatology',
       ' Cosmetic / Plastic Surgery', ' Consult - History and Phy.',
       ' Chiropractic', ' Autopsy'], dtype=object)

In [258]:
data.medical_specialty = data.medical_specialty.map(lambda x: x.strip().split(" / "))

In [259]:
data.medical_specialty

0             [Allergy, Immunology]
1                      [Bariatrics]
2                      [Bariatrics]
3       [Cardiovascular, Pulmonary]
4       [Cardiovascular, Pulmonary]
                   ...             
4994          [Allergy, Immunology]
4995          [Allergy, Immunology]
4996          [Allergy, Immunology]
4997          [Allergy, Immunology]
4998          [Allergy, Immunology]
Name: medical_specialty, Length: 4999, dtype: object

In [260]:
X = data["transcription"]

In [261]:
specialities = data['medical_specialty'].explode().unique()

In [262]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(data['medical_specialty'])
# transform target variable
y = multilabel_binarizer.transform(data['medical_specialty'])

In [263]:
need_data = pd.DataFrame()
need_data["common complains"] = X

In [264]:
need_data["list"] = [i for i in y]

In [265]:
need_data

Unnamed: 0,common complains,list
0,subjective year old white female present compl...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,past medical history difficulty climbing stair...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,history present illness seen abc today pleasan...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,mode left atrial enlargement left atrial diame...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,left ventricular cavity size wall thickness ap...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
4994,history pleasure meeting evaluating patient re...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4995,admitting diagnosis kawasaki disease discharge...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4996,subjective year old white female come today co...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4997,chief complaint year old male present child ho...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [266]:
train_size = 0.8
train_data = need_data.sample(frac=train_size,random_state=200)
test_data = need_data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(need_data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (4999, 2)
TRAIN Dataset: (3999, 2)
TEST Dataset: (1000, 2)


In [284]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [285]:
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 47)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [286]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [287]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [289]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()

In [290]:
for epoch in range(20):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.6851615309715271


1000it [41:27,  2.49s/it]
0it [00:00, ?it/s]

Epoch: 1, Loss:  0.09063061326742172


1000it [37:41,  2.26s/it]
0it [00:00, ?it/s]

Epoch: 2, Loss:  0.11559092998504639


1000it [38:04,  2.28s/it]
0it [00:00, ?it/s]

Epoch: 3, Loss:  0.0773862898349762


1000it [38:12,  2.29s/it]
0it [00:00, ?it/s]

Epoch: 4, Loss:  0.09848082065582275


97it [03:45,  2.32s/it]


KeyboardInterrupt: 

In [291]:
outputs, targets = validation(testing_loader)

1it [00:00,  1.40it/s]

Epoch: 4, Loss:  0.09801194071769714


250it [02:51,  1.46it/s]


In [292]:
outputs

[[0.0015786588191986084,
  0.0021914388053119183,
  0.002785635180771351,
  0.019222380593419075,
  0.007512817159295082,
  0.0025683685671538115,
  0.016063297167420387,
  0.004883551504462957,
  0.0064170784316957,
  0.005832377355545759,
  0.0014066509902477264,
  0.008876882493495941,
  0.027113232761621475,
  0.006788407452404499,
  0.0038435093592852354,
  0.07251493632793427,
  0.010807329788804054,
  0.035383440554142,
  0.017510944977402687,
  0.002097234595566988,
  0.0029004025273025036,
  0.0013569612056016922,
  0.0015796686057001352,
  0.003533173818141222,
  0.011800185777246952,
  0.013523953035473824,
  0.03194915130734444,
  0.026497453451156616,
  0.0068394485861063,
  0.02011769264936447,
  0.09106988459825516,
  0.01531815342605114,
  0.009888600558042526,
  0.0038402851205319166,
  0.006492396350950003,
  0.012143840081989765,
  0.007262757513672113,
  0.004627486225217581,
  0.004812471568584442,
  0.018633294850587845,
  0.010341779328882694,
  0.001872294466011

In [307]:
final_outputs = np.array(outputs) >=0.16
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.3522166666666667
Hamming Loss = 0.035936170212765954


In [308]:
from sklearn.metrics import classification_report
for i in range(len(final_outputs[0])):
    print(classification_report([t[i] for t in targets], [p[i] for p in final_outputs]))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       999
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       1.00      1.00      1.00      1000

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1000

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       995
         1.0       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       0.99      0.99      0.99      1000

              precision    recall  f1-score   support

         0.0      