# **Used 15,000 reports from the pickle file to train and test the model**

In [None]:
!pip install stanza
!pip install tqdm
!pip install transformers

In [None]:
# Test if CUDA is available: print the NVIDIA driver / GPU status table
!nvidia-smi

Mon Nov 30 07:41:44 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    11W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Linear algebra and array datastructures
import numpy as np

# For easy handling of dataset
import pandas as pd

# For Accuracy and F1 metrics
from sklearn import metrics

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch dependencies
import torch
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Supress unnecessary warnings from Transformers and Matplotlib
import re
import warnings
import logging

In [None]:
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None

def set_global_logging_level(level=logging.ERROR, prefices=[""]):
    """Set ``level`` on every registered logger whose name starts with a given prefix.

    Args:
        level: logging level to apply (defaults to ``logging.ERROR``).
        prefices: list of name prefixes (joined as regex alternatives); the
            default ``[""]`` matches every logger name.
    """
    # Anchor at the start of the name and accept any of the prefixes
    name_pattern = re.compile("^(?:" + "|".join(prefices) + ")")
    for logger_name in logging.root.manager.loggerDict:
        if name_pattern.match(logger_name) is None:
            continue
        logging.getLogger(logger_name).setLevel(level)

# Monkey-patch warnings.warn with the no-op `warn` so warnings raised through it are swallowed
warnings.warn = warn
# Raise every already-registered logger to ERROR (the default prefix "" matches all names)
set_global_logging_level(logging.ERROR)

In [None]:
from torch import cuda
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
# Convert the labels into one-hot encoded vectors

df = pd.read_csv('/content/product_dataset_50k.csv')

# One indicator column per distinct label. The dtype is pinned to int so the
# vectors contain 0/1 regardless of the pandas default (bool since pandas 2.0).
one_hot = pd.get_dummies(df["Label"], dtype=int)

# Remember the indicator column names so they can be selected by name below,
# instead of by position (which would silently pick up any extra CSV column).
label_columns = list(one_hot.columns)

# Replace the categorical 'Label' column with its one-hot columns
df = df.drop('Label', axis=1).join(one_hot)
df.to_csv("./product_dataset_onehot.csv", sep=',', index=True)

del one_hot

# Collapse the indicator columns into a single list-valued target column
df['list'] = df[label_columns].values.tolist()
new_df = df[['Text', 'list']].copy()

new_df.head()

Unnamed: 0,Text,list
0,currently unknown whether device may cause con...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,clinician report day implant place fdi 26 fail...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,insulin pump button response due flattened dom...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
3,currently unknown whether device may cause con...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,implant fayl due loss osseointegrate,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [None]:
def learning_plots(train_loss, validation_loss):
    """Plot training and validation loss curves on one figure and show it.

    Args:
        train_loss: sequence of training loss values (one point per entry).
        validation_loss: sequence of validation loss values, drawn in orange.
    """
    # Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
    # the old name was later removed; use whichever this installation provides
    # and keep the default style if neither is available.
    for style_name in ('seaborn', 'seaborn-v0_8'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    plt.suptitle('Loss Curves', fontsize=35)
    plt.plot(train_loss, label='Training Loss')
    plt.plot(validation_loss, color='orange', label='Validation Loss')
    plt.legend()
    plt.xlabel('Steps/Batches')
    plt.ylabel('Loss')
    plt.show()
  
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix as an annotated seaborn heatmap and show it.

    Args:
        confusion_matrix: square matrix of integer counts (cells are formatted with "d").
        class_names: labels used for both the rows (true) and columns (predicted).
        figsize: size of the matplotlib figure.
        fontsize: font size of the tick labels.
    """
    matrix_frame = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)

    # Open a fresh figure; it becomes the current figure the heatmap is drawn into
    plt.figure(figsize=figsize)
    axes = sns.heatmap(matrix_frame, annot=True, fmt="d", cbar=True)

    # Horizontal row labels, slanted column labels
    y_axis = axes.yaxis
    x_axis = axes.xaxis
    y_axis.set_ticklabels(y_axis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    x_axis.set_ticklabels(x_axis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)

    plt.ylabel('True Labels')
    plt.xlabel('Predicted Labels')
    plt.show()

In [None]:
# Configuration section

# Key hyper-parameters used later on during training
MAX_LEN = 300   # Maximum token length per sample; crucial, but limited GPU memory prevents raising it towards 512
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 24  # Can be lowered to 8 (safe with a basic GPU and low RAM)
EPOCHS = 1
LEARNING_RATE = 1e-05
# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenises the 'Text' column and pairs it with the 'list' target vector.

    Args:
        dataframe: pandas DataFrame with a 'Text' column (raw text) and a
            'list' column (one target vector per row).
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: fixed sequence length every sample is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe['Text']
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the tensors for the sample at position ``index``.

        Returns:
            dict with 'ids', 'mask' and 'token_type_ids' (long tensors) and
            'targets' (float tensor).

        Raises:
            IndexError: if ``index`` is out of range.
        """
        # Positional access (.iloc) so the dataset works whatever the frame's
        # index labels are; label-based access only worked after reset_index().
        comment_text = str(self.comment_text.iloc[index])
        # Collapse any run of whitespace into a single space
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Creating the datasets for the neural network
# Split ratios: Training (70%), Validation (15%), Testing (15%)

train_size = 0.7

# Random 70% sample; the fixed random_state makes the split reproducible
train_dataset = new_df.sample(frac=train_size, random_state=200)

# Everything not sampled for training (remaining 30%), selected via the original index labels
validation_dataset = new_df.drop(train_dataset.index)

# Half of the held-out rows (not half of the entire dataset) become the test set
test_dataset = validation_dataset.sample(frac=0.5, random_state=200)

# Remove the test rows from the validation set so the two do not overlap
validation_dataset = validation_dataset.drop(test_dataset.index)

# Only now renumber the rows 0..n-1: the drops above rely on the original index labels
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)
validation_dataset = validation_dataset.reset_index(drop=True)

print(f"Unique samples in Trainset: {len(train_dataset)}, Testset: {len(test_dataset)} and Validationset: {len(validation_dataset)}")

# Wrap each split in a torch Dataset that tokenises to MAX_LEN tokens
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
validation_set = CustomDataset(validation_dataset, tokenizer, MAX_LEN)

Unique samples in Trainset: 35000, Testset: 7500 and Validationset: 7500


In [None]:
# Keyword arguments for each DataLoader; all three shuffle and load in the main process
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)
validation_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

# One loader per split
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
validation_loader = DataLoader(validation_set, **validation_params)

In [None]:
# Creating the customized model: a dropout and a dense layer on top of the pre-trained BERT encoder ('bert-base-uncased') produce the final output.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear layer producing 13 logits."""

    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 13) # 13 output classes for 50,000 documents

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) class logits for a batch.

        Args:
            ids: token id tensor passed to the encoder.
            mask: attention mask tensor.
            token_type_ids: segment id tensor.

        Returns:
            Output of the final linear layer (13 values per sample).
        """
        encoder_outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Take the second output (the pooled representation) by index. Tuple
        # unpacking breaks when the encoder returns a dict-like ModelOutput,
        # because iterating it yields key names instead of tensors; integer
        # indexing works for both the tuple and the ModelOutput return types.
        pooled_output = encoder_outputs[1]
        dropped = self.l2(pooled_output)
        output = self.l3(dropped)
        return output

# Build the classifier and move its parameters to the selected device
model = BERTClass()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``.

    The sigmoid is applied inside the loss, so ``outputs`` must be un-activated logits.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
# Adam over all model parameters (encoder and classification head), so the whole network is fine-tuned
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def validation(epoch):
    """Run the model over ``validation_loader`` without gradients.

    Args:
        epoch: current epoch number (accepted for symmetry with ``train``; not used).

    Returns:
        List with one loss value (Python float) per validation batch.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            labels = batch['targets'].to(device, dtype=torch.float)

            logits = model(input_ids, attention_mask, segment_ids)
            batch_losses.append(loss_fn(logits, labels).item())

    return batch_losses

In [None]:
def train(epoch, validate_every=1):
    """Train the model for one pass over ``training_loader``.

    Args:
        epoch: current epoch number (used for logging and passed to ``validation``).
        validate_every: run a full validation pass every this many training
            steps. The default of 1 keeps the original behaviour (validation
            after every step), which costs one complete pass over the
            validation set per training batch; larger values make training
            much cheaper. Between validation passes the most recent
            validation loss is carried forward so both returned lists stay
            aligned step-for-step.

    Returns:
        Tuple ``(epoch_loss, validation_epoch_loss)``: the training loss of
        every step and the (mean) validation loss recorded for every step.

    Raises:
        ValueError: if ``validate_every`` is smaller than 1.
    """
    if validate_every < 1:
        raise ValueError("validate_every must be a positive integer")

    epoch_loss = []
    validation_epoch_loss = []
    # Stays NaN only if the validation loader yields no batches
    validation_loss = float('nan')

    for step, data in enumerate(training_loader, 0):
        # validation() switches the model to eval mode, so re-enable training mode each step
        model.train()
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()

        # Periodic validation; step 0 always validates
        if step % validate_every == 0:
            val_epoch_loss = validation(epoch)
            if val_epoch_loss:
                validation_loss = sum(val_epoch_loss)/len(val_epoch_loss)

        if step % 200 == 0:
            print(f'Epoch: {epoch}, Step: {step}, Training Loss:  {loss.item()}, Validation Loss:  {validation_loss}\n')

        epoch_loss.append(loss.item())
        validation_epoch_loss.append(validation_loss)

    print("\nTraining Complete!")
    return epoch_loss, validation_epoch_loss

In [None]:
# Optional accumulator for per-epoch mean training loss (see note in the loop):
#train_loss = []

for epoch in range(EPOCHS):
    # Both names are overwritten on every iteration, so only the last epoch's curves remain afterwards
    train_epoch_loss, validation_epoch_loss = train(epoch)
    # When running more than 1 epoch, collect the mean loss per epoch to plot the final training curve, e.g.:
    # train_loss.append(sum(train_epoch_loss)/len(train_epoch_loss))

Epoch: 0, Step: 0, Training Loss:  0.6909028887748718, Validation Loss:  0.6726009318242058



In [None]:
# Plot the loss curves of the (last) epoch: one point per training batch/step.
# In our case 1 epoch is more than enough for training.

learning_plots(train_epoch_loss, validation_epoch_loss)

In [None]:
def testing(epoch):
    """Run the model over ``testing_loader`` and gather predictions.

    Args:
        epoch: current epoch number (not used inside the function).

    Returns:
        Tuple ``(fin_outputs, fin_targets)`` of nested Python lists: the
        sigmoid of the model output for every sample and the matching targets.
    """
    model.eval()
    collected_targets, collected_probs = [], []
    with torch.no_grad():
        for batch in testing_loader:
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            labels = batch['targets'].to(device, dtype=torch.float)

            logits = model(input_ids, attention_mask, segment_ids)
            probabilities = torch.sigmoid(logits)

            collected_targets.extend(labels.cpu().detach().numpy().tolist())
            collected_probs.extend(probabilities.cpu().detach().numpy().tolist())
    return collected_probs, collected_targets

In [None]:
for epoch in range(EPOCHS):
    outputs, targets = testing(epoch)

    # Keep the sigmoid probabilities in `outputs` and threshold into a separate
    # array. Overwriting `outputs` with the boolean mask made every later
    # `outputs.argmax(axis=1)` return the FIRST class above the threshold (or
    # class 0 when none was), not the most probable class.
    outputs = np.array(outputs)
    predictions = outputs >= 0.3

    # Multi-label style metrics on the thresholded predictions
    accuracy = metrics.accuracy_score(targets, predictions)
    f1_score_micro = metrics.f1_score(targets, predictions, average='micro')
    f1_score_macro = metrics.f1_score(targets, predictions, average='macro')
    f1_score_weighted = metrics.f1_score(targets, predictions, average='weighted')

    print(f"\nOverall Accuracy = {accuracy}")
    print(f"\nOverall F1 Score (Micro) = {f1_score_micro}")
    print(f"\nOverall F1 Score (Macro) = {f1_score_macro}")
    print(f"\nOverall F1 Score (Weighted) = {f1_score_weighted}")


    labels_15k = [0,1,2,3,4,5,6,7,8]
    label_names_15k = ["CAW", "DXZ", "DZE", "FTM", "GAS", "HRY", "JAA", "MRD", "OYC"]

    labels_50k = [0,1,2,3,4,5,6,7,8,9,10,11,12]
    label_names_50k = ["BYG", "CAW", "CCN", "DXZ", "DZE", "FTM", "GAS", "HRY", "HWC", "JAA", "LWQ", "MRD", "OYC"]

    targets = np.asarray(targets)

    # Single-label view: true class vs. most probable predicted class
    confusion_mtx = metrics.confusion_matrix(targets.argmax(axis=1), outputs.argmax(axis=1), labels = labels_50k)


In [None]:
# Per-class precision/recall/F1. argmax over axis 1 turns each row of `targets` and
# `outputs` (both left behind by the evaluation cell) into a single class index.
print("\nOverall Classification Report:")
print(f"\n {metrics.classification_report(targets.argmax(axis=1), outputs.argmax(axis=1), labels = labels_50k, target_names = label_names_50k)}")

In [None]:
# Heatmap of the confusion matrix computed in the evaluation cell, labelled with the 13 product codes
print_confusion_matrix(confusion_mtx, label_names_50k)

**Experiments** (Still debugging some issues with torch tensor shapes)

In [None]:
def get_item_tensor(sentence, max_length=200):
    """Tokenise a single sentence with the global ``tokenizer`` for inference.

    Args:
        sentence: raw text to encode.
        max_length: fixed length the encoding is padded/truncated to
            (defaults to 200, the value previously hard-coded here).

    Returns:
        dict with 1-D long tensors 'ids', 'mask' and 'token_type_ids'
        (no batch dimension).
    """
    inputs = tokenizer.encode_plus(
        sentence,
        None,
        add_special_tokens=True,
        max_length=max_length,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
        padding='max_length',
        # Without truncation, longer inputs would exceed max_length
        truncation=True,
        return_token_type_ids=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
    }

In [None]:
# Real-time inference to map "event descriptions to corresponding product codes"

print("\n1. Classify Sentence")
print("\n2. Exit")

# Inference only: switch off dropout so repeated calls give the same output
model.eval()

while True:
    try:
        option = int(input("\nPlease select any option: "))
    except ValueError:
        # Non-numeric input used to crash the loop; ask again instead
        print("\nInvalid option, please enter 1 or 2")
        continue

    if option == 1:
        sentence = input("\nPlease enter a description which you want to classify: ")
        tokenized_output = get_item_tensor(sentence)

        # The model expects batched inputs, so ALL three tensors need a leading
        # batch dimension (token_type_ids was previously passed un-batched,
        # which caused the tensor shape mismatch).
        ids = tokenized_output["ids"].unsqueeze(0).to(device, dtype = torch.long)
        mask = tokenized_output["mask"].unsqueeze(0).to(device, dtype = torch.long)
        token_type_ids = tokenized_output["token_type_ids"].unsqueeze(0).to(device, dtype = torch.long)

        # No gradients are needed for prediction
        with torch.no_grad():
            output = model(ids, mask, token_type_ids)
        print("\n")
        # Raw logits, one value per class
        print(output.tolist())

    else:
        print("\nBye")
        break

----------------------------------------------------------------------------------------------------------------------------------------------------------------

# **Code to create Dataset (csv) with One-hot Embedded Labels**

In [None]:
import string
import stanza
import pickle
import nltk
from tqdm import tqdm
from pprint import pprint
nltk.download('stopwords')
from nltk.corpus import stopwords
#from spacy_stanza import StanzaLanguage
#from spacy.attrs import ORTH, NORM


class MaudePreprocessor():
    """Tokenise, lemmatise and clean report text with a stanza pipeline.

    The pipeline uses the English 'craft' package with the
    tokenize, pos and lemma processors.
    """

    def __init__(self):
        stanza.download('en', package='craft', processors='tokenize,pos,lemma') #package='mimic'
        self.nlp = stanza.Pipeline('en', package='craft', processors='tokenize,pos,lemma')
        # Filled on first use by _get_stop_words()
        self._stop_words = None

    def _get_stop_words(self):
        """Return the English stop words as a set, loading them only once."""
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            # stopwords.words() builds a fresh list on every call; caching it as
            # a set avoids reloading and linearly scanning it for every token.
            cached = frozenset(stopwords.words('english'))
            self._stop_words = cached
        return cached

    def tokenizer_lemmatizer(self, text):
        """Split ``text`` into sentences of lower-cased lemmas.

        Returns:
            List of sentences, each a list of lemma strings.
        """
        doc = self.nlp(text)
        return [[word.lemma.lower() for word in sent.words] for sent in doc.sentences]

    def clean(self, seg_text):
        """Drop stop words, punctuation and one-character tokens from every sentence.

        Args:
            seg_text: list of sentences, each a list of token strings.

        Returns:
            List of sentences (same order, possibly empty) with the kept tokens.
        """
        stop_words = self._get_stop_words()
        clean_text = []
        for sent in seg_text:
            # / should probably not be in punctuation, (B)(4) should be one token
            # NOTE: `tok not in string.punctuation` is a substring test, so a
            # multi-character token is dropped only if it appears contiguously
            # in string.punctuation (e.g. "()").
            clean_sent = [
                tok for tok in sent
                if len(tok) > 1
                and tok not in stop_words
                and tok not in string.punctuation
            ]
            clean_text.append(clean_sent)
        return clean_text

    def pipe(self, text, ssplit=False):
        """Run lemmatisation followed by cleaning.

        Args:
            text: raw text to process.
            ssplit: when True keep the per-sentence nesting; when False
                (default) return one flat list of tokens.
        """
        t = self.tokenizer_lemmatizer(text)
        t = self.clean(t)
        if not ssplit:
            t = [token for sent in t for token in sent]
        return t


if __name__ == "__main__":
    path = '/content/drive/MyDrive/100000_random_entries_prod_codes (1).pkl'
    P = MaudePreprocessor()

    # SECURITY NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    # The context manager closes the file handle (it was previously left open).
    with open(path, 'rb') as f:
        subset = pickle.load(f)

    final_texts = []
    label_list = []

    # Don't forget to add TQDM when code is moved to .py instead of a notebook
    for key in subset:
        record = subset[key]

        # Records without narrative text cannot be used for text classification.
        # Previously such a record re-used the texts of the preceding record
        # (or raised NameError if it was the first one); skip it instead.
        if "mdr_text" not in record:
            continue

        label = record["device"][0]['device_report_product_code']

        # Pre-process every text entry of the report, then merge them into one document
        per_device_texts = [" ".join(P.pipe(entry['text'])) for entry in record["mdr_text"]]

        final_texts.append(" ".join(per_device_texts))
        label_list.append(label)

        # When TQDM is used, remove this print block
        if len(label_list) % 5000 == 0:
            print(f"Processed {len(label_list)} files so far")

        # Stop once 50,000 labelled documents have been collected
        if len(label_list) >= 50000:
            print("Complete!!!")
            break

In [None]:
# Dataset csv file dumper: one row per processed report, columns "Text" and "Label"

df = pd.DataFrame(data={"Text": final_texts, "Label": label_list})
# index=False: the row index is not written as a column
df.to_csv("./product_dataset.csv", sep=',', index=False)
print("Dump complete!")