# Generated Data Predictions

Import the dependencies needed to work with the synthetic dataset.

In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import time

The links below point to the dataset, which has been split into the `train data`, the `test data` and the `development set data`.

In [2]:
# Remote CSV location of each split of the synthetic dataset.
SPLIT_URLS = {
    "train": "https://raw.githubusercontent.com/halfmoonliu/InjuryNoteLabel/main/Data/TrainSetGen.csv",
    "test": "https://raw.githubusercontent.com/halfmoonliu/InjuryNoteLabel/main/Data/TestSetGen.csv",
    "dev": "https://raw.githubusercontent.com/halfmoonliu/InjuryNoteLabel/main/Data/DevSetGen.csv",
}

# Read the splits in the same order as before: train, then test, then dev.
train_df, test_df, dev_df = (
    pd.read_csv(SPLIT_URLS[split]) for split in ("train", "test", "dev")
)

In [3]:
# import matplotlib.pyplot as plt
# train_df['event'].hist()

# # Adding titles and labels to the histogram
# plt.title('Event Distribution in Synthetic Data')
# plt.xlabel('Event')
# plt.ylabel('Frequency')

# plt.show()

#### Tokenisation

Tokenising the synthetic dataset using the pretrained BERT tokenizer.

In [4]:
# Load the pretrained tokenizer that is used to split every report into tokens.
# NOTE(review): this is the *cased* checkpoint, but the reports are lower-cased
# before being tokenized further down — confirm that combination is intended.
PRE_TRAINED_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

#### Training on the synthetic dataset

In [5]:
# Build the injury-report corpus: a list with one list of tokens per report.
# Every report is lower-cased before it is handed to the tokenizer.

train_report_l = train_df["text"].tolist()
training_corpus = []
for raw_report in train_report_l:
    training_corpus.append(tokenizer.tokenize(raw_report.lower()))

print(len(training_corpus))  # number of tokenized reports
print(training_corpus[0])  # tokens of the first report

183856
['37', 'works', 'd', 'shoulder', '#', '#', 'f', 'as', 's', 'at', 'p', '#', '#', 'pet', '2', '1', 'upper', 'with', 'car', '39', 'back', 'shoulder', 'tight', 'his', 'fire', '#', '#', 'u', '##ro', 'r', 'day', 'yo', 'worse', 'pain', 'yo', '#', '#', 'f', 'hands', 'noticed', 'hard', 'o', 'living', 'strain', '#', '#', 'm', '31', '41', 'leg', 'motion', 'hands', 'pain', 'does', '28', 'm', 'but', 'construction', 'work', 'bent', '43', 'while', 'at', 'back', 'factory', '#', '#', 't', '##re', '#', '#', 'm', 'at', '49', 'cook', 'works']


In [6]:
# Build the training vocabulary as two mirrored lookups.
tok_id_d = {}  # token -> integer id
id_tok_d = {}  # integer id -> token

# Ids are handed out in order of first appearance, starting at 0.
for sentence in training_corpus:
    for tok in sentence:
        if tok in tok_id_d:
            continue
        new_id = len(tok_id_d)
        tok_id_d[tok] = new_id
        id_tok_d[new_id] = tok

# Reserve the next free id for "UNK", the bucket used for tokens that
# never occurred in the training corpus.
tok_ind = len(tok_id_d)
tok_id_d["UNK"] = tok_ind
id_tok_d[tok_ind] = "UNK"

In [7]:
# get vocab length
all_toks = id_tok_d.keys()
print("length of the vocabulary:")
print(len(all_toks))
print(tok_id_d["UNK"])
print(id_tok_d[1000])

length of the vocabulary:
7475
7474
chains


#### Preparing the synthetic dataset

In [8]:
def MapTokInd(tok, tokMap):
    """Return the vocabulary index of ``tok``.

    Args:
        tok: The token to look up.
        tokMap: Mapping from token to index. It must contain an ``"UNK"``
            entry for the fallback to work.

    Returns:
        ``tokMap[tok]`` when ``tok`` is a key of ``tokMap``; otherwise the
        index stored under ``"UNK"``.
    """
    return tokMap[tok] if tok in tokMap else tokMap["UNK"]

In [9]:
# Build the bag-of-words design matrix and the label vector for training.
# x_train[i, j] holds how many times vocabulary token j occurs in report i.
train_corpus_frag = training_corpus
n_vocab = len(all_toks)

# Start from zeros so the counts can be accumulated in place; this avoids
# allocating a temporary row vector for every single report.
x_train = np.zeros((len(train_corpus_frag), n_vocab))

# Keep the labels aligned with the rows of x_train. When train_corpus_frag is
# the full corpus the slice is a no-op; if it is shortened to a leading
# fragment of the corpus, the labels are shortened the same way.
y_train = train_df["event"].to_numpy()[: len(train_corpus_frag)]

# Iterate over the same sequence that sized x_train (train_corpus_frag), not
# over training_corpus, so the row source and the matrix shape cannot disagree.
for si, report_toks in enumerate(train_corpus_frag):
    for tok in report_toks:
        x_train[si, MapTokInd(tok, tok_id_d)] += 1

In [10]:
# Fit a multinomial Naive Bayes classifier on the token-count matrix and
# report how long the fit takes.
NBModel = MultinomialNB()

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the model is training.
start_time = time.perf_counter()

NBModel.fit(x_train, y_train)

end_time = time.perf_counter()

# Elapsed training time in seconds.
elapsed_time = end_time - start_time
print(f"Time spent training the corpus: {elapsed_time} seconds")

Time spent training the corpus: 11.92273211479187 seconds


#### Training Accuracy and Predictions


In [11]:
# Score the fitted model on the same rows it was trained on.
y_train_pred = NBModel.predict(x_train)

# Accuracy = share of reports whose predicted event equals the true event.
n_train_correct = np.sum(y_train == y_train_pred)
train_accuracy = n_train_correct / x_train.shape[0]

print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.9163856496388478


#### Test Set Accuracy and Predictions

In [12]:
# Build the test corpus: lower-case every test report and tokenize it.

test_report_l = test_df["text"].tolist()
test_corpus = []
for raw_report in test_report_l:
    test_corpus.append(tokenizer.tokenize(raw_report.lower()))

print(len(test_corpus))  # number of tokenized test reports
print(test_corpus[0])  # tokens of the first test report

22982
['patient', '#', '#', 's', '51', 'and', '32', 'o', '#', '#', 'x', '#', '#', 't', '#', '#', 'f', 'wrist', '#', '#', 'f', '4', '#', '#', 'yo', 'and', 'lifting', 'bakery', '29', 'back', 'ne', '#', '#', 'y', '##m', 'upper', '#', '#', 'um', '##ba', '#', '#', 'r', '##s', 'and', 'at', 's', 'd', '#', '#', 'c', '#', '#', 'm', 'l', '#', '#', 'x', 'who', 'to', 'at', 'strain', 'r', '35', '50', 'of', 'strained', 'work', 'and', 'of']


In [13]:
# Build the token-count matrix and the label vector for the test set, using
# the vocabulary learned from the training corpus.
vocab_size = len(all_toks)
y_test = test_df["event"].to_numpy()
x_test = np.empty((len(test_corpus), vocab_size))

# One row per test report; every row is fully overwritten below.
for row, report_toks in enumerate(test_corpus):
    token_counts = np.zeros(vocab_size)
    for tok in report_toks:
        # Tokens missing from the training vocabulary land in the "UNK" column.
        token_counts[MapTokInd(tok, tok_id_d)] += 1
    x_test[row] = token_counts

In [14]:
# Predict an event label for every report in the test set with the fitted model.

y_pred = NBModel.predict(x_test)

In [15]:
# Display the predicted event labels for the test set.
y_pred

array([71, 63, 63, ..., 55, 27, 99])

In [16]:
# Test-set accuracy, expressed as a percentage (0-100) rather than a fraction.
n_test_correct = np.sum(y_test == y_pred)
n_test_reports = x_test.shape[0]
accuracy = n_test_correct / n_test_reports * 100
accuracy

90.7666869724132

In [17]:
# Store the predictions next to the true "event" labels so both can be
# compared row by row. This adds the column to test_df in place.
test_df["test predictions"] = y_pred
test_df

Unnamed: 0,id,text,event,event_ind,test predictions
0,11826,patient ##s 51 and 32 o ##x ##t ##f wrist ##f ...,71,41,71
1,10112,striking male ##ture d finger concussion la at...,63,34,63
2,2165,metal 19 r on while while wound ##m ##c finger...,63,34,63
3,28845,##ym work ##yo o a pain neck which pain lower ...,71,41,71
4,16989,l when construction ##on ##x bucket lifting ##...,71,41,71
...,...,...,...,...,...
22977,13437,wound w of urine when body ##w eye was ##l ##t...,55,28,55
22978,381,ankle 40 p 59 at ##pra leg last w rolled ##in ...,41,17,41
22979,9819,the ##ing s up soap ##t work her needle stick ...,55,28,55
22980,326,a 4th him video woods to ##x over ##en i drive...,27,11,27


#### Checking the accuracy for the dev set

In [18]:
# Build the development-set corpus: lower-case every report and tokenize it.
dev_report_l = dev_df["text"].tolist()
dev_corpus = []
for raw_report in dev_report_l:
    dev_corpus.append(tokenizer.tokenize(raw_report.lower()))

print(len(dev_corpus))  # number of tokenized dev reports
print(dev_corpus[0])  # tokens of the first dev report

22982
['radiating', '26', '#', '#', 'm', 'back', 'at', 'and', 'work', 'd', 'l', '33', '#', '#', 'p', '##ra', '#', '#', 'in', 'sitting', '#', '#', 'x', 'd', 'd', 'went', 'foot', 'ankle', '#', '#', 'x', 'w', 'p', 'facility', 'work', 'works', 'o', 'on', 'climbs', 'd', 'just', 'and', 'work', 'injury', '30', 'shelf', 'muscle', 'time', 'has', 'she', 'o', 'when']


In [19]:
# Build the token-count matrix and the label vector for the development set,
# using the vocabulary learned from the training corpus.
vocab_size = len(all_toks)
y_test_dev = dev_df["event"].to_numpy()
x_test_dev = np.empty((len(dev_corpus), vocab_size))

# One row per dev report; every row is fully overwritten below.
for row, report_toks in enumerate(dev_corpus):
    token_counts_dev = np.zeros(vocab_size)
    for tok in report_toks:
        # Tokens missing from the training vocabulary land in the "UNK" column.
        token_counts_dev[MapTokInd(tok, tok_id_d)] += 1
    x_test_dev[row] = token_counts_dev

In [20]:
# Predict an event label for every report in the development set with the fitted model.

y_pred_dev = NBModel.predict(x_test_dev)

In [23]:
# Store the predictions next to the true "event" labels so both can be
# compared row by row. This adds the column to dev_df in place.
dev_df["predictions"] = y_pred_dev
dev_df

Unnamed: 0,id,text,event,event_ind,predictions
0,4139,radiating 26 ##m back at and work d l 33 ##pra...,73,43,73
1,10073,##mento robe pain days ##to states was cars ##...,73,43,73
2,20004,work l chest ankle con o if at fell knee knee ...,42,18,42
3,36266,drill work ##m to 17 tire opening in using on ...,62,33,62
4,22571,56 ##al m toe work left back ##ture ##tus con ...,62,33,62
...,...,...,...,...,...
22977,5734,def got machine ##g ##bra ##d 2 lace thumb dis...,64,35,64
22978,27513,his him thigh f grin closed con,62,33,62
22979,1624,when ##fo o o l w work pain his ##yo at ##m ##...,70,40,71
22980,10704,open ##x ##l wood wood today 15 water her ##yo...,62,33,62


In [22]:
# Development-set accuracy as a fraction of correctly labelled reports
# (a value between 0 and 1, not a percentage).
n_dev_correct = np.sum(y_test_dev == y_pred_dev)
n_dev_reports = x_test_dev.shape[0]
accuracy = n_dev_correct / n_dev_reports
accuracy

0.9086676529457837