### **Necessary Imports and Installations**


In [None]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.6-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 5.2 MB/s 
[?25hCollecting streamlit
  Downloading streamlit-1.8.1-py2.py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 24.5 MB/s 
Collecting wandb>=0.10.32
  Downloading wandb-0.12.14-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 38.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 37.5 MB/s 
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 50.3 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux20

In [None]:
import warnings

import numpy as np
import pandas as pd
import torch

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from simpletransformers.ner import NERModel, NERArgs

warnings.filterwarnings("ignore")

### **Data Preparation**

In [None]:
# reading the data: one CSV holds every split; its `dataset` column is used
# further down to separate the train / validation / test rows
df = pd.read_csv('./Summer Internship - Homework Exercise.csv')

In [None]:
# displaying all rows for visual observation:
# let pandas render up to 300 rows before it truncates a displayed frame
pd.set_option('display.max_rows', 300)

In [None]:
# Splitting data into train-val-test using the `dataset` column.
# Each split is an explicit copy: the preprocessing step later overwrites the
# 'transaction_descriptor' column of these frames, and writing into a
# boolean-mask selection of `df` is ambiguous in pandas (SettingWithCopyWarning).
train_df = df[df['dataset']=='train'].copy()
val_df = df[df['dataset']=='validation'].copy()
test_df = df[df['dataset']=='test'].copy()

In [None]:
# quick look at the first rows of the training and validation splits
print(train_df.head())
print(val_df.head())

           transaction_descriptor store_number dataset
0  DOLRTREE 2257 00022574 ROSWELL         2257   train
1                  AUTOZONE #3547         3547   train
2           TGI FRIDAYS 1485 0000         1485   train
3          BUFFALO WILD WINGS 003            3   train
4                  J. CREW #568 0          568   train
                       transaction_descriptor store_number     dataset
100                              DEL TACO 833          833  validation
101                 NNT BURLNGTON STORE472605       472605  validation
102                            WENDY'S #05320         5320  validation
103                        DUNKIN #337734 Q35       337734  validation
104  MCDONALD'S F565          CLARKSVILLE  TN         F565  validation


### **Auxiliary functions for data processing**

In [None]:
# removing all punctuations and multiple whitespaces

def remove_punctuation(text):
    """Replace every non-alphanumeric character with a space and collapse whitespace.

    Args:
        text: the raw descriptor string.

    Returns:
        The cleaned string: alphanumeric runs separated by single spaces, with
        no leading or trailing whitespace.
    """
    pieces = []
    for ch in text:
        # A space maps to itself either way, so only alphanumerics need keeping.
        pieces.append(ch if ch.isalnum() else ' ')
    # split() with no argument drops empty fields, which squeezes repeated spaces.
    return ' '.join(''.join(pieces).split())

In [None]:
# as none of the store numbers start with a zero, we remove all leading zeros from every token

def removezeros(str):
    """Split a descriptor into tokens and strip the leading zeros from each token.

    Tokens that consist only of zeros are dropped entirely. The result is built
    as a new list rather than by popping from the list being indexed, so a
    dropped token can no longer cause the following token to be skipped or an
    IndexError to be raised.

    Args:
        str: the whitespace-separated descriptor string. The name shadows the
            builtin; it is kept so existing keyword callers keep working.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    tokens = []
    for token in str.split():
        stripped = token.lstrip('0')
        # An empty result means the token was made up of zeros only: drop it.
        if stripped:
            tokens.append(stripped)
    return tokens

In [None]:
# turning each word of a sentence (from the 'transaction_descriptor' column) into its own row of the dataframe and assigning it a label

def sentence_to_words(df):
  """Flatten tokenised descriptors into parallel per-word lists.

  Args:
      df: DataFrame with a 'transaction_descriptor' column holding a sequence
          of tokens per row and a 'store_number' column.

  Returns:
      A tuple (sentence_ids, words, labels) of equally long lists. Each word
      carries the id 'sentence: <row position>' of the row it came from and the
      label 'This' when it is exactly equal to that row's store number,
      otherwise 'O'.
  """
  sentence_ids, words, labels = [], [], []
  rows = zip(df['transaction_descriptor'], df['store_number'])
  # enumerate gives the row position, independent of the frame's index labels.
  for position, (tokens, store_number) in enumerate(rows):
      for token in tokens:
          sentence_ids.append(f'sentence: {position}')
          words.append(token)
          labels.append('This' if store_number == token else 'O')
  return sentence_ids, words, labels

In [None]:
# function applies all necessary preparations to the DataFrame
def apply_preprocessing(df):
  """Clean the descriptors of `df` and return them as one labelled row per word.

  Side effect: the 'transaction_descriptor' column of `df` is overwritten in
  place, first with the punctuation-free string and then with the list of
  tokens produced by removezeros.

  Args:
      df: DataFrame with 'transaction_descriptor' and 'store_number' columns.

  Returns:
      A new DataFrame with the columns 'words', 'sentence_id' and 'labels'.
  """
  # Run the cleaning steps one after another, writing each result back to df.
  for step in (remove_punctuation, removezeros):
    df['transaction_descriptor'] = df['transaction_descriptor'].apply(step)
  sentence_ids, words, labels = sentence_to_words(df)
  # Naming the Series up front makes concat produce the final column names.
  columns = [
      pd.Series(words, name="words"),
      pd.Series(sentence_ids, name="sentence_id"),
      pd.Series(labels, name='labels'),
  ]
  return pd.concat(columns, axis=1)

In [None]:
# preprocessing train-val-test DataFrames
train_df_processed = apply_preprocessing(train_df)
val_df_processed = apply_preprocessing(val_df)
# The result for the test split is only displayed, not stored. The call still
# matters: apply_preprocessing rewrites test_df['transaction_descriptor'] in place
# into token lists, and the prediction section joins those lists back into strings.
apply_preprocessing(test_df)

Unnamed: 0,words,sentence_id,labels
0,IN,sentence: 0,O
1,N,sentence: 0,O
2,OUT,sentence: 0,O
3,BURGER,sentence: 0,O
4,242,sentence: 0,This
...,...,...,...
363,REPUBLIC,sentence: 98,O
364,8109,sentence: 98,This
365,BOSTON,sentence: 99,O
366,MARKET,sentence: 99,O


### **Loading model and training**

In [None]:
# assigning sentence_ids numerical value by label encoder
# (only the training frame is encoded here; val_df_processed keeps its
# 'sentence: N' string ids)
train_df_processed["sentence_id"] = LabelEncoder().fit_transform(train_df_processed["sentence_id"] )

In [None]:
# check the encoded sentence ids on the first two rows
train_df_processed.head(2)

Unnamed: 0,words,sentence_id,labels
0,DOLRTREE,0,O
1,2257,0,This


In [None]:
# distinct tag names seen in the training data, in order of first appearance;
# this list is handed to NERModel as its `labels`
label = train_df_processed['labels'].unique().tolist()
label

['O', 'This']



*   Defining Training Hyperparameters



In [None]:
args = NERArgs()
args.manual_seed = 42 # fixed seed so repeated runs give comparable results
args.num_train_epochs = 10
args.learning_rate = 1e-4
args.overwrite_output_dir = True # let a re-run overwrite the output directory of a previous run
args.train_batch_size = 16
args.eval_batch_size = 16

In [None]:
# loading model, giving labels and all training arguments;
# the GPU is used when one is available, otherwise the model falls back to the CPU
# instead of failing at construction time
model = NERModel('bert','bert-base-uncased', labels=label, args = args, use_cuda=torch.cuda.is_available())

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
# model training and evaluating on Validation dataset
# NOTE(review): `args.evaluate_during_training` is never set in this notebook, so
# `eval_data` and the extra `acc` metric passed to train_model are presumably not
# used while training -- confirm against the simpletransformers documentation.
model.train_model(train_df_processed,eval_data=val_df_processed, acc = accuracy_score)
# the validation metrics shown below come from this explicit evaluation call
result, model_outputs, preds_lst = model.eval_model(val_df_processed)

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# showing the validation metrics returned by eval_model (loss, F1, precision, recall)
result

{'eval_loss': 0.2570812867571866,
 'f1_score': 0.8717948717948718,
 'precision': 0.8173076923076923,
 'recall': 0.9340659340659341}

### **Predicting on Test Data**

In [None]:
# preparing test data for a prediction: preprocessing left each descriptor as a
# list of tokens, so join the tokens back into one space-separated string.
# Values that are already strings are kept unchanged, which makes this cell safe
# to re-run (joining a string would put a space between every character).
test_df['transaction_descriptor'] = test_df['transaction_descriptor'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)
test_df.head(2)

Unnamed: 0,transaction_descriptor,store_number,dataset
200,IN N OUT BURGER 242,242,test
201,BP 9442088LIBERTYVILLE B,9442088,test


In [None]:
# taking predictions with ['This'] label and mapping to the store number  
def map_outputs(sentence_lst, ner_model=None):
  """Predict tags for each sentence and return the first word tagged 'This'.

  Args:
      sentence_lst: the sentences to run the prediction on.
      ner_model: object with a `predict(sentences)` method returning
          `(predictions, raw_outputs)`; defaults to the notebook-level `model`.

  Returns:
      A list with exactly one entry per prediction: the first word whose tag is
      'This', or np.nan when the sentence has no such word (including when the
      sentence produced no word predictions at all).
  """
  if ner_model is None:
    ner_model = model  # the NERModel created earlier in the notebook
  predictions, raw_outputs = ner_model.predict(sentence_lst)
  if len(predictions) > 0:
    print(predictions[0]) # one example of prediction output
  mapped_outputs = []
  for sentence_prediction in predictions:
    # Start from "not found" for every sentence so that an empty prediction
    # list cannot inherit the outcome of the previous sentence.
    store_number = np.nan
    for word_prediction in sentence_prediction:
      # each entry is a one-item dict {word: tag}
      word, tag = next(iter(word_prediction.items()))
      if tag == 'This':
        store_number = word
        break
    mapped_outputs.append(store_number)
  return mapped_outputs

# getting the predictions on the test dataset
test_df['pred_store_number'] = map_outputs(test_df['transaction_descriptor'])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/7 [00:00<?, ?it/s]

[{'IN': 'O'}, {'N': 'O'}, {'OUT': 'O'}, {'BURGER': 'O'}, {'242': 'This'}]


In [None]:
# flagging each test row: 1 when the predicted store number equals the labelled one, 0 otherwise
test_df['correct_preds'] = (test_df['pred_store_number'] == test_df['store_number']).astype(int)

In [None]:
# first test rows with the predicted store number and the correctness flag
test_df.head()

Unnamed: 0,transaction_descriptor,store_number,dataset,pred_store_number,correct_preds
200,IN N OUT BURGER 242,242,test,242,1
201,BP 9442088LIBERTYVILLE B,9442088,test,B,0
202,JCPENNEY 1419,1419,test,1419,1
203,ROSS STORES 1019,1019,test,1019,1
204,WM SUPERCENTER 38,38,test,38,1


In [None]:
# calculating the accuracy for test data: the mean of the 0/1 flags is the share of correct predictions
accuracy = test_df['correct_preds'].mean()
accuracy

0.88