# Installing and importing libraries

In [None]:
#!pip install simpletransformers

In [2]:
# connecting Google Drive with data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn.utils import class_weight
from pathlib import *

from simpletransformers.ner import NERModel, NERArgs

# Train data

In [5]:
train = pd.read_csv('/content/drive/MyDrive/MIPT/Texts_NER_train.csv')

In [6]:
train.columns = ["sentence_id", "words", "labels"]
train.head(5)

Unnamed: 0,sentence_id,words,labels
0,0,using,O
1,0,information,O
2,0,dialogue,O
3,0,participants,O
4,0,setsuo,O


In [7]:
# deleting incorrect data
data_nan = []
for row, data in train.iterrows():
    if not (type(data['words']) is str):
        data_nan.append(row)

for row in data_nan:
    train.drop(labels = row,axis = 0, inplace = True)
train.tail(5)

Unnamed: 0,sentence_id,words,labels
106380,7481,november,O
106381,7481,pages,O
106382,7481,los,O
106383,7481,altos,O
106384,7482,kaufmann,O


# Model training

In [None]:
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(train.labels), y=train.labels)
model_args = NERArgs(labels_list = ['B', 'O'], num_train_epochs = 4,\
                     max_seq_length = 512, use_early_stopping = True,\
                     overwrite_output_dir = True)

model = NERModel("bert", "bert-base-cased", args=model_args,\
                  weight = list(class_weights), use_cuda=True, cuda_device=-1)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [None]:
model.train_model(train)

  return [


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/866 [00:00<?, ?it/s]



Running Epoch 1 of 4:   0%|          | 0/866 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/866 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/866 [00:00<?, ?it/s]

(3464, 0.059945422730687745)

# Evaluation of the result

In [None]:
input_path = '/content/drive/MyDrive/MIPT/Texts_NER_1'

# training data
paths = ['00/A00-1006.txt', '01/H01-1001.txt', '02/C02-1001.txt', '03/J03-1001.txt', '04/N04-1002.txt', '05/I05-2002.txt',\
        '06/E06-1001.txt', '65/C65-1002.txt', '67/C67-1002.txt', '69/C69-0101.txt', '73/C73-1001.txt', '75/T75-1002.txt',\
        '78/T78-1001.txt', '79/P79-1002.txt', '80/P80-1001.txt', '81/J81-1001.txt', '82/P82-1001.txt', '83/A83-1001.txt',\
        '84/J84-1001.txt', '85/E85-1002.txt', '86/H86-1001.txt', '87/T87-1004.txt', '88/A88-1001.txt', '89/H89-1006.txt',\
        '90/W90-0101.txt', '91/M91-1001.txt', '92/M92-1001.txt', '93/X93-1001.txt', '94/W94-0101.txt', '95/M95-1001.txt',\
        '96/X96-1002.txt', '97/A97-1001.txt', '98/X98-1001.txt', '99/E99-1001.txt']

precision = []
recall = []
count = 0

for direct in Path(input_path).iterdir():
    number = direct.stem
    new_input_path = Path(input_path + '/' + number)
    for item in Path(new_input_path).iterdir():
        if number + '/' + item.name not in paths:
            sentence = ''
            sentences = []
            truth = []

            # creating a list of terms for a document
            with open(item, 'r', errors="ignore") as fin:
                num = 0
                for line in fin.readlines():
                    words = line.split()
                    if num != int(words[0]):
                        sentences.append(sentence)
                        sentence = ''
                        num = int(words[0])
                    sentence += ' ' + words[1]
                    if len(words) > 2:
                        if words[2] == 'B':
                            truth.append(words[1])
            truth = set(truth)

            # prediction
            predictions, raw_outputs = model.predict(sentences)
            result=[]

            for n, (preds, outs) in enumerate(zip(predictions, raw_outputs)):
                for pred, out in zip(preds, outs):
                    key = list(pred.keys())[0]
                    new_out = out[key]
                    preds = list(softmax(np.mean(new_out, axis=0)))
                    if pred[key] == 'B':
                        result.append(key)

            result = set(result)
            precision.append(len(result & truth) / len(result))
            recall.append(len(result & truth) / len(truth))

            # progress
            if count % 100 == 0:
                print('Progress: ', count)
            count += 1

In [None]:
# counting the result
prec = sum(precision) / len(precision)
print('precision: ', prec)
rec = sum(recall) / len(recall)
print('recall: ', rec)
print('F1: ', 2 * prec * rec / (prec + rec))

precision:  0.7632082708233291
recall:  0.8897341990300788
F1:  0.8216287159641416


In [None]:
# the second way of evaluation
sentence = ''
sentences = []
truth = []
with open('Texts_NER.txt', 'r') as fin:
    num = 0
    for line in fin.readlines():
        words = line.split()
        if num != int(words[0]):
            sentences.append(sentence)
            sentence = ''
            num = int(words[0])
        sentence += ' ' + words[1]
        if words[2] == 'B':
            truth.append(words[1])
truth = set(truth)
print(sentences[5])

 these customer service centers use interactive voice response ivr systems the frontend for determining the users need providing list options that the user can choose from and then routing the call appropriately the


In [None]:
predictions, raw_outputs = model.predict(sentences)
result=[]

for n, (preds, outs) in enumerate(zip(predictions, raw_outputs)):
    for pred, out in zip(preds, outs):
        key = list(pred.keys())[0]
        new_out = out[key]
        preds = list(softmax(np.mean(new_out, axis=0)))
        if pred[key] == 'B':
            result.append(key)

result = set(result)
print(len(result))
print(len(truth))
print(len(result & truth))
prec = len(result & truth) / len(result)
print('precision: ', prec)
rec = len(result & truth) / len(truth)
print('recall: ', rec)
print('F1: ', 2 * prec * rec / (prec + rec))

  0%|          | 0/2 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1830 [00:00<?, ?it/s]

550
478
350
precision:  0.6363636363636364
recall:  0.7322175732217573
F1:  0.6809338521400777
