# Setup

In [2]:
!pip install fasttext
!pip install pytorch-crf
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 3.1 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.1-py3-none-any.whl (216 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3157832 sha256=170ce0e52fa4d14bbf805099bb07805a4063889458b60d913d081ff18ac48597
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
import io
from math import log
from numpy import array
from numpy import argmax
import torch
import random
from math import log
from numpy import array
from numpy import argmax
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
from torchcrf import CRF
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR
from typing import List, Tuple, AnyStr
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from copy import deepcopy
import datasets as ds
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
import heapq

In [5]:
!apt-get install mecab mecab-ipadic-utf8 libmecab-dev swig

!pip install mecab-python3

!pip install unidic-lite

!pip install --no-binary :all: mecab-python3

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libmecab2 mecab-ipadic mecab-jumandic mecab-jumandic-utf8 mecab-utils
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  libmecab-dev libmecab2 mecab mecab-ipadic mecab-ipadic-utf8 mecab-jumandic
  mecab-jumandic-utf8 mecab-utils swig swig3.0
0 upgraded, 10 newly installed, 0 to remove and 4 not upgraded.
Need to get 30.1 MB of archives.
After this operation, 282 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libmecab2 amd64 0.996-5 [257 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libmecab-dev amd64 0.996-5 [308 kB]
Get:3 http://archive.ubuntu.com/ub

In [6]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

# Data preprocessing


## Download and prepare the data

We'll use a small set of Wikipedia data labelled with people, locations, organizations, and "miscellaneous" entities.

In [7]:
ds = load_dataset("copenlu/answerable_tydiqa")
ds

Downloading metadata:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.94k [00:00<?, ?B/s]



Downloading and preparing dataset None/None (download: 75.43 MiB, generated: 131.78 MiB, post-processed: Unknown size, total: 207.21 MiB) to /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 116067
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 13325
    })
})

In [9]:
train_set = ds['train'].to_pandas()
validation_set = ds['validation'].to_pandas()

In [11]:
eng_train_set = train_set[train_set['language'] == 'english'].reset_index(drop=True)
fin_train_set = train_set[train_set['language'] == 'finnish'].reset_index(drop=True)
jap_train_set = train_set[train_set['language'] == 'japanese'].reset_index(drop=True)

eng_validation_set = validation_set[validation_set['language'] == 'english'].reset_index(drop=True)
fin_validation_set = validation_set[validation_set['language'] == 'finnish'].reset_index(drop=True)
jap_validation_set = validation_set[validation_set['language'] == 'japanese'].reset_index(drop=True)

In [13]:
def get_answerability(annotations):
    answerable = []
    for annot in annotations:
        if -1 in annot['answer_start']:
            answerable.append(0)
        else:
            answerable.append(1)
    return answerable

eng_train_annotations = eng_train_set['annotations'].tolist()
fin_train_annotations = fin_train_set['annotations'].tolist()
jap_train_annotations = jap_train_set['annotations'].tolist()

eng_validation_annotations = eng_validation_set['annotations'].tolist()
fin_validation_annotations = fin_validation_set['annotations'].tolist()
jap_validation_annotations = jap_validation_set['annotations'].tolist()


eng_train_set['answerable'] = get_answerability(eng_train_annotations)
fin_train_set['answerable'] = get_answerability(fin_train_annotations)
jap_train_set['answerable'] = get_answerability(jap_train_annotations)

eng_validation_set['answerable'] = get_answerability(eng_validation_annotations)
fin_validation_set['answerable'] = get_answerability(fin_validation_annotations)
jap_validation_set['answerable'] = get_answerability(jap_validation_annotations)



# datasets['train'].features['answerable'] = get_answerability(train_annotations)
# datasets['validation'].features['answerable'] = get_answerability(validation_annotations)

# t_ans = get_answerability(train_annotations)
# v_ans = get_answerability(validation_annotations)

# datasets['train'] = datasets['train'].add_column('answerable', t_ans)
# datasets['validation'] = datasets['validation'].add_column('answerable', v_ans)

In [14]:
eng_ans_text_train = np.empty((len(eng_train_set), 1), dtype=object)
fin_ans_text_train = np.empty((len(fin_train_set), 1), dtype=object)
jap_ans_text_train = np.empty((len(jap_train_set), 1), dtype=object)

for i in range(len(eng_train_set)):
    eng_ans_text_train[i] = eng_train_annotations[i]['answer_text']
eng_train_set['answer_text'] = eng_ans_text_train

for i in range(len(fin_train_set)):
    fin_ans_text_train[i] = fin_train_annotations[i]['answer_text']
fin_train_set['answer_text'] = fin_ans_text_train

for i in range(len(jap_train_set)):
    jap_ans_text_train[i] = jap_train_annotations[i]['answer_text']
jap_train_set['answer_text'] = jap_ans_text_train


eng_ans_text_val = np.empty((len(eng_validation_set), 1), dtype=object)
fin_ans_text_val = np.empty((len(fin_validation_set), 1), dtype=object)
jap_ans_text_val = np.empty((len(jap_validation_set), 1), dtype=object)

for i in range(len(eng_validation_set)):
    eng_ans_text_val[i] = eng_validation_annotations[i]['answer_text']
eng_validation_set['answer_text'] = eng_ans_text_val

for i in range(len(fin_validation_set)):
    fin_ans_text_val[i] = fin_validation_annotations[i]['answer_text']
fin_validation_set['answer_text'] = fin_ans_text_val

for i in range(len(jap_validation_set)):
    jap_ans_text_val[i] = jap_validation_annotations[i]['answer_text']
jap_validation_set['answer_text'] = jap_ans_text_val

In [15]:
jap_train_set['answer_text']

0            カリフォルニア州サンフランシスコ
1                 ケント州ダートフォード
2                        ラーオ族
3       桝太一（日本テレビアナウンサー）と関根麻里
4                 懲役2年・執行猶予5年
                ...          
8773                         
8774                         
8775                         
8776                         
8777                         
Name: answer_text, Length: 8778, dtype: object

In [16]:
from nltk.tokenize import WordPunctTokenizer, TreebankWordTokenizer, RegexpTokenizer
import MeCab

# punct = WordPunctTokenizer()
# tree = TreebankWordTokenizer()
reg = RegexpTokenizer('\w+[\'\-+]?\w*|\[\d+\]')
# reg = RegexpTokenizer('\w+[\'\-+]?\w*|[1-9]\d{0,2}(,\d{3})*(\.\d+)?|\[\d+\]')

wakati = MeCab.Tagger("-Owakati")

eng_train_set['answer_tokens'] = [reg.tokenize(i) for i in eng_train_set.answer_text.values]
eng_validation_set['answer_tokens'] = [reg.tokenize(i) for i in eng_validation_set.answer_text.values]
eng_train_set['tokens'] = [reg.tokenize(i) for i in eng_train_set.document_plaintext.values]
eng_validation_set['tokens'] = [reg.tokenize(i) for i in eng_validation_set.document_plaintext.values]
eng_train_set['que_tokens'] = [reg.tokenize(i) for i in eng_train_set.question_text.values]
eng_validation_set['que_tokens'] = [reg.tokenize(i) for i in eng_validation_set.question_text.values]

fin_train_set['answer_tokens'] = [reg.tokenize(i) for i in fin_train_set.answer_text.values]
fin_validation_set['answer_tokens'] = [reg.tokenize(i) for i in fin_validation_set.answer_text.values]
fin_train_set['tokens'] = [reg.tokenize(i) for i in fin_train_set.document_plaintext.values]
fin_validation_set['tokens'] = [reg.tokenize(i) for i in fin_validation_set.document_plaintext.values]
fin_train_set['que_tokens'] = [reg.tokenize(i) for i in fin_train_set.question_text.values]
fin_validation_set['que_tokens'] = [reg.tokenize(i) for i in fin_validation_set.question_text.values]

jap_train_set['answer_tokens'] = [wakati.parse(i).split() for i in jap_train_set.answer_text.values]
jap_validation_set['answer_tokens'] = [wakati.parse(i).split() for i in jap_validation_set.answer_text.values]
jap_train_set['tokens'] = [wakati.parse(i).split() for i in jap_train_set.document_plaintext.values]
jap_validation_set['tokens'] = [wakati.parse(i).split() for i in jap_validation_set.document_plaintext.values]
jap_train_set['que_tokens'] = [wakati.parse(i).split() for i in jap_train_set.question_text.values]
jap_validation_set['que_tokens'] = [wakati.parse(i).split() for i in jap_validation_set.question_text.values]






# datasets['train'].features['answer_text_tokenized'] = [reg.tokenize(i) for i in train_set.answer_text.values]
# datasets['validation'].features['answer_text_tokenized'] = validation_set['answer_text_tokenized']
# datasets['train'].features['doc_plaintext_tokenized_temp'] = train_set['doc_plaintext_tokenized_temp']
# datasets['validation'].features['doc_plaintext_tokenized_temp'] = validation_set['doc_plaintext_tokenized_temp']

In [17]:
jap_train_set['que_tokens']

0       [“, ダン, ”, ダニエル, ・, ジャド, ソン, ・, キャラハン, の, 出身, ...
1           [サー, ・, マイケル, ・, フィリップ, ・, ジャガー, の, 出身, は, ？]
2                [ラオス, 王国, の, 王族, の, 名前, は, なん, です, か, ？]
3       [日本, テレビ, 系列, 『, ZIP, !, 』, の, 初代, の, 司会, は, 誰...
4       [西村, 眞悟, 弁護, 士, 法, 違反, 事件, で, 西村, が, 受け, た, 罰則...
                              ...                        
8773                             [日本, 最大, の, 宗教, は, 何, ？]
8774                           [コソボ, の, 初代, 大統領, は, 誰, ？]
8775              [ギュスターヴ, ・, シャルパンティエ, は, いつ, 生まれ, た, ？]
8776                            [彭, 徳懐, は, いつ, 生まれ, た, ？]
8777                             [ブラームス, の, 出身, は, どこ, ？]
Name: que_tokens, Length: 8778, dtype: object

In [18]:
eng_train_set['cls'] = [['[CLS]'] for i in range(len(eng_train_set))]
eng_validation_set['cls'] = [['[CLS]'] for i in range(len(eng_validation_set))]

fin_train_set['cls'] = [['[CLS]'] for i in range(len(fin_train_set.answerable))]
fin_validation_set['cls'] = [['[CLS]'] for i in range(len(fin_validation_set))]

jap_train_set['cls'] = [['[CLS]'] for i in range(len(jap_train_set))]
jap_validation_set['cls'] = [['[CLS]'] for i in range(len(jap_validation_set))]

In [19]:
eng_train_set['que_doc_tokens'] = eng_train_set['que_tokens'] + eng_train_set['cls'] + eng_train_set['tokens']
eng_validation_set['que_doc_tokens'] = eng_validation_set['que_tokens'] + eng_validation_set['cls'] + eng_validation_set['tokens']

fin_train_set['que_doc_tokens'] = fin_train_set['que_tokens'] + fin_train_set['cls'] + fin_train_set['tokens']
fin_validation_set['que_doc_tokens'] = fin_validation_set['que_tokens'] + fin_validation_set['cls'] + fin_validation_set['tokens']

jap_train_set['que_doc_tokens'] = jap_train_set['que_tokens'] + jap_train_set['cls'] + jap_train_set['tokens']
jap_validation_set['que_doc_tokens'] = jap_validation_set['que_tokens'] + jap_validation_set['cls'] + jap_validation_set['tokens']

In [20]:
eng_train_set['que_doc_text'] = eng_train_set['question_text'].map(str) + ' [CLS] ' + fin_train_set['document_plaintext'].map(str)
eng_validation_set['que_doc_text'] = eng_validation_set['question_text'].map(str) + ' [CLS] ' + fin_validation_set['document_plaintext'].map(str)

fin_train_set['que_doc_text'] = fin_train_set['question_text'].map(str) + ' [CLS] ' + fin_train_set['document_plaintext'].map(str)
fin_validation_set['que_doc_text'] = fin_validation_set['question_text'].map(str) + ' [CLS] ' + fin_validation_set['document_plaintext'].map(str)

jap_train_set['que_doc_text'] = jap_train_set['question_text'].map(str) + ' [CLS] ' + jap_train_set['document_plaintext'].map(str)
jap_validation_set['que_doc_text'] = jap_validation_set['question_text'].map(str) + ' [CLS] ' + jap_validation_set['document_plaintext'].map(str)

In [21]:
print(jap_validation_set['answer_tokens'][0])
print(jap_validation_set['que_doc_tokens'][0])

['パリ']
['化学', '兵器', '禁止', '条約', 'は', 'どこ', 'で', '採択', 'さ', 'れ', 'た', '？', '[CLS]', '1993', '年', '1', '月', '13', '日', 'に', 'パリ', 'に', 'おい', 'て', '署名', 'が', 'なさ', 'れ', '、', '1997', '年', '4', '月', '29', '日', 'に', '発効', 'し', 'た', '[', '1', ']。', '実効', '的', 'な', '検証', '制度', 'を', '有する', 'こと', 'も', '特徴', 'で', 'あり', '[', '1', ']、', '条約', 'の', '発効', 'と', 'とも', 'に', '、', 'その', '第', '8', '条', 'に', '基づき', '、', 'オランダ', 'の', 'ハーグ', 'に', '査察', '実施', '機関', 'の', '化学', '兵器', '禁止', '機関', '（', 'OPCW', '）', 'が', '設置', 'さ', 'れ', 'た', '。']


In [22]:
def get_iob_tags(ans_text_token, doc_text_token, que_text_token):
    IOB = [ [] for _ in range(len(doc_text_token)) ]
    for i, list in enumerate(doc_text_token):
        for j, elm in enumerate(list):
            if j <= (len(que_text_token[i])):
                IOB[i].append(0)
            elif len(ans_text_token[i]) != 0 and elm == ans_text_token[i][0] and 2 not in IOB[i]:
                if len(ans_text_token[i]) > 1 and list[j+1] == ans_text_token[i][1]:
                    IOB[i].append(2)
                else:
                    IOB[i].append(2)
            elif IOB[i][j-1] == 2 and elm in ans_text_token[i] or IOB[i][j-1] == 1 and elm in ans_text_token[i]:
                IOB[i].append(1)
            else:
                IOB[i].append(0)
    return IOB

# def get_iob_tags(ans_text_token, doc_text_token, que_text_token):
#     IOB = [ [] for _ in range(len(doc_text_token)) ]
#     for i, elm in enumerate(doc_text_token):
#         for j, x in enumerate(elm):
#             if j <= (len(que_text_token[i])):
#                 IOB[i].append(3)
#             elif x in ans_text_token[i] and (j-1) >= 0 and IOB[i][j-1]==0 or x in ans_text_token[i] and j==0:
#                 IOB[i].append(2)
#             elif x in ans_text_token[i] and (j-1) >= 0 and IOB[i][j-1]==2 or x in ans_text_token[i] and (j-1) >= 0 and IOB[i][j-1]==1:
#                 IOB[i].append(1)
#             else:
#                 IOB[i].append(0)
#     return IOB

eng_train_ans_token = eng_train_set['answer_tokens'].tolist()
eng_validation_ans_token = eng_validation_set['answer_tokens'].tolist()
eng_train_doc_token = eng_train_set['tokens'].tolist()
eng_validation_doc_token = eng_validation_set['tokens'].tolist()
eng_train_que_token = eng_train_set['que_tokens'].tolist()
eng_validation_que_token = eng_validation_set['que_tokens'].tolist()
eng_train_que_doc_token = eng_train_set['que_doc_tokens'].tolist()
eng_validation_que_doc_token = eng_validation_set['que_doc_tokens'].tolist()

fin_train_ans_token = fin_train_set['answer_tokens'].tolist()
fin_validation_ans_token = fin_validation_set['answer_tokens'].tolist()
fin_train_doc_token = fin_train_set['tokens'].tolist()
fin_validation_doc_token = fin_validation_set['tokens'].tolist()
fin_train_que_token = fin_train_set['que_tokens'].tolist()
fin_validation_que_token = fin_validation_set['que_tokens'].tolist()
fin_train_que_doc_token = fin_train_set['que_doc_tokens'].tolist()
fin_validation_que_doc_token = fin_validation_set['que_doc_tokens'].tolist()

jap_train_ans_token = jap_train_set['answer_tokens'].tolist()
jap_validation_ans_token = jap_validation_set['answer_tokens'].tolist()
jap_train_doc_token = jap_train_set['tokens'].tolist()
jap_validation_doc_token = jap_validation_set['tokens'].tolist()
jap_train_que_token = jap_train_set['que_tokens'].tolist()
jap_validation_que_token = jap_validation_set['que_tokens'].tolist()
jap_train_que_doc_token = jap_train_set['que_doc_tokens'].tolist()
jap_validation_que_doc_token = jap_validation_set['que_doc_tokens'].tolist()




eng_train_set['IOB'] = get_iob_tags(eng_train_ans_token, eng_train_que_doc_token, eng_train_que_token)
eng_validation_set['IOB'] = get_iob_tags(eng_validation_ans_token, eng_validation_que_doc_token, eng_validation_que_token)

fin_train_set['IOB'] = get_iob_tags(fin_train_ans_token, fin_train_que_doc_token, fin_train_que_token)
fin_validation_set['IOB'] = get_iob_tags(fin_validation_ans_token, fin_validation_que_doc_token, fin_validation_que_token)

jap_train_set['IOB'] = get_iob_tags(jap_train_ans_token, jap_train_que_doc_token, jap_train_que_token)
jap_validation_set['IOB'] = get_iob_tags(jap_validation_ans_token, jap_validation_que_doc_token, jap_validation_que_token)

In [24]:
print(fin_train_set['IOB'][4])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [25]:
eng_validation_set.columns

Index(['question_text', 'document_title', 'language', 'annotations',
       'document_plaintext', 'document_url', 'answerable', 'answer_text',
       'answer_tokens', 'tokens', 'que_tokens', 'cls', 'que_doc_tokens',
       'que_doc_text', 'IOB'],
      dtype='object')

In [26]:
from datasets import DatasetDict, Dataset

def make_df(train_set, validation_set):
	# training_data = train_set['tokens']#.tolist()
	training_data = train_set['que_doc_tokens']#.tolist()
	training_labels = train_set['IOB'].tolist()

	# validation_data = validation_set['tokens']#.tolist()
	validation_data = validation_set['que_doc_tokens']#.tolist()
	validation_labels = validation_set['IOB'].tolist()

	data_set = {}
	sets = [['train',training_data, 
			# training_que, 
			training_labels], ['val', validation_data, 
			# validation_que, 
			validation_labels]]
	for meta in sets:
		data_set[meta[0]] = {}
		data_set[meta[0]]['text_tokens'] = []
		# data_set[meta[0]]['que_tokens'] = []
		data_set[meta[0]]['IOB_tags'] = []
		
		for ind, text in enumerate(meta[1]):
			data_set[meta[0]]['text_tokens'].append(text)
			# data_set[meta[0]]['que_tokens'].append(meta[2][ind])
			data_set[meta[0]]['IOB_tags'].append(meta[2][ind])
			
	data_set = DatasetDict({'train':Dataset.from_dict(data_set['train']),
                          'validation':Dataset.from_dict(data_set['val'])\
                        })
						
	return data_set

In [27]:
eng_set = make_df(eng_train_set, eng_validation_set)
fin_set = make_df(fin_train_set, fin_validation_set)
jap_set = make_df(jap_train_set, jap_validation_set)

print(eng_set['train'].features['IOB_tags'])

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)


In [28]:
jap_set

DatasetDict({
    train: Dataset({
        features: ['text_tokens', 'IOB_tags'],
        num_rows: 8778
    })
    validation: Dataset({
        features: ['text_tokens', 'IOB_tags'],
        num_rows: 1036
    })
})

In [29]:
print(eng_set['train']['IOB_tags'][4])

[0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [30]:
import datasets

eng_set['train'] = eng_set['train'].cast_column("IOB_tags", datasets.Sequence(datasets.ClassLabel(names=["O", "I", "B"])))
eng_set['validation'] = eng_set['train'].cast_column("IOB_tags", datasets.Sequence(datasets.ClassLabel(names=["O", "I", "B"])))

fin_set['train'] = fin_set['train'].cast_column("IOB_tags", datasets.Sequence(datasets.ClassLabel(names=["O", "I", "B"])))
fin_set['validation'] = fin_set['validation'].cast_column("IOB_tags", datasets.Sequence(datasets.ClassLabel(names=["O", "I", "B"])))

jap_set['train'] = jap_set['train'].cast_column("IOB_tags", datasets.Sequence(datasets.ClassLabel(names=["O", "I", "B"])))
jap_set['validation'] = jap_set['validation'].cast_column("IOB_tags", datasets.Sequence(datasets.ClassLabel(names=["O", "I", "B"])))

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [31]:
print(eng_set['train'].features['IOB_tags'])

Sequence(feature=ClassLabel(names=['O', 'I', 'B'], id=None), length=-1, id=None)


# Sequence labeller

In [32]:
%pip install transformers
%pip install sentencepiece
%pip install fugashi[unidic-lite]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 49.2 MB/s 
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://u

In [33]:
%pip install ipadic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 5.3 MB/s 
[?25hBuilding wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.whl size=13556723 sha256=ee88a9852513cbc9dcb8f4ef913624c5484d4f1726bd81310c1b45eb58990359
  Stored in directory: /root/.cache/pip/wheels/33/8b/99/cf0d27191876637cd3639a560f93aa982d7855ce826c94348b
Successfully built ipadic
Installing collected packages: ipadic
Successfully installed ipadic-1.0.0


In [34]:
from transformers import AutoTokenizer, BertForTokenClassification, BertTokenizer

eng_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

fin_tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

jap_tokenizer = AutoTokenizer.from_pretrained("jurabi/bert-ner-japanese") #"cl-tohoku/bert-base-japanese-v2")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/816k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [35]:
def eng_tokenize_and_align_labels(examples):
    tokenized_inputs = eng_tokenizer(examples["text_tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"IOB_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def fin_tokenize_and_align_labels(examples):
    tokenized_inputs = fin_tokenizer(examples["text_tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"IOB_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def jap_tokenize_and_align_labels(examples):
    tokenized_inputs = jap_tokenizer(examples["text_tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"IOB_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [36]:
tokenized_eng_dataset= eng_set.map(
    eng_tokenize_and_align_labels,
    batched=True,
    remove_columns=eng_set["train"].column_names
)

tokenized_fin_dataset= fin_set.map(
    fin_tokenize_and_align_labels,
    batched=True,
    remove_columns=fin_set["train"].column_names
)

# tokenized_jap_dataset= jap_set.map(
#     jap_tokenize_and_align_labels,
#     batched=True,
#     remove_columns=jap_set["train"].column_names
# )

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [37]:
tokenized_eng_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7389
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7389
    })
})

In [38]:
from transformers import DataCollatorForTokenClassification

eng_data_collator = DataCollatorForTokenClassification(tokenizer=eng_tokenizer)

fin_data_collator = DataCollatorForTokenClassification(tokenizer=fin_tokenizer)

In [39]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

label_names = eng_set['train'].features["IOB_tags"].feature.names
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

num_labels = len(eng_set['train'].features[f'IOB_tags'].feature.names)

eng_model = AutoModelForTokenClassification.from_pretrained("distilbert-base-cased-distilled-squad", id2label=id2label, label2id=label2id)

fin_model = AutoModelForTokenClassification.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1", id2label=id2label, label2id=label2id)

jap_model = AutoModelForTokenClassification.from_pretrained("cl-tohoku/bert-base-japanese-v2", id2label=id2label, label2id=label2id)

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased-distilled-squad were not used when initializing DistilBertForTokenClassification: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased-distilled-squad and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpo

In [40]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

eng_trainer = Trainer(
    model=eng_model,
    args=training_args,
    train_dataset=tokenized_eng_dataset['train'],
    eval_dataset=tokenized_eng_dataset['validation'],
    tokenizer=eng_tokenizer,
    data_collator=eng_data_collator,
)

fin_trainer = Trainer(
    model=fin_model,
    args=training_args,
    train_dataset=tokenized_fin_dataset['train'],
    eval_dataset=tokenized_fin_dataset['validation'],
    tokenizer=fin_tokenizer,
    data_collator=fin_data_collator,
)

jap_trainer = Trainer(
    model=jap_model,
    args=training_args,
    train_dataset=tokenized_fin_dataset['train'],
    eval_dataset=tokenized_fin_dataset['validation'],
    tokenizer=fin_tokenizer,
    data_collator=fin_data_collator,
)

In [41]:
eng_trainer.train()

***** Running training *****
  Num examples = 7389
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1386
  Number of trainable parameters = 64800003
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.03898
2,0.071200,0.027569
3,0.041000,0.023043


***** Running Evaluation *****
  Num examples = 7389
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7389
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7389
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1386, training_loss=0.0491092772710891, metrics={'train_runtime': 912.5609, 'train_samples_per_second': 24.291, 'train_steps_per_second': 1.519, 'total_flos': 1740327952117050.0, 'train_loss': 0.0491092772710891, 'epoch': 3.0})

In [42]:
fin_trainer.train()

In [None]:
jap_trainer.train()

In [43]:
eng_trainer.save_model('./saved_model') #For reuse
fin_trainer.save_model('./saved_model') #For reuse
jap_trainer.save_model('./saved_model') #For reuse

Saving model checkpoint to ./saved_model
Configuration saved in ./saved_model/config.json
Model weights saved in ./saved_model/pytorch_model.bin
tokenizer config file saved in ./saved_model/tokenizer_config.json
Special tokens file saved in ./saved_model/special_tokens_map.json


In [44]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "./saved_model"
eng_token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

loading configuration file ./saved_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "./saved_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "I",
    "2": "B"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B": "2",
    "I": "1",
    "O": "0"
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 28996
}

loading configuration file ./saved_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "./saved_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "atte

In [45]:
print(eng_validation_set['document_plaintext'][6])
print(eng_validation_set['annotations'][6])

Guinness World Records lists Christie as the best-selling novelist of all time. Her novels have sold roughly 2 billion copies, and her estate claims that her works come third in the rankings of the world's most-widely published books,[6] behind only Shakespeare's works and the Bible. According to Index Translationum, she remains the most-translated individual author, having been translated into at least 103 languages.[7] And Then There Were None is Christie's best-selling novel, with 100 million sales to date, making it the world's best-selling mystery ever, and one of the best-selling books of all time.[8] Christie's stage play The Mousetrap holds the world record for longest initial run. It opened at the Ambassadors Theatre in the West End on 25 November 1952, and as of September 2018 is still running after more than 27,000 performances.[9][10]
{'answer_start': array([109]), 'answer_text': array(['2 billion'], dtype=object)}


In [46]:
print(eng_validation_set['que_doc_tokens'][6])

['How', 'many', 'units', 'has', 'Agatha', 'Christie', 'sold', '[CLS]', 'Guinness', 'World', 'Records', 'lists', 'Christie', 'as', 'the', 'best-selling', 'novelist', 'of', 'all', 'time', 'Her', 'novels', 'have', 'sold', 'roughly', '2', 'billion', 'copies', 'and', 'her', 'estate', 'claims', 'that', 'her', 'works', 'come', 'third', 'in', 'the', 'rankings', 'of', 'the', "world's", 'most-widely', 'published', 'books', '[6]', 'behind', 'only', "Shakespeare's", 'works', 'and', 'the', 'Bible', 'According', 'to', 'Index', 'Translationum', 'she', 'remains', 'the', 'most-translated', 'individual', 'author', 'having', 'been', 'translated', 'into', 'at', 'least', '103', 'languages', '[7]', 'And', 'Then', 'There', 'Were', 'None', 'is', "Christie's", 'best-selling', 'novel', 'with', '100', 'million', 'sales', 'to', 'date', 'making', 'it', 'the', "world's", 'best-selling', 'mystery', 'ever', 'and', 'one', 'of', 'the', 'best-selling', 'books', 'of', 'all', 'time', '[8]', "Christie's", 'stage', 'play', 

In [57]:
print(len(eng_token_classifier(eng_validation_set['tokens'][4])))

206


In [48]:
predictions = [ [] ] * len(eng_validation_set)

for i in range(len(eng_validation_set)):
    for elm in eng_token_classifier(eng_validation_set['tokens'][i]):
        if elm == []:
            predictions[i].append(0)
        elif elm[0]['entity_group'] == 'I':
            predictions[i].append(1)
        elif elm[0]['entity_group'] == 'B':
            predictions[i].append(2)
        # elif elm[0]['entity_group'] == 'X':
        #     predictions[i].append(3)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [65]:
test = [ [] ] * 3

for i in range(3):
    for elm in eng_token_classifier(eng_validation_set['tokens'][i]):
        if elm == []:
            test[i].append(0)
        elif elm[0]['entity_group'] == 'I':
            test[i].append(1)
        elif elm[0]['entity_group'] == 'B':
            test[i].append(2)

In [66]:
print(len(test[0]))
print(len(test[1]))
print(len(test[2]))

330
330
330


In [68]:
eng_token_classifier(eng_validation_set['tokens'][0])

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]