In [8]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

In [67]:
model_type = 'bert-base-uncased' #albert-base-v1, bert-base-cased, bert-base-uncased
data_path = "/content/"

with open(data_path + 'dev_texts.txt', 'r', encoding='utf-8') as f:
    train_text = f.readlines()



In [68]:
datasets = train_text 

In [69]:
[len(ds) for ds in datasets]

[23373, 7374, 13116, 6654, 17521, 6064, 19131, 3231]

In [73]:
def clean_text(text):
    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    
    reg = "(?<=[a-zA-Z])-(?=[a-zA-Z]{2,})"
    r = re.compile(reg, re.DOTALL)
    text = r.sub(' ', text)
    
    text = re.sub(r'\s-\s', ' , ', text)
    
#     text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')
  
    text = text.replace('"', ',')

    text = re.sub(r'--\s?--', '', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r',\s?,', ',', text)
    text = re.sub(r',\s?\.', '.', text)
    text = re.sub(r'\?\s?\.', '?', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r'\s+\?', '?', text)
    text = re.sub(r'\s+,', ',', text)
    text = re.sub(r'\.[\s+\.]+', '. ', text)
    text = re.sub(r'\s+\.', '.', text)
    
    return text.strip().lower()

In [74]:
datasets = [[clean_text(text) for text in ds] for ds in datasets]

In [75]:
[len([t for t in ds if len(t)>0]) for ds in datasets]

[19268, 6044, 10763, 5457, 14315, 4943, 15578, 2644]

In [76]:
[len(' '.join(ds).split(' ')) for ds in datasets]

[23373, 7374, 13116, 6654, 17521, 6064, 19131, 3231]

In [77]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [78]:
target_ids = tokenizer.encode("./,")[1:-1]
target_ids

[1012, 1013, 1010]

In [79]:

target_token2id = {t: tokenizer.encode(t)[-2] for t in "./,"}
target_token2id

{',': 1010, '.': 1012, '/': 1013}

In [80]:
target_ids = list(target_token2id.values())
target_ids

[1012, 1013, 1010]

In [81]:
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    encoded_words, targets = [], []
    
    words = text.split(' ')

    for word in words:
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)
        targets.append(target)
        
#         print([tokenizer._convert_id_to_token(ew) for ew in encoded_word], target)
        assert(len(encoded_word)>0)

    encoded_words = [tokenizer.cls_token_id or tokenizer.bos_token_id] +\
                    encoded_words +\
                    [tokenizer.sep_token_id or tokenizer.eos_token_id]
    targets = [-1] + targets + [-1]
    
    return encoded_words, targets

In [82]:
s = "Tyranosaurus: kill me? Not enough/ said the co-pilot -- ..."
print(s)
s = clean_text(s)
print(s)
data, targets = create_target(s)
print(targets)
[tokenizer._convert_id_to_token(d) for d in data[1:-1]]

Tyranosaurus: kill me? Not enough/ said the co-pilot -- ...
tyranosaurus, kill me? not enough/ said the co pilot,
[-1, -1, -1, 3, 0, -1, 0, 0, 2, 0, 0, 0, 3, -1]


['ty',
 '##rano',
 '##saurus',
 'kill',
 'me',
 '?',
 'not',
 'enough',
 'said',
 'the',
 'co',
 'pilot']

In [83]:

encoded_texts, targets = [], []

for ds in datasets:
    x = list(zip(*(create_target(ts) for ts in tqdm(ds))))
    encoded_texts.append(x[0])
    targets.append(x[1])

  0%|          | 0/23373 [00:00<?, ?it/s]

AssertionError: ignored

In [44]:
for te, ta in zip(encoded_texts[0][0], targets[0][0]):
    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")

IndexError: ignored

In [47]:
import os
import xml.etree.ElementTree as ET
import re

In [61]:
data_path = "/content/"
file_path = data_path + "IWSLT12.TALK.dev2010.en-fr.en.xml"

In [62]:
xmlp = ET.XMLParser(encoding="utf-8")
tree = ET.parse(file_path, parser=xmlp)
root = tree.getroot()

In [63]:
for child in root:
    print(child.tag, child.attrib)

srcset {'setid': 'iwslt2012-dev2010', 'srclang': 'english'}


In [64]:

docs = []

for doc_id in range(len(root[0])):
    doc_segs = []
    doc = root[0][doc_id]
    for seg in doc.iter('seg'):
        doc_segs.append(seg.text)
    docs.append(doc_segs)

In [65]:
dev_texts = [re.sub(r'\s+', ' ', ''.join(d)).strip() for d in docs]

In [66]:
with open(data_path + 'dev_texts.txt', 'w', encoding='utf-8') as f:
    for text in dev_texts:
        f.write(text + '\n')