In [34]:
import os
import xml.etree.ElementTree as ET
import re

In [35]:
data_path = "/content/"
file_path = data_path + "IWSLT12.TALK.dev2010.en-fr.en.xml"

In [36]:

xmlp = ET.XMLParser(encoding="utf-8")
tree = ET.parse(file_path, parser=xmlp)
root = tree.getroot()

In [37]:
for child in root:
    print(child.tag, child.attrib)

srcset {'setid': 'iwslt2012-dev2010', 'srclang': 'english'}


In [38]:
docs = []

for doc_id in range(len(root[0])):
    doc_segs = []
    doc = root[0][doc_id]
    for seg in doc.iter('seg'):
        doc_segs.append(seg.text)
    docs.append(doc_segs)

In [39]:
dev_texts = [re.sub(r'\s+', ' ', ''.join(d)).strip() for d in docs]

In [40]:
with open(data_path + 'dev_texts.txt', 'w', encoding='utf-8') as f:
    for text in dev_texts:
        f.write(text + '\n')

In [41]:
file_path = data_path + "IWSLT12.TED.MT.tst2012.en-fr.en.xml"

xmlp = ET.XMLParser(encoding="utf-8")
tree = ET.parse(file_path, parser=xmlp)
root = tree.getroot()

In [42]:
docs = []

for doc_id in range(len(root[0])):
    doc_segs = []
    doc = root[0][doc_id]
    for seg in doc.iter('seg'):
        doc_segs.append(seg.text)
    docs.append(doc_segs)

test_texts_2012 = [re.sub(r'\s+', ' ', ''.join(d)).strip() for d in docs]

with open(data_path + 'test_texts_2012.txt', 'w', encoding='utf-8') as f:
    for text in test_texts_2012:
        f.write(text + '\n')


In [49]:

file_path = data_path + "train.tags.en-fr.en"
xmlp = ET.XMLParser(encoding="UTF-8")
tree = ET.parse(file_path, parser=xmlp)
root = tree.getroot()

In [50]:
docs = []

for doc in root.iter('transcript'):
    docs.append(doc.text)

In [51]:
train_texts = [re.sub(r'\s+', ' ', d.replace('\n', ' ')).strip() for d in docs]

with open(data_path + 'train_texts.txt', 'w', encoding='utf-8') as f:
    for text in train_texts:
        f.write(text + '\n')

In [52]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

In [63]:
model_type = 'bert-base-uncased' #albert-base-v1, bert-base-cased, bert-base-uncased
data_path = "/content/"

with open(data_path + 'data1.txt', 'r', encoding='utf-8') as f:
    train_text = f.readlines()
with open(data_path + 'dev_texts.txt', 'r', encoding='utf-8') as f:
    valid_text = f.readlines()
with open(data_path + 'test_texts_2012.txt', 'r', encoding='utf-8') as f:
    test_text = f.readlines()

In [64]:
datasets = train_text, valid_text, test_text

In [65]:
[len(ds) for ds in datasets]

[2205, 8, 11]

In [83]:
def clean_text(text):
    text = text.replace('!', '/')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    text = text.replace('?', '/')
    reg = "(?<=[a-zA-Z])-(?=[a-zA-Z]{2,})"
    r = re.compile(reg, re.DOTALL)
    text = r.sub(' ', text)
    
    text = re.sub(r'\s-\s', ' , ', text)
    
#     text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')
    text = text.replace('.\"', ',')
    text = text.replace('"', ',')
    text = re.sub(r'--\s?--', '', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r',\s?,', ',', text)
    text = re.sub(r',\s?\.', '.', text)
    text = re.sub(r'\?\s?\.', '?', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r'\s+\?', '?', text)
    text = re.sub(r'\s+,', ',', text)
    text = re.sub(r'\.[\s+\.]+', '. ', text)
    text = re.sub(r'\s+\.', '.', text)
    
    return text.strip().lower()

In [84]:
datasets = [[clean_text(text) for text in ds] for ds in datasets]

In [85]:
[len([t for t in ds if len(t)>0]) for ds in datasets]

[2205, 8, 11]

In [86]:
[len(' '.join(ds).split(' ')) for ds in datasets]

[31261, 17347, 18477]

In [87]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [88]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [96]:
target_ids = tokenizer.encode("./,")[1:-1]
target_ids

[1012, 1013, 1010]

In [97]:
target_token2id = {t: tokenizer.encode(t)[-2] for t in "./,"}
target_token2id

{',': 1010, '.': 1012, '/': 1013}

In [98]:
target_ids = list(target_token2id.values())
target_ids

[1012, 1013, 1010]

In [99]:
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    encoded_words, targets = [], []
    
    words = text.split(' ')

    for word in words:
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)
        targets.append(target)
        
#         print([tokenizer._convert_id_to_token(ew) for ew in encoded_word], target)
        

    encoded_words = [tokenizer.cls_token_id or tokenizer.bos_token_id] +\
                    encoded_words +\
                    [tokenizer.sep_token_id or tokenizer.eos_token_id]
    targets = [-1] + targets + [-1]
    
    return encoded_words, targets

In [100]:
s = "Tyranosaurus: kill me? Not enough,/ -- said the co-pilot -- ..."
print(s)
s = clean_text(s)
print(s)
data, targets = create_target(s)
print(targets)
[tokenizer._convert_id_to_token(d) for d in data[1:-1]]

Tyranosaurus: kill me? Not enough,/ -- said the co-pilot -- ...
tyranosaurus, kill me/ not enough,/, said the co pilot,
[-1, -1, -1, 3, 0, 2, 0, -1, -1, 3, 0, 0, 0, 3, -1]


['ty',
 '##rano',
 '##saurus',
 'kill',
 'me',
 'not',
 'enough',
 ',',
 '/',
 'said',
 'the',
 'co',
 'pilot']

In [101]:
encoded_texts, targets = [], []

for ds in datasets:
    x = list(zip(*(create_target(ts) for ts in tqdm(ds))))
    encoded_texts.append(x[0])
    targets.append(x[1])

  0%|          | 0/2205 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [102]:
for te, ta in zip(encoded_texts[0][0], targets[0][0]):
    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")

[CLS]          	-1
finally        	0
tonight        	3
it             	-1
'              	-1
s              	0
prom           	0
season         	0
and            	2
steve          	0
hartman        	0
met            	0
the            	0
queen          	0
and            	0
her            	0
king           	0
on             	3
the            	2
road           	0
[SEP]          	0


In [103]:
os.makedirs(data_path + model_type, exist_ok=True)

for i, name in enumerate(('train', 'valid', 'test')):
    with open(data_path + f'{model_type}/{name}_data.pkl', 'wb') as f:
        pickle.dump((encoded_texts[i], targets[i]), f)

In [104]:
from collections import Counter

for ds_targets in targets:
    c = Counter((target for t in ds_targets for target in t))
    print('\t'.join([str(c[i]) for i in (1,2,3,0,-1)]))

3	5661	1496	24101	6904
6	974	1225	15142	1928
9	1138	1120	16210	2129


In [105]:
e = []
i = 0

raw_words = datasets[1][2].split(' ')

for te, ta in zip(encoded_texts[1][2], targets[1][2]):
    if ta == -1:
        e.append(te)
    else:
        e.append(te)
        print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t{raw_words[i]}")
        e = []
        i += 1
print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t")

[CLS] you      	[PAD]     	you
know           	,         	know,
i've           	[PAD]     	i've
talked         	[PAD]     	talked
about          	[PAD]     	about
some           	[PAD]     	some
of             	[PAD]     	of
these          	[PAD]     	these
projects       	[PAD]     	projects
before         	,         	before,
about          	[PAD]     	about
the            	[PAD]     	the
human          	[PAD]     	human
genome         	[PAD]     	genome
and            	[PAD]     	and
what           	[PAD]     	what
that           	[PAD]     	that
might          	[PAD]     	might
mean           	,         	mean,
and            	[PAD]     	and
discovering    	[PAD]     	discovering
new            	[PAD]     	new
sets           	[PAD]     	sets
of             	[PAD]     	of
genes          	/         	genes/
we're          	[PAD]     	we're
actually       	[PAD]     	actually
starting       	[PAD]     	starting
at             	[PAD]     	at
a              	[PAD]     	a
new            	[P

OverflowError: ignored

In [106]:
print(tokenizer.decode(encoded_texts[1][2]))

[CLS] you know i've talked about some of these projects before about the human genome and what that might mean and discovering new sets of genes we're actually starting at a new point we've been digitizing biology and now we're trying to go from that digital code into a new phase of biology with designing and synthesizing life so we've always been trying to ask big questions /, what is life / is something that i think many biologists have been trying to understand at various levels we've tried various approaches paring it down to minimal components we've been digitizing it now for almost 20 years when we sequenced the human genome it was going from the analog world of biology into the digital world of the computer now we're trying to ask can we regenerate life or can we create new life out of this digital universe this is the map of a small organism mycoplasma genitalium that has the smallest genome for a species that can self replicate in the laboratory and we've been trying to just s

In [107]:
cd /content/drive/MyDrive/src/neural_punctuator

/content/drive/MyDrive/src/neural_punctuator


In [108]:
cd trainers/

/content/drive/MyDrive/src/neural_punctuator/trainers


In [110]:
!python BertPunctuatorTrainer.py

Traceback (most recent call last):
  File "BertPunctuatorTrainer.py", line 6, in <module>
    from neural_punctuator.base.BaseTrainer import BaseTrainer
ModuleNotFoundError: No module named 'neural_punctuator'


In [32]:
cd /content/drive/MyDrive/src

/content/drive/MyDrive/src


In [2]:
pip install dotmap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
pip install scikit-learn==0.24.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.24.2
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.2 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.24.2 which is incompatible.[0m
Successfully installed scikit-learn-0.24.2


In [10]:
!python main.py

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-07-07 06:33:38,777 INFO      Epoch #0
100% 17/17 [00:09<00:00,  1.72it/s, loss=0.615, grads=1.41]
100% 9/9 [00:01<00:00,  8.16it/s]
              prec

In [3]:
import pickle
import torch
from neural_punctuator.utils.data import get_config_from_yaml
import numpy as np
import re

torch.manual_seed(69)
np.random.seed(69)

data_path = "/content/drive/MyDrive/src/neural_punctuator/dataset/"

In [18]:
config = get_config_from_yaml('/content/drive/MyDrive/src/neural_punctuator/configs/config-bert-base-uncased-unfreeze.yaml')
config.trainer.load_model = "bert-base-uncased-epoch-1.pth"

In [5]:
with open(data_path + 'test_texts_2012.txt', 'r', encoding='utf-8') as f:
    raw_text = f.readlines()

In [6]:
print(raw_text[:1000])
print()

['You know, cadaver dissection is the traditional way of learning human anatomy. For students, it\'s quite an experience, but for a school, it could be very difficult or expensive to maintain. So we learned the majority of anatomic classes taught, they do not have a cadaver dissection lab. Maybe those reasons, or depending on where you are, cadavers may not be easily available. So to address this, we developed with a Dr. Brown in Stanford: virtual dissection table. So we call this Anatomage Table. So with this Anatomage Table, students can experience the dissection without a human cadaver. And the table form is important, and since it\'s touch-interactive, just like the way they do dissections in the lab, or furthermore just the way a surgeon operates on a patient you can literally interact with your table. Our digital body is one-to-one life size, so this is exactly the way students will see the real anatomy. I\'m going to do some demonstrations. As you can see, I use my finger to int

In [7]:
def clean_text(text):

    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')

    text = re.sub(r'--\s?--', '', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r'\s+\?', '?', text)
    text = re.sub(r'\s+,', ',', text)
    text = re.sub(r'\.[\s+\.]+', '. ', text)
    
    text = re.sub(r',\s?,', ',', text)
    
    return text.strip().lower()

def remove_punctuation(text):
    text = clean_text(text)
    
    text = text.replace('.', '')
    text = text.replace(',', '')
    text = text.replace('?', '')
    
    return text.lower()

In [8]:
test_test = remove_punctuation(raw_text[0])
print(test_test[:1000])
print()
print(raw_text[0][:1000])

you know cadaver dissection is the traditional way of learning human anatomy for students it's quite an experience but for a school it could be very difficult or expensive to maintain so we learned the majority of anatomic classes taught they do not have a cadaver dissection lab maybe those reasons or depending on where you are cadavers may not be easily available so to address this we developed with a dr brown in stanford virtual dissection table so we call this anatomage table so with this anatomage table students can experience the dissection without a human cadaver and the table form is important and since it's touchinteractive just like the way they do dissections in the lab or furthermore just the way a surgeon operates on a patient you can literally interact with your table our digital body is onetoone life size so this is exactly the way students will see the real anatomy i'm going to do some demonstrations as you can see i use my finger to interact with my digital body i'm goi

In [25]:
pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 18.9 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [26]:
pip install sacremoses

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 10.3 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=a8127570acbe22835ce24a4f0d9744522862b0bc638c96e6f59d135ffdd434f5
  Stored in directory: /root/.cache/pip/wheels/87/39/dd/a83eeef36d0bf98e7a4d1933a4ad2d660295a40613079bafc9
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53


In [2]:
cd /content/drive/MyDrive/src/

/content/drive/MyDrive/src


In [11]:
from neural_punctuator.models.BertPunctuator import BertPunctuator
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')
model = BertPunctuator(config)

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [74]:
device = torch.device('cuda:0')
torch.cuda.set_device(device)

In [75]:
model.to(device);

In [76]:
from neural_punctuator.utils.io import load
load(model, None, config)

In [77]:
tokenizer.decode(0)

'[PAD]'

In [78]:
encoded = tokenizer.encode(test_test, padding=True, pad_to_multiple_of=512)
len(encoded)

1024

In [79]:
x = torch.tensor(encoded).view(-1, 512).to(device)
x.shape

torch.Size([2, 512])

In [80]:
x.size()

torch.Size([2, 512])

In [102]:
output = model(x)


In [104]:

output.shape

AttributeError: ignored

In [105]:


output = output.argmax(-1)[0].detach().cpu()

AttributeError: ignored

In [106]:

output

(tensor([[[-1.2536, -1.3449, -1.4188, -1.5514],
          [-1.5243, -1.1447, -1.2355, -1.7533],
          [-1.3753, -1.4654, -1.2468, -1.4748],
          ...,
          [-1.4809, -1.2118, -1.2945, -1.6051],
          [-1.4391, -1.3773, -1.2205, -1.5347],
          [-1.3244, -1.3735, -1.3114, -1.5541]],
 
         [[-1.2006, -1.3172, -1.6269, -1.4501],
          [-1.2367, -1.4653, -1.3346, -1.5355],
          [-1.4051, -1.4550, -1.1497, -1.5870],
          ...,
          [-1.4385, -1.4595, -1.4340, -1.2310],
          [-1.3288, -1.4757, -1.4116, -1.3362],
          [-1.3231, -1.3754, -1.4331, -1.4172]]], device='cuda:0',
        grad_fn=<LogSoftmaxBackward0>), tensor([[[0.5228],
          [0.5283],
          [0.4878],
          ...,
          [0.5424],
          [0.5856],
          [0.5228]],
 
         [[0.4838],
          [0.5265],
          [0.5156],
          ...,
          [0.5387],
          [0.5054],
          [0.5122]]], device='cuda:0', grad_fn=<SigmoidBackward0>))

In [26]:
id2target = {-1: 0,
              9: 1, # .
              60: 2, 
              15: 3,  # ,
              -2: -1, # will be masked
             }
target2id = {value: key for key, value in id2target.items()}

In [100]:
generated_output = []

for token_id, target in zip(encoded, output):
    generated_output.append(token_id)
    
    token = tokenizer._convert_id_to_token(token_id)
    if token == '<pad>':
        break
    if target > 0:
        target_id = target2id[target.item()]
        target_token = tokenizer._convert_id_to_token(target_id)
        
        generated_output.append(target_id)
    else:
        target_token = ""
    print(token, target_token)

TypeError: ignored

In [None]:
tokenizer.decode(generated_output[1:])