**Link to Google Drive**

In [1]:
# Mount Google Drive into the Colab filesystem so the training text stored
# there can be read like a local file. force_remount=True re-mounts even when
# a mount from an earlier run of this cell is still present.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive/


**Install Repositories**

In [2]:
!git clone "https://github.com/jyanivaddi/ERA_V1.git"
!git -C ERA_V1 pull
!git clone "https://github.com/jyanivaddi/dl_hub.git"
!git -C dl_hub pull
!git pull

!pip install --quiet "torchinfo" "seaborn" "pytorch-lightning" "torchmetrics" "lightning-bolts" "torchtext" "datasets" "tokenizers" "transformers"

Cloning into 'ERA_V1'...
remote: Enumerating objects: 1479, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 1479 (delta 24), reused 28 (delta 4), pack-reused 1407[K
Receiving objects: 100% (1479/1479), 201.34 MiB | 11.18 MiB/s, done.
Resolving deltas: 100% (716/716), done.
Already up to date.
Cloning into 'dl_hub'...
remote: Enumerating objects: 577, done.[K
remote: Counting objects: 100% (329/329), done.[K
remote: Compressing objects: 100% (139/139), done.[K
remote: Total 577 (delta 212), reused 293 (delta 183), pack-reused 248[K
Receiving objects: 100% (577/577), 160.44 KiB | 6.17 MiB/s, done.
Resolving deltas: 100% (356/356), done.
Already up to date.
fatal: not a git repository (or any of the parent directories): .git
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.7/727.7 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m764.

In [3]:
import sys

# Put the cloned repositories on the module search path (in this order) so
# the `dl_hub` imports further down can be resolved.
sys.path.extend(("ERA_V1/session_17", "dl_hub"))

**Imports**

In [4]:
import torch.nn.functional as F
from collections import Counter
from os.path import exists
import torch.optim as optim
import torch.nn as nn
import numpy as np
import torch
import math
import re
from dl_hub.transformer_models.BERT_utils import SentencesDataset, get_batch
from dl_hub.transformer_models.transformer_models import BERT


**Config Parameters**

In [5]:
# =============================================================================
# Configuration: every tunable value used by the cells below
# =============================================================================
print('initializing..')

# --- data ---
batch_size = 1024   # samples per optimisation step
seq_len = 20        # sequence length handed to the dataset and the model
n_vocab = 40000     # keep at most this many of the most frequent words

# --- model ---
embed_size = 128
inner_ff_size = 4 * embed_size   # feed-forward width, tied to the embedding size
n_heads = 8
n_code = 8
dropout = 0.1

# DataLoader worker count is currently disabled:
# n_workers = 12

# --- optimizer (unpacked into optim.Adam) ---
optim_kwargs = dict(lr=1e-4, weight_decay=1e-4, betas=(.9, .999))

initializing..


**Load data and build dataloader**

In [6]:
# =============================================================================
# Input: read the corpus, tokenize it, build/load the vocabulary and wrap
# everything in a DataLoader
# =============================================================================
#1) load text
print('loading text...')
pth = '/content/gdrive/MyDrive/Datasets/Bert/training.txt'
# Read inside a context manager so the file handle is closed deterministically.
with open(pth) as text_file:
    # one lower-cased sentence per line of the file
    sentences = text_file.read().lower().split('\n')

#2) tokenize sentences (could also be done during training, or with spaCy / UDPipe)
print('tokenizing sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
# Surround every special character with spaces so it becomes a token of its
# own. The replacement must be a raw string: '\g' is not a valid escape
# sequence in a normal string literal and triggers a warning there.
special_char_re = re.compile(f'[{re.escape(special_chars)}]')
sentences = [special_char_re.sub(r' \g<0> ', s).split(' ') for s in sentences]
# splitting on single spaces leaves empty strings behind; drop them
sentences = [[w for w in s if len(w)] for s in sentences]

#3) create vocab if not already created
print('creating/loading vocab...')
pth = 'vocab.txt'
if not exists(pth):
    words = [w for s in sentences for w in s]
    vocab = Counter(words).most_common(n_vocab) #keep the N most frequent words
    vocab = [w[0] for w in vocab]
    with open(pth, 'w') as vocab_file:
        vocab_file.write('\n'.join(vocab))
else:
    # NOTE: an existing vocab.txt is reused as-is, even if it was produced
    # from a different corpus or a different n_vocab.
    with open(pth) as vocab_file:
        vocab = vocab_file.read().split('\n')

#4) create dataset
print('creating dataset...')
dataset = SentencesDataset(sentences, vocab, seq_len)
# num_workers is not passed, so the DataLoader default is used.
kwargs = {'shuffle':True,  'drop_last':True, 'pin_memory':True, 'batch_size':batch_size}
data_loader = torch.utils.data.DataLoader(dataset, **kwargs)


loading text...
tokenizing sentences...
creating/loading vocab...
creating dataset...


**Define Model**

In [7]:
# =============================================================================
# Model
# =============================================================================
print('initializing model...')
# The model's vocabulary size comes from the dataset, not from n_vocab.
vocab_size = len(dataset.vocab)
model = BERT(n_code, n_heads, embed_size, inner_ff_size, vocab_size, seq_len, dropout).cuda()

# =============================================================================
# Optimizer and loss
# =============================================================================
print('initializing optimizer and loss...')
optimizer = optim.Adam(model.parameters(), **optim_kwargs)
# Targets equal to dataset.IGNORE_IDX do not contribute to the loss.
loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)


initializing model...
initializing optimizer and loss...


**Train Model**

In [8]:
# =============================================================================
# Train
# =============================================================================
print('training...')
print_each = 10       # log every `print_each` iterations
n_iteration = 10000   # total number of optimisation steps
model.train()
batch_iter = iter(data_loader)

for it in range(n_iteration):

    # Fetch the next batch. get_batch also returns the iterator to use on the
    # next call (presumably a fresh one once the loader is exhausted --
    # confirm in dl_hub).
    batch, batch_iter = get_batch(data_loader, batch_iter)

    # Move the masked inputs and their targets to the GPU and run the model.
    masked_input = batch['input'].cuda(non_blocking=True)
    masked_target = batch['target'].cuda(non_blocking=True)
    output = model(masked_input)

    # Cross entropy over all positions: flatten the predictions to
    # (N, last_dim) and the targets to a matching 1-D vector.
    n_classes = output.shape[-1]
    loss = loss_model(output.view(-1, n_classes),
                      masked_target.view(-1, 1).squeeze())

    # Backpropagate, then update the weights.
    loss.backward()
    optimizer.step()

    # Periodic progress report. The value printed as "Δw" is the summed
    # absolute gradient of the embedding weights, so it has to be read before
    # the gradients are cleared below.
    if it % print_each == 0:
        loss_value = np.round(loss.item(), 2)
        grad_abs_sum = round(model.embeddings.weight.grad.abs().sum().item(), 3)
        print('it:', it,
              ' | loss', loss_value,
              ' | Δw:', grad_abs_sum)

    # Clear the gradients so they do not accumulate into the next step.
    optimizer.zero_grad()



training...
it: 0  | loss 10.19  | Δw: 1.291
it: 10  | loss 9.48  | Δw: 0.559
it: 20  | loss 9.23  | Δw: 0.321
it: 30  | loss 9.09  | Δw: 0.249
it: 40  | loss 8.95  | Δw: 0.205
it: 50  | loss 8.77  | Δw: 0.185
it: 60  | loss 8.63  | Δw: 0.166
it: 70  | loss 8.45  | Δw: 0.147
it: 80  | loss 8.3  | Δw: 0.143
it: 90  | loss 8.18  | Δw: 0.142
it: 100  | loss 8.04  | Δw: 0.129
it: 110  | loss 7.85  | Δw: 0.123
it: 120  | loss 7.66  | Δw: 0.121
it: 130  | loss 7.57  | Δw: 0.111
it: 140  | loss 7.45  | Δw: 0.102
it: 150  | loss 7.35  | Δw: 0.099
it: 160  | loss 7.3  | Δw: 0.098
it: 170  | loss 7.09  | Δw: 0.092
it: 180  | loss 7.07  | Δw: 0.089
it: 190  | loss 6.98  | Δw: 0.089
it: 200  | loss 6.92  | Δw: 0.085
it: 210  | loss 6.8  | Δw: 0.086
it: 220  | loss 6.73  | Δw: 0.088
it: 230  | loss 6.69  | Δw: 0.083
it: 240  | loss 6.74  | Δw: 0.08
it: 250  | loss 6.56  | Δw: 0.083
it: 260  | loss 6.56  | Δw: 0.08
it: 270  | loss 6.62  | Δw: 0.085
it: 280  | loss 6.61  | Δw: 0.08
it: 290  | loss 6.

KeyboardInterrupt: ignored

**Results Analysis**

In [9]:
# =============================================================================
# Results analysis: export the first N embedding vectors and their words as
# two TSV files (values.tsv / names.tsv) with one row per word
# =============================================================================
print('saving embeddings...')
# Never request more rows than the vocabulary holds. Otherwise the rvocab
# lookup below fails (or the two files end up with different row counts) for
# small vocabularies. Assumes dataset.rvocab covers indices
# 0..len(dataset.vocab)-1 -- TODO confirm in dl_hub.
N = min(3000, len(dataset.vocab))
embedding_rows = model.embeddings.weight.detach().cpu().numpy()[0:N]
np.savetxt('values.tsv', np.round(embedding_rows, 2), delimiter='\t', fmt='%1.2f')
s = [dataset.rvocab[i] for i in range(N)]
# Write inside a context manager so the file is flushed and closed.
with open('names.tsv', 'w') as names_file:
    names_file.write('\n'.join(s))


print('end')


saving embeddings...
end
