In [None]:
 # ILM framework documentation: https://github.com/chrisdonahue/ilm
!pip uninstall -y tensorflow
!pip install git+https://github.com/huggingface/transformers

from google.colab import drive
drive.mount('/content/gdrive')

%cd gdrive/My Drive/ILM/ilm

Found existing installation: tensorflow 2.6.0
Uninstalling tensorflow-2.6.0:
  Successfully uninstalled tensorflow-2.6.0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-b22_l7u2
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-b22_l7u2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 2.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 7.3 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.

In [None]:
# Directory that holds the model weights and `additional_ids_to_tokens.pkl`.
# Alternative values that have been used here (presumably sibling checkpoints - verify):
#   ./sberbank_small
#   ./train_scratch
#   ./rus_gpt2
#   ./scratch_full
#   ./train_scratch_lemma
MODEL_DIR = "./sberbank_small"

# Fully qualified name of the ILM mask class (presumably consumed by later cells - verify).
MASK_CLS = "ilm.mask.hierarchical.MaskHierarchical"

In [None]:
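# Download pretrained model (disabled: MODEL_DIR is always set explicitly, so the
# `MODEL_DIR is None` fallback below would never run; any output shown under this cell
# comes from an earlier run, made while the download was still enabled).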

# if MODEL_DIR is None:
#     !python acl20_repro.py model sto ilm | bash
#     MODEL_DIR = '/tmp/ilm/models/sto_ilm'

File ‘/tmp/ilm/models/sto_ilm/pytorch_model.bin’ already there; not retrieving.
File ‘/tmp/ilm/models/sto_ilm/config.json’ already there; not retrieving.
File ‘/tmp/ilm/models/sto_ilm/additional_ids_to_tokens.pkl’ already there; not retrieving.


In [None]:
# Prepare tokenizer

import os
import pickle

import ilm.tokenize_util

# Name of the tokenizer family; upper-cased and looked up in ilm.tokenize_util.Tokenizer.
tokenizer_name = 'GPT2'
# Vocabulary file passed to set_custom_vocab_fp (presumably replaces the stock GPT-2
# vocabulary - confirm against ilm.tokenize_util).
tokenizer_custom_vocab_fp = 'rus_gpt2/vocab.json'
tokenizer = ilm.tokenize_util.Tokenizer[tokenizer_name.upper()]

# Only meaningful when a custom vocabulary is in use; note that it is applied unconditionally.
ilm.tokenize_util.set_custom_vocab_fp(tokenizer_custom_vocab_fp)

# Load the id -> token table for the extra infilling tokens stored next to the model.
# NOTE(review): pickle.load can execute arbitrary code - only open checkpoints you trust.
extra_tokens_path = os.path.join(MODEL_DIR, 'additional_ids_to_tokens.pkl')
with open(extra_tokens_path, 'rb') as pkl_file:
    additional_ids_to_tokens = pickle.load(pkl_file)

# Inverse mapping (token -> id), used to look special tokens up by name.
additional_tokens_to_ids = dict(
    (token, token_id) for token_id, token in additional_ids_to_tokens.items()
)

# Re-running this cell calls update_tokenizer again; a ValueError is treated as
# "the tokenizer was already extended" (assumed from the message below - confirm).
try:
    ilm.tokenize_util.update_tokenizer(additional_ids_to_tokens, tokenizer)
except ValueError:
    print('Already updated')
print(additional_tokens_to_ids)

Tokenizer is in _TOKENIZER_TO_STATE
Tokenizer is GPT2
<ilm.official_gpt2_encoder.encoder.Encoder object at 0x7faa8c87c390>
vocab_size_before 50257 vocab_size_after 50264 difference 7
{'<|startofinfill|>': 50257, '<|endofinfill|>': 50258, '<|infill_document|>': 50259, '<|infill_paragraph|>': 50260, '<|infill_sentence|>': 50261, '<|infill_ngram|>': 50262, '<|infill_word|>': 50263}


In [None]:
# Load model

import torch
from transformers import GPT2LMHeadModel

# Run on the GPU when torch can see one, otherwise on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Load the checkpoint from MODEL_DIR, switch it to evaluation mode and move it to the
# chosen device. Both eval() and to() return the module itself, so the chain is safe,
# and ending on an assignment keeps the cell from echoing the model repr.
model = GPT2LMHeadModel.from_pretrained(MODEL_DIR).eval().to(device)

In [None]:
# Create context

def replace_blanks(token_ids, blank_ids, infill_id):
    """Return a new list in which every occurrence of ``blank_ids`` is replaced by ``infill_id``.

    The input is scanned once from left to right; each non-overlapping run of tokens
    equal to ``blank_ids`` collapses into the single ``infill_id`` token. The blank may
    encode to any number of tokens (one, two, or more), so the helper keeps working
    when the tokenizer or vocabulary changes.

    Args:
        token_ids: sequence of token ids for the whole context.
        blank_ids: non-empty sequence of token ids that spells one blank marker.
        infill_id: token id to put in place of each blank.

    Returns:
        A new list of token ids; ``token_ids`` itself is not modified.

    Raises:
        ValueError: if ``blank_ids`` is empty.
    """
    blank_ids = list(blank_ids)
    if not blank_ids:
        raise ValueError('blank_ids must contain at least one token id')
    token_ids = list(token_ids)
    width = len(blank_ids)
    replaced = []
    pos = 0
    while pos < len(token_ids):
        if token_ids[pos:pos + width] == blank_ids:
            replaced.append(infill_id)
            pos += width
        else:
            replaced.append(token_ids[pos])
            pos += 1
    return replaced


# Each " _" (underscore preceded by a space) marks a span the model should fill in.
context = "В этом _ мы изучали проблему _.".strip()

# Special token substituted for every blank; it must be a key of additional_tokens_to_ids
# (for example '<|infill_word|>' or '<|infill_sentence|>').
infill_type = '<|infill_ngram|>'

context_ids = ilm.tokenize_util.encode(context, tokenizer)
_blank_id = ilm.tokenize_util.encode(" _", tokenizer)
print(context_ids)

# Swap the encoded blank markers for the chosen infill token and show the result.
context_ids = replace_blanks(context_ids, _blank_id, additional_tokens_to_ids[infill_type])
print(ilm.tokenize_util.decode(context_ids, tokenizer))

[677, 803, 225, 67, 656, 45166, 10784, 225, 67, 18]
В этом<|infill_ngram|> мы изучали проблему<|infill_ngram|>.


In [None]:
from ilm.infer import infill_with_ilm

# https://github.com/chrisdonahue/ilm/blob/master/ilm/infer.py#L49

# Ask the model for ten alternative completions of the blanks in context_ids.
generated = infill_with_ilm(
    model, additional_tokens_to_ids, context_ids, num_infills=10
)

# Print each completed token sequence as text, preceded by an 80-character rule.
separator = '-' * 80
for g in generated:
    print(separator)
    decoded_text = ilm.tokenize_util.decode(g, tokenizer)
    print(decoded_text)

--------------------------------------------------------------------------------
В этом случае мы изучали проблему эффективности.
--------------------------------------------------------------------------------
В этом понимании мы изучали проблему диалектики.
--------------------------------------------------------------------------------
В этом последнем мы мы изучали проблему становления гражданского общества.
--------------------------------------------------------------------------------
В этом смысле многие мы изучали проблему экономики электроэнергетики.
--------------------------------------------------------------------------------
В этом его умении выразительно мы изучали проблему вербализации русского языка.
--------------------------------------------------------------------------------
В этом случае выделение прав на самоопределение вообще не мы изучали проблему правового статуса.
--------------------------------------------------------------------------------
В этом контек