In [1]:
import collections
import logging

import torch
from transformers import AutoConfig, GPT2LMHeadModel, GPT2Tokenizer, set_seed

# Prompt delimiters: generate() builds its prompt as start + title + sep.
start = "<|startoftext|> "
sep = " <|sep|>"



def dict2obj(d):
    """Recursively turn nested dictionaries into attribute-style objects.

    Dictionaries become instances of a throwaway ``Class`` whose attributes
    are the dictionary keys; lists are converted element by element; any
    other value is returned unchanged.
    """
    if isinstance(d, list):
        return [dict2obj(item) for item in d]
    if not isinstance(d, dict):
        return d

    class Class:
        pass

    obj = Class()
    # Write straight into the instance dict so every key is stored verbatim.
    vars(obj).update((key, dict2obj(value)) for key, value in d.items())
    return obj


def adjust_length_to_model(length, max_sequence_length):
    """Clamp the requested generation length to what the model supports.

    A negative ``length`` means "no explicit request": it resolves to
    ``max_sequence_length`` when that is positive, otherwise to the
    module-level ``MAX_LENGTH``. A non-negative ``length`` is capped at
    ``max_sequence_length`` when that limit is positive, else kept as is.
    """
    model_has_limit = max_sequence_length > 0
    if length < 0:
        return max_sequence_length if model_has_limit else MAX_LENGTH
    if model_has_limit and length > max_sequence_length:
        return max_sequence_length
    return length


def _truncate_at_stop_token(text, stop_token):
    """Return ``text`` up to the first ``stop_token``; unchanged if it is absent or falsy."""
    if not stop_token:
        return text
    # str.partition returns the whole string when the separator is missing,
    # unlike slicing with str.find, whose -1 would drop the last character.
    return text.partition(stop_token)[0]


def generate(args, tokenizer, model, prompt):
    """Sample one continuation for ``prompt`` and return it with the prompt prepended.

    The prompt is formatted as ``start + prompt.strip() + sep`` (module-level
    delimiters), encoded, and passed to ``model.generate`` with sampling on.

    Args:
        args: object with ``length``, ``device``, ``temperature``, ``k``,
            ``p``, ``repetition_penalty`` and ``stop_token`` attributes.
            ``args.length`` is overwritten with the model-adjusted length.
        tokenizer: object providing ``encode`` and ``decode``.
        model: object providing ``generate`` and
            ``config.max_position_embeddings``.
        prompt: raw prompt text (a title); surrounding whitespace is stripped.

    Returns:
        The formatted prompt followed by the generated text, cut at the first
        occurrence of ``args.stop_token`` if one is present.
    """
    max_positions = model.config.max_position_embeddings
    args.length = adjust_length_to_model(
        args.length, max_sequence_length=max_positions
    )
    prompt_text = start + prompt.strip() + sep
    encoded_prompt = tokenizer.encode(
        prompt_text, add_special_tokens=False, return_tensors="pt"
    )
    encoded_prompt = encoded_prompt.to(args.device)

    # Prompt tokens count towards max_length, so keep prompt + continuation
    # within the number of positions the model reports it supports.
    max_length = args.length + len(encoded_prompt[0])
    if max_positions > 0:
        max_length = min(max_length, max_positions)

    input_ids = None if encoded_prompt.size()[-1] == 0 else encoded_prompt
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=args.temperature,
        top_k=args.k,
        top_p=args.p,
        repetition_penalty=args.repetition_penalty,
        do_sample=True,
        num_return_sequences=1,
    )
    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    # Only the first sequence is returned, so only that one is decoded.
    text = tokenizer.decode(
        output_sequences[0].tolist(), clean_up_tokenization_spaces=True
    )
    # Remove all text after the stop token (keep everything if it never appears)
    text = _truncate_at_stop_token(text, args.stop_token)

    # Swap the decoded prompt prefix for the exact prompt text that was fed in.
    decoded_prompt_length = len(
        tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)
    )
    return prompt_text + text[decoded_prompt_length:]


# Logging: timestamped INFO-level records on the root logger
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)
logger = logging.getLogger(__name__)

# Fallback generation length used by adjust_length_to_model() when neither the
# caller nor the model supplies a positive limit (avoids unbounded generation).
MAX_LENGTH = 10_000

# Define model class
# Maps a model_type string to the (model class, tokenizer class) pair used to load it.
MODEL_CLASSES = {"gpt2": (GPT2LMHeadModel, GPT2Tokenizer)}

# Generation arguments
# A plain dict is enough here: the values are only read back through dict2obj().
args = dict(
    model_type="gpt2",
    model_name_or_path='model_ml_full/checkpoint-80000',
    prompt="",
    length=512,
    stop_token="<|endoftext|>",
    temperature=1.0,
    repetition_penalty=1.0,
    k=0,
    p=0.97,  # use nucleus sampling
    seed=42,
    no_cuda=False,
    num_return_sequences=1,
)

# Honour the no_cuda switch when picking the device; it was declared above but
# previously never consulted, so setting it to True had no effect.
use_cuda = torch.cuda.is_available() and not args["no_cuda"]
args["device"] = torch.device("cuda" if use_cuda else "cpu")
args["n_gpu"] = 0 if args["no_cuda"] else torch.cuda.device_count()

# Convert dict to object
args = dict2obj(args)

# Seed the random number generators so sampling is repeatable
set_seed(args.seed)

# Resolve the checkpoint location and the classes registered for this model type
args.model_type = args.model_type.lower()
checkpoint_path = args.model_name_or_path
config_class = AutoConfig.from_pretrained(checkpoint_path, cache_dir=None)
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

# Load the tokenizer stored alongside the checkpoint
tokenizer = tokenizer_class.from_pretrained(
    checkpoint_path,
    from_tf=".ckpt" in checkpoint_path,
    config=config_class,
    cache_dir=None,
)

# Load the model weights and move them to the selected device
model = model_class.from_pretrained(checkpoint_path)
model.to(args.device)






GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [2]:
from transformers import AutoTokenizer, AutoConfig

tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Add special tokens. They are added one call at a time, in this order, so the
# ids they receive stay the same as before.
tokenizer.add_special_tokens({"sep_token": sep})
tokenizer.add_special_tokens({"bos_token": start})

if tokenizer.pad_token is None:
    print("no pad token")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("pad token added!")

# Resize only after ALL special tokens (including the pad token) are registered.
# Resizing earlier left the embedding matrix one row smaller than the tokenizer
# vocabulary, so the pad token id had no embedding row.
# NOTE(review): assumes the checkpoint was trained with these tokens added in
# the same order - confirm against the training script.
model.resize_token_embeddings(len(tokenizer))

# Persist the tokenizer next to the checkpoint, then reload it from there.
tokenizer.save_pretrained('model_ml_full/checkpoint-80000')

tokenizer = AutoTokenizer.from_pretrained('model_ml_full/checkpoint-80000')

Using pad_token, but it is not set yet.


no pad token
pad token added!


In [None]:
import os
from tqdm import tqdm


end = "<|endoftext|>"
# Generate
path = "test_classifier/ml/"
count = 0
with open(os.path.join(path, "all_titles.txt"), "r") as titles:
    titlelines = titles.readlines()

# Generate at most 7500 abstracts, but never index past the end of the title
# list (a shorter file used to raise IndexError part-way through the run).
n_titles = min(7500, len(titlelines))

# Label the output with the nucleus-sampling p actually in use; the name was
# hard-coded as "0.9" while args.p is 0.97.
out_name = f"generated_{n_titles}_{args.p}.txt"

# The with-blocks close both files; no explicit close() calls are needed.
with open(os.path.join(path, out_name), "w") as gen:
    for title in tqdm(titlelines[:n_titles]):
        generated = generate(args, tokenizer, model, title)
        gen.write(f"{generated}{end}\n")
        count += 1

print(f"{count} abstracts generation done ")



  0%|          | 0/7500 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/7500 [00:02<5:46:47,  2.77s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/7500 [00:04<4:30:54,  2.17s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/7500 [00:06<4:48:00,  2.31s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/7500 [00:07<3:41:04,  1.77s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 5/7500 [00:09<3:40:34,  1.77s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 6/7500 [00:11<3:24:06,  1.63s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 7/7500 [00:11<2:45:45,  1.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 8/7500 [00:14<3:3

In [None]:
# `generated_sequence` only exists inside generate(), so printing it here raised
# NameError. Show the most recent result produced at notebook scope instead.
print(generated)

In [3]:
import os
end = "<|endoftext|>"
# Generate a single abstract for one hand-picked title
generated = generate(
    args,
    tokenizer,
    model,
    "BERTSCORE: EVALUATING TEXT GENERATION WITH BERT",
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [4]:
# Show the full generated sequence (formatted prompt followed by the sampled continuation).
print(generated)

<|startoftext|> BERTSCORE: EVALUATING TEXT GENERATION WITH BERT <|sep|>   We introduce BERT, a transformer-based transformer-based text generation architecture that is able to output sentences with the intention of generating balanced natural language utterances, almost always using single or both evaluated cases. BERT is a powerful yet automated transformer-based text generator based on BERT and has large community compared to ELMoGenerator and ELMoPlain R-models. Furthermore, including a reinforced version of BERT in a two-stage model consists of a supplementary processing block for detecting and handling missing clauses that only have metadata about the original corpus. This further helps to justify replacing references and redundant sentences in the considered sentence generation stage. We find that by combining the pre-processing and improving the enhancement network, BERT's output over ELMoGenerator is able to consistently improve the quality of natural-language extraction system

In [None]:
import os

path = "test_classifier/hep/"
end = "<|endoftext|>"

# Read every generated line up front; the with-block closes the file itself,
# so no explicit close() is needed afterwards.
with open(os.path.join(path, "generated_1000.txt"), "r") as gen:
    lines = gen.readlines()
count = len(lines)

# Rewrite each line stripped of surrounding whitespace with the end-of-text
# marker appended, one record per line.
with open(os.path.join(path, "generated_1000_new.txt"), "w") as new:
    new.writelines(f"{line.strip()}{end}\n" for line in lines)

print(count)

In [None]:
import os
# Count the lines in the generated file. The handle is deliberately not named
# `all`, which would shadow the builtin for the rest of the kernel session;
# the with-block closes it, so no explicit close() is needed.
with open("test_classifier/ml/generated_7500_0.97.txt", "r") as generated_file:
    lines = generated_file.readlines()
    print(len(lines))