# GPT2

In [1]:
import os
import time
import datetime
# from google.colab import drive

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

from datasets import load_dataset
import torch
from tqdm import tqdm


# Directory holding the saved GPT-2 checkpoint and its tokenizer files.
output_dir = './model_save_GPT/'

# Pick the device once, up front, and fall back to CPU when CUDA is not
# available instead of crashing inside `.cuda()`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained(output_dir).to(device)

tokenizer = GPT2Tokenizer.from_pretrained(output_dir)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Materialise every (name, tensor) pair exposed by the model so it can be sliced.
params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

# Each entry pairs a banner with the slice of `params` printed underneath it:
# the first two entries, the next twelve, and the final two.
sections = (
    ('==== Embedding Layer ====\n', params[0:2]),
    ('\n==== First Transformer ====\n', params[2:14]),
    ('\n==== Output Layer ====\n', params[-2:]),
)

for banner, group in sections:
    print(banner)
    for name, tensor in group:
        # Left-align the parameter name, right-align its shape tuple.
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

The GPT-2 model has 148 different named parameters.

==== Embedding Layer ====

transformer.wte.weight                                  (50259, 768)
transformer.wpe.weight                                   (1024, 768)

==== First Transformer ====

transformer.h.0.ln_1.weight                                   (768,)
transformer.h.0.ln_1.bias                                     (768,)
transformer.h.0.attn.c_attn.weight                       (768, 2304)
transformer.h.0.attn.c_attn.bias                             (2304,)
transformer.h.0.attn.c_proj.weight                        (768, 768)
transformer.h.0.attn.c_proj.bias                              (768,)
transformer.h.0.ln_2.weight                                   (768,)
transformer.h.0.ln_2.bias                                     (768,)
transformer.h.0.mlp.c_fc.weight                          (768, 3072)
transformer.h.0.mlp.c_fc.bias                                (3072,)
transformer.h.0.mlp.c_proj.weight                        (3072

# Generate Text

In [4]:
# Put the model in evaluation mode before sampling.
model.eval()

# Change the noun between the "= =" markers; the training data used this
# pattern to mark the word that should be explained.
prompt = "<|startoftext|> = Soap = :"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
# A single, unpadded prompt: every position is a real token, so the mask is
# all ones. Passing it explicitly avoids the "attention mask ... not set"
# warning from `generate`.
attention_mask = torch.ones_like(generated)

print(generated)

sample_outputs = model.generate(
    generated,
    attention_mask=attention_mask,
    # `generate` was already falling back to the EOS id for padding; stating
    # it explicitly keeps that behaviour and silences the warning.
    pad_token_id=model.config.eos_token_id,
    do_sample=True,
    top_k=1000,
    max_length=1000,
    # `top_p` must lie in (0, 1]. The previous value of 2 was out of range;
    # 1.0 is the valid setting that leaves nucleus filtering disabled.
    top_p=1.0,
    num_return_sequences=3,
)

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257,   796,  1406,   499,   796,  1058]], device='cuda:0')
0:  = Soap = : Post-fire microorganisms again have a long and rich history in the environment, now characterized by few neighbors. scale spores areolate, allowing for specific prey influx
 fungi to grow with Earthworms and suck up flowers. In the absence of microorganisms, the pathogenic spores multiply and slow to find new host cells. The fungus's function depends on the host species being present for a given area, but a strong affinity for the microorganisms has been observed for defense purposes. The fungus grows on far deeper ground than the native forest, but the strength of its mushroom invasion depends on the availability of suitable substrates.  A similar rapid growth event may be found with soil plumes, which are warmer than with artificial stagnant air. Widespread reduced temperature and other seasonal influences, such as moisture and humidity, can increase fungus activity. The insects consume organic matte

# Perplexity

In [5]:
# Context-window size, read from the loaded model's config (`n_positions`).
max_length = model.config.n_positions
# Step between successive evaluation windows; passed to `ppl` as `stride`.
stride = 512

def ppl(model, input_ids_all, stride, max_length=None):
    """Compute sliding-window perplexity of ``model`` over one long token sequence.

    The sequence is walked in steps of ``stride``. Each step feeds the model a
    window of at most ``max_length`` tokens that ends at the current position;
    labels for tokens that only serve as left context are set to ``-100`` so
    that only the newest ``trg_len`` tokens are scored.

    Args:
        model: Callable as ``model(input_ids, labels=...)`` whose first output
            element is the loss for the window (expected to be the mean over
            unmasked labels, as Hugging Face causal-LM heads return).
        input_ids_all: Token-id tensor indexed as ``[batch, position]``; the
            notebook passes a single row.
        stride: Number of new tokens scored per window. Must satisfy
            ``0 < stride <= max_length``, otherwise tokens would be skipped.
        max_length: Maximum window length. Defaults to
            ``model.config.n_positions`` so the function no longer depends on
            a notebook-level global that later cells rebind.

    Returns:
        A 0-dim tensor: ``exp(sum(window_loss * trg_len) / sequence_length)``.

    Raises:
        ValueError: If the sequence is empty or ``stride`` is out of range.
    """
    if max_length is None:
        max_length = model.config.n_positions

    seq_len = input_ids_all.size(1)
    if seq_len == 0:
        raise ValueError("input_ids_all must contain at least one token")
    if not 0 < stride <= max_length:
        raise ValueError("stride must satisfy 0 < stride <= max_length")

    # Run on whatever device the model lives on rather than a hard-coded GPU.
    device = getattr(model, "device", input_ids_all.device)

    nlls = []
    for i in tqdm(range(0, seq_len, stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i  # may be different from stride on last loop
        input_ids = input_ids_all[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask the left-context tokens so they do not contribute to the loss.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # Turn the window's mean loss back into a (weighted) sum.
            neg_log_likelihood = outputs[0] * trg_len

        nlls.append(neg_log_likelihood)

    # After the final window `end_loc` equals the sequence length, so this is
    # the same normaliser as before, stated explicitly.
    return torch.exp(torch.stack(nlls).sum() / seq_len)

In [6]:
import json

# JSON text is UTF-8 by specification; state it explicitly so the read does
# not depend on the platform's default locale encoding.
with open("./data/test.json", "r", encoding="utf-8") as json_file:
    # Joined with "\n\n" further down, so presumably a list of strings.
    dealt_test = json.load(json_file)

In [7]:
# Merge the test entries into one document, separated by blank lines, and
# tokenise it in a single pass. The "sequence length is longer than the
# specified maximum" warning that follows is expected here.
test_text = "\n\n".join(dealt_test)
encodings = tokenizer(test_text, return_tensors="pt")

# Bare last expression so the notebook displays the resulting perplexity.
ppl(model, encodings.input_ids, stride)

Token indices sequence length is longer than the specified maximum sequence length for this model (277118 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 542/542 [00:33<00:00, 16.26it/s]


tensor(46.0095, device='cuda:0')

# BioGPT

In [11]:
import os
import time
import datetime
# from google.colab import drive

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler


from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, BioGptForCausalLM


In [12]:
# Directory holding the saved BioGPT checkpoint and its tokenizer files.
output_dir = './model_save_Bio/'

# Pick the device once, up front, and fall back to CPU when CUDA is not
# available instead of crashing inside `.cuda()`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BioGptForCausalLM.from_pretrained(output_dir).to(device)

tokenizer = AutoTokenizer.from_pretrained(output_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Put the model in evaluation mode before sampling.
model.eval()

# Same "= <noun> = :" prompt pattern as the GPT-2 section, for comparison.
prompt = "<|startoftext|> = Soap = :"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
# A single, unpadded prompt: every position is a real token, so the mask is
# all ones. Passing it explicitly keeps `generate` from having to infer it.
attention_mask = torch.ones_like(generated)

print(generated)

sample_outputs = model.generate(
    generated,
    attention_mask=attention_mask,
    do_sample=True,
    top_k=1000,
    max_length=1000,
    # `top_p` must lie in (0, 1]. The previous value of 2 was out of range;
    # 1.0 is the valid setting that leaves nucleus filtering disabled.
    top_p=1.0,
    num_return_sequences=3,
)

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

tensor([[    2, 42384,    43, 10583,  8081,    43,    20]], device='cuda:0')
0: = Soap =: Soap is the sixth annual "railroad in Southern California, the largest city on the Kakapo River in the United States." The nineteenth named road in Southern California in the 18th century, it spread northeastward to eastern Washington, including the Corn Exchange. Due to the combined population of the Kakapo River and the Kakapo River, the area was annexed by the Mustang people as early as the 18th century. A railroad between the Gulf of Mexico and Kakapo was established in 1851, using the South Pacific Railroad as its eastern boundary. These changes made the area more populated by Europeans, and yielded many northern turns. Today, the railway is primarily used for rail traffic and road heads by European and French natives. an Soap, Soap is given a Soap, Inc. The size, the decision by taxed locally, Star of the area declarations of the time, the original belt, the U.S. Fish and of the area declara

In [14]:
# Context-window size, read from the loaded model's config
# (`max_position_embeddings`).
max_length = model.config.max_position_embeddings
# Step between successive evaluation windows; passed to `ppl` as `stride`.
stride = 512

def ppl(model, input_ids_all, stride, max_length=None):
    """Compute sliding-window perplexity of ``model`` over one long token sequence.

    The sequence is walked in steps of ``stride``. Each step feeds the model a
    window of at most ``max_length`` tokens that ends at the current position;
    labels for tokens that only serve as left context are set to ``-100`` so
    that only the newest ``trg_len`` tokens are scored.

    Args:
        model: Callable as ``model(input_ids, labels=...)`` whose first output
            element is the loss for the window (expected to be the mean over
            unmasked labels, as Hugging Face causal-LM heads return).
        input_ids_all: Token-id tensor indexed as ``[batch, position]``; the
            notebook passes a single row.
        stride: Number of new tokens scored per window. Must satisfy
            ``0 < stride <= max_length``, otherwise tokens would be skipped.
        max_length: Maximum window length. Defaults to
            ``model.config.max_position_embeddings`` so the function does not
            depend on a notebook-level global.

    Returns:
        A 0-dim tensor: ``exp(sum(window_loss * trg_len) / sequence_length)``.

    Raises:
        ValueError: If the sequence is empty or ``stride`` is out of range.
    """
    if max_length is None:
        max_length = model.config.max_position_embeddings

    seq_len = input_ids_all.size(1)
    if seq_len == 0:
        raise ValueError("input_ids_all must contain at least one token")
    if not 0 < stride <= max_length:
        raise ValueError("stride must satisfy 0 < stride <= max_length")

    # Run on whatever device the model lives on rather than a hard-coded GPU.
    device = getattr(model, "device", input_ids_all.device)

    nlls = []
    for i in tqdm(range(0, seq_len, stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i  # may be different from stride on last loop
        input_ids = input_ids_all[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask the left-context tokens so they do not contribute to the loss.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # Turn the window's mean loss back into a (weighted) sum.
            neg_log_likelihood = outputs[0] * trg_len

        nlls.append(neg_log_likelihood)

    # After the final window `end_loc` equals the sequence length, so this is
    # the same normaliser as before, stated explicitly.
    return torch.exp(torch.stack(nlls).sum() / seq_len)

In [15]:
import json

# JSON text is UTF-8 by specification; state it explicitly so the read does
# not depend on the platform's default locale encoding.
with open("./data/test.json", "r", encoding="utf-8") as json_file:
    # Joined with "\n\n" further down, so presumably a list of strings.
    dealt_test = json.load(json_file)

In [16]:
# Merge the test entries into one document, separated by blank lines, and
# tokenise it in a single pass. The "sequence length is longer than the
# specified maximum" warning that follows is expected here.
test_text = "\n\n".join(dealt_test)
encodings = tokenizer(test_text, return_tensors="pt")

# Bare last expression so the notebook displays the resulting perplexity.
ppl(model, encodings.input_ids, stride)

Token indices sequence length is longer than the specified maximum sequence length for this model (310653 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 607/607 [01:29<00:00,  6.75it/s]


tensor(451.4723, device='cuda:0')