In [3]:
import argparse
import os

from bs4 import BeautifulSoup
#from googlesearch import search
import numpy as np
import requests
from transformers import GPT2Config, GPT2LMHeadModel
import torch
from tqdm import tnrange, tqdm_notebook

from dataset import GPT21024Dataset 
from utils import add_special_tokens, beam_search, generate_beam_sample, generate_sample, sample_seq, set_seed, top_k_top_p_filtering

In [4]:
# Notebook configuration. Change the defaults below if needed.
#
# parse_args([]) is called with an empty list so that argparse ignores the
# Jupyter kernel's own command-line arguments and only the defaults apply.

parser = argparse.ArgumentParser()

parser.add_argument("--seed",default=42, type=int,  help="seed to replicate results")
parser.add_argument("--num_workers",default=4, type=int,  help="num of cpus available")
# Fall back to the CPU when no CUDA device is present so the notebook still
# runs on GPU-less machines; type=torch.device converts an overriding string
# such as "cpu" or "cuda:1" into a real torch.device object.
parser.add_argument(
    "--device",
    default=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    type=torch.device,
    help="torch.device object",
)
parser.add_argument("--output_dir",default='./output', type=str,  help="path to save evaluation results")
parser.add_argument("--model_dir",default='./weights', type=str,  help="path to save trained model")
parser.add_argument("--root_dir",default='./pubmed/gpt2_1024_data', type=str, help="location of json dataset.")
parser.add_argument("--ids_file",default='./pubmed/ids.json', type=str, help="location of train, valid and test file indexes")
args = parser.parse_args([])
print(args)

Namespace(seed=42, num_workers=4, device=device(type='cuda'), output_dir='./output', model_dir='./weights', root_dir='./pubmed/gpt2_1024_data', ids_file='./pubmed/ids.json')


In [5]:
# Build the tokenizer via add_special_tokens() and load the test split only.
# The train/valid loaders are kept commented out for reference; uncomment them
# if those splits are needed in this notebook.
tokenizer = add_special_tokens()
# train_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='train',length=3000)
# valid_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='valid',length=500)
test_data = GPT21024Dataset(
    args.root_dir,
    args.ids_file,
    mode='test',
    length=770,
)


In [7]:
# Load the fine-tuned GPT-2 model. model_file and config_file name the
# checkpoint and its config; change them to match your own file names.

# model_file = os.path.join(args.model_dir, 'model_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(len(train_data),args.num_train_epochs))
# config_file = os.path.join(args.model_dir, 'config_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(len(train_data),args.num_train_epochs))

# path to model and config files
model_file = os.path.join(args.model_dir, "model_data6100_trained_after_5_epochs_only_sum_loss_ignr_pad.bin")
config_file = os.path.join(args.model_dir, "config_data6100_trained_after_5_epochs_only_sum_loss_ignr_pad.json")

config = GPT2Config.from_json_file(config_file)
model = GPT2LMHeadModel(config)
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on
# any machine; the model is moved to the target device afterwards.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
state_dict = torch.load(model_file, map_location="cpu")
model.load_state_dict(state_dict)
# Inference only: eval() switches off dropout.
model.eval()
model.to(args.device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [10]:
# Generate summaries for two test-set articles using temperature / top-k /
# top-p sampling.
generate_sample(
    test_data,
    tokenizer,
    model,
    num=2,
    length=100,
    temperature=1,
    top_k=10,
    top_p=0.5,
    device=args.device,
)

  0%|          | 0/100 [00:00<?, ?it/s]

new_article

The human ELL gene on chromosome 19 undergoes frequent translocations with the trithorax-like MLL gene on chromosome 11 in acute myeloid leukemias. Here, ELL was shown to encode a previously uncharacterized elongation factor that can increase the catalytic rate of RNA polymerase II transcription by suppressing transient pausing by polymerase at multiple sites along the DNA. Functionally, ELL resembles Elongin (SIII), a transcription elongation factor regulated by the product of the von Hippel-Lindau (VHL) tumor suppressor gene. The discovery of a second elongation factor implicated in oncogenesis provides further support for a close connection between the regulation of transcription elongation and cell growth.

generated_summary

 ELL enables DNA-binding transcription factor activity. ELL involved_in transcription by RNA polymerase II. ELL involved_in DNA-binding transcription factor activity. ELL involved_in DNA-binding transcription factor activity. ELL involved_in DNA-b

  0%|          | 0/100 [00:00<?, ?it/s]

new_article

The function of the neuronal high molecular weight microtubule-associated proteins (MAPs) MAP1b and MAP2 is regulated by the degree of their phosphorylation, which in turn is controlled by the activities of protein kinases and protein phosphatases (PP). To investigate the role of PP in the regulation of the phosphorylation of MAP1b and MAP2, we used okadaic acid and cyclosporin A to selectively inhibit PP2A and PP2B activities, respectively, in metabolically competent rat brain slices. The alteration of the phosphorylation levels of MAP1b and MAP2 was examined by Western blots using several phosphorylation-dependent antibodies to these proteins. The inhibition of PP2A, and to a lesser extent of PP2B, was found to induce an increased phosphorylation of MAP1b and inhibit its microtubule binding activity. Immunocytochemically, a marked increase in neuronal staining in inhibitor-treated tissue was observed with antibodies to the phosphorylated MAP1b. The inhibition of PP2A but

In [11]:
# Generate summaries for two test-set articles using beam search with a beam
# of size 3.
generate_beam_sample(
    test_data,
    tokenizer,
    model,
    num=2,
    length=100,
    beam_size=3,
    device=args.device,
)

  0%|          | 0/99 [00:00<?, ?it/s]

new_article

The human ELL gene on chromosome 19 undergoes frequent translocations with the trithorax-like MLL gene on chromosome 11 in acute myeloid leukemias. Here, ELL was shown to encode a previously uncharacterized elongation factor that can increase the catalytic rate of RNA polymerase II transcription by suppressing transient pausing by polymerase at multiple sites along the DNA. Functionally, ELL resembles Elongin (SIII), a transcription elongation factor regulated by the product of the von Hippel-Lindau (VHL) tumor suppressor gene. The discovery of a second elongation factor implicated in oncogenesis provides further support for a close connection between the regulation of transcription elongation and cell growth

actual_summary

An RNA polymerase II elongation factor encoded by the human ELL gene.ELL involved_in positive regulation of DNA-templated transcription, elongation. <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|

  0%|          | 0/99 [00:00<?, ?it/s]

new_article

The function of the neuronal high molecular weight microtubule-associated proteins (MAPs) MAP1b and MAP2 is regulated by the degree of their phosphorylation, which in turn is controlled by the activities of protein kinases and protein phosphatases (PP). To investigate the role of PP in the regulation of the phosphorylation of MAP1b and MAP2, we used okadaic acid and cyclosporin A to selectively inhibit PP2A and PP2B activities, respectively, in metabolically competent rat brain slices. The alteration of the phosphorylation levels of MAP1b and MAP2 was examined by Western blots using several phosphorylation-dependent antibodies to these proteins. The inhibition of PP2A, and to a lesser extent of PP2B, was found to induce an increased phosphorylation of MAP1b and inhibit its microtubule binding activity. Immunocytochemically, a marked increase in neuronal staining in inhibitor-treated tissue was observed with antibodies to the phosphorylated MAP1b. The inhibition of PP2A but

## Download An Article Given A Query

In [7]:
def sentences_from_query(query):
    """Return the paragraph text of the web page selected by ``query``.

    Args:
        query: Either a URL (any string starting with "http"), which is
            fetched directly, or a free-text search query, in which case the
            first result returned by ``googlesearch.search`` is fetched.

    Returns:
        One string made of the stripped text of every ``<p>`` tag on the
        page, joined with single spaces.

    Raises:
        ValueError: If the search returns no result for ``query``.
        requests.HTTPError: If the page answers with an error status code.
    """
    if query.startswith("http"):
        url = query
    else:
        # Imported lazily: the notebook-level import is commented out, so this
        # branch would otherwise fail with a NameError on a fresh kernel.
        from googlesearch import search

        # next(iter(...)) works whether search() returns a list or a generator
        # and lets an empty result be reported clearly.
        url = next(iter(search(query, num_results=1)), None)
        if url is None:
            raise ValueError("no search result found for query: {!r}".format(query))
    print(url)
    # A timeout keeps an unresponsive server from hanging the cell forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of returning the text of an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    # Collect the text of each <p> tag, stripped of surrounding whitespace.
    return " ".join(tag.get_text().strip() for tag in soup.find_all("p"))

In [8]:
# Download the first search hit for the query, tokenize it, and keep only the
# first 900 token ids so the prompt stays below the model's 1024 positions.
article = tokenizer.encode(
    sentences_from_query("neural embedding")
)[:900]

https://towardsdatascience.com/neural-network-embeddings-explained-4d028e6f0526


Token indices sequence length is longer than the specified maximum sequence length for this model (1957 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
# Run sampling for 50 steps conditioned on the article, drop the prompt prefix
# from the first returned sequence, and decode the remaining ids to a string.
generated_text = sample_seq(
    model,
    article,
    50,
    args.device,
    temperature=1,
    top_k=10,
    top_p=0.5,
)[0, len(article):].tolist()
text = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(generated_text, skip_special_tokens=True)
)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [10]:
# Show the (truncated) source article followed by the generated summary.
print(
    "Article: \n",
    tokenizer.decode(article),
    "------------------------------------------------------------ \n",
    "Generated Summary: \n",
    text,
    sep="\n",
)

Article: 

Applications of neural networks have expanded significantly in recent years from image segmentation to natural language processing to time-series forecasting. One notably successful use of deep learning is embedding, a method used to represent discrete variables as continuous vectors. This technique has found practical applications with word embeddings for machine translation and entity embeddings for categorical variables. In this article, I’ll explain what neural network embeddings are, why we want to use them, and how they are learned. We’ll go through these concepts in the context of a real problem I’m working on: representing all the books on Wikipedia as vectors to create a book recommendation system. An embedding is a mapping of a discrete — categorical — variable to a vector of continuous numbers. In the context of neural networks, embeddings are low-dimensional, learned continuous vector representations of discrete variables. Neural network embeddings are useful bec