In [34]:
# Grab prompt words from existing patent claims and generate new claims from them;
# the generated text will then be used for text classification.
import pandas as pd
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import numpy as np


In [35]:
# Load the patent records from CSV; later cells sample text from its 'claims' column.
data = pd.read_csv("../H01L.csv")
# Show the first rows as a quick sanity check of the loaded columns.
data.head()

Unnamed: 0,uid,url,claims,abstract,description,collection_time,write_date
0,1,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,1. An organic light emitting diode display co...,An organic light emitting diode display includ...,This application claims priority to Korean P...,2020-03-03 05:37:01.550764,2020-03-02 18:37:01.547157
1,2,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,1. An electric element cooling module compris...,Provided is an electric element cooling module...,CROSS-REFERENCE TO RELATED APPLICATIONS This...,2020-03-03 05:37:01.550849,2020-03-02 18:37:01.547352
2,3,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,2. The organic light emitting diode display o...,An organic light emitting diode display includ...,This application claims priority to Korean P...,2020-03-03 05:37:01.603471,2020-03-02 18:37:01.548018
3,4,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,2. The electric element cooling module of cla...,Provided is an electric element cooling module...,CROSS-REFERENCE TO RELATED APPLICATIONS This...,2020-03-03 05:37:01.604237,2020-03-02 18:37:01.548179
4,5,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,3. The organic light emitting diode display o...,An organic light emitting diode display includ...,This application claims priority to Korean P...,2020-03-03 05:37:01.607406,2020-03-02 18:37:01.548057


In [40]:
def grab_random_prompts(data, numprompts, first_words=5, column='claims'):
    '''
    Build prompts from the leading words of randomly sampled rows.

    Rows are drawn uniformly and with replacement from the entries of
    ``data[column]`` that are non-blank strings, so the same prompt can
    appear more than once. Missing (non-string) and blank entries are
    never sampled.

    Inputs:
    data: table-like object where ``data[column]`` is iterable (e.g. a pandas DataFrame)
    numprompts int: Number of prompts we would want
    first_words int: How many first words would we want
    column str: Name of the column holding the source text

    Returns:
    list of str holding ``numprompts`` prompts (empty when ``numprompts`` <= 0)

    Raises:
    ValueError: if prompts are requested but the column has no usable text
    '''
    if numprompts <= 0:
        return []

    # Filter once up front instead of retrying on bad rows: a retry loop
    # never terminates when no row contains usable text.
    candidates = [
        text for text in list(data[column])
        if isinstance(text, str) and text.strip()
    ]
    if not candidates:
        raise ValueError(f"No usable text found in column {column!r}")

    prompts = []
    while len(prompts) < numprompts:
        # randint's upper bound is exclusive, so pass the full length to
        # make every candidate (including the last one) reachable.
        pick = np.random.randint(0, len(candidates))
        words = candidates[pick].split()
        prompts.append(" ".join(words[:first_words]))
    return prompts

In [41]:
# Preview 20 sampled prompts, each made of the first five words of a claim.
grab_random_prompts(data, 20, first_words=5)

['5. The light emitting device',
 '20. The process according to',
 '8. The process according to',
 '7. The method of claim',
 '22. A light emitting device',
 '10. The apparatus of claim',
 '5. The pressure sensor of',
 '14. The method of claim',
 '9. The method of manufacturing',
 '14. The method of claim',
 '11. A display device, comprising',
 '19. The method of claim',
 '11. The photoelectric conversion element',
 '7. The magnetic structure according',
 '5. The integrated circuit device',
 '14. The display device according',
 '19. The carbon nanotube nanostructure',
 '15. The method of claim',
 '12. An electronic device comprising',
 '2. The system of claim']

In [38]:
# Directory the tokenizer and model are loaded from.
OUTPUT_DIR = "./models"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = GPT2Tokenizer.from_pretrained(OUTPUT_DIR)
model = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR).to(device)
def generate(input_str, length=250, n=5):
    '''
    Continue ``input_str`` by sampling ``length`` more tokens from the model.

    Each step feeds the tokens produced so far to the module-level ``model``,
    turns the logits of the last position into probabilities and lets
    ``choose_from_top`` pick the next token among the ``n`` most probable.
    Generation always runs for ``length`` steps; it does not stop early.

    Inputs:
    input_str str: Prompt text; must encode to at least one token
    length int: Number of tokens to append to the prompt
    n int: Size of the candidate pool handed to ``choose_from_top``

    Returns:
    str: The decoded prompt followed by the generated tokens

    Raises:
    ValueError: if ``input_str`` encodes to no tokens
    '''
    prompt_ids = tokenizer.encode(input_str)
    if not prompt_ids:
        raise ValueError("input_str must encode to at least one token")

    cur_ids = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    model.eval()
    with torch.no_grad():
        for _ in range(length):
            # Only the logits are needed, so no labels are passed; this skips
            # a loss computation over the whole sequence on every step.
            # The input is limited to the most recent 1024 tokens
            # (presumably the model's maximum context length -- confirm).
            logits = model(cur_ids[:, -1024:])[0]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

    # Index the batch dimension rather than squeeze(): squeeze() would collapse
    # a one-token sequence to a scalar, which cannot be turned into a list.
    return tokenizer.decode(cur_ids[0].tolist())

def choose_from_top(probs, n=5):
    '''
    Sample one index from the ``n`` largest entries of ``probs``.

    The ``n`` largest values are renormalised to sum to one and a single
    index is drawn according to those weights.

    Inputs:
    probs 1-D array: Non-negative weights, one per candidate index
    n int: How many of the largest entries may be chosen from; values larger
        than ``len(probs)`` are treated as ``len(probs)``

    Returns:
    int: Index into ``probs`` of the chosen entry

    Raises:
    ValueError: if ``n`` is smaller than 1 or ``probs`` is empty
    '''
    probs = np.asarray(probs)
    if n < 1:
        raise ValueError("n must be at least 1")
    if probs.shape[0] == 0:
        raise ValueError("probs must not be empty")

    # argpartition rejects a kth beyond the array length, so never ask for
    # more candidates than there are entries.
    n = min(n, probs.shape[0])
    ind = np.argpartition(probs, -n)[-n:]

    # Renormalise in float64 so the weights sum to one as closely as possible
    # even when the input is lower precision.
    top_prob = probs[ind].astype(np.float64)
    top_prob = top_prob / np.sum(top_prob)

    choice = np.random.choice(n, 1, p=top_prob)
    return int(ind[choice][0])

In [42]:
# Generate text for randomly retrieved prompt words: draw 12 five-word prompts
# from the data, then print each prompt followed by its generated text
# (the generated text itself begins with the prompt).
prompts = grab_random_prompts(data, 12, first_words=5)
for prompt in prompts:
    # n=8: each next token is sampled from the 8 most probable candidates.
    generated_text = generate(prompt, n=8)
    print(f"Prompt: {prompt}")
    print(generated_text)

Prompt: 14. The light-emitting device defined
14. The light-emitting device defined in claim 1, wherein the first light-emitting layer has a first width and a second width;  a first portion of the first portion is electrically connected to the second portion of the light-emitting layer;  and a second portion is electrically connected to the first portion of the light-emitting layer, wherein the first portion is electrically connected with the first portion of the second portion and the second portion is electrically connected with the second portion of the light-emitting layer. <EOS>
<BOS> 7.  An electronic device, comprising: a substrate;  and a gate stack comprising an interlayer substrate having a first gate electrode having a first width, and a second gate electrode having a second width;  wherein the substrate is formed of a first material and a second material;  each gate stack comprises a first conductive layer disposed between the first portion of the gate electrode and the sec

Prompt: 1. A multi-junction solar cell,
1. A multi-junction solar cell, comprising: a first conductive metal layer and a second conductive metal layer disposed between the first conductive layer and the second conductive metal layer;  a first conductive material layer disposed between the first conductive material layer and the second conductive material layer, the first conductive material layer being in contact with the first semiconductor die, the second conductive material layer including a first conductive layer and a second conductive material layer and comprising a first and a second insulating layer on each of the second conductive material layer and the first insulating material layer, the first conductive material layer including a first insulating material layer and a second insulating material layer;  a plurality of semiconductor layers, each having a first insulating film on one side of the first semiconductor film and an intermediate film on the other side of the first se