In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# import from library
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import torch
# from fake_claims_generator import generate, grab_random_prompts

In [3]:
#!pip3 install git+https://github.com/huggingface/transformers@v3.1.0 # there were errors in the later versions, this version allows us to do our job for now
!pip3 install transformers==3.3.1

Collecting transformers==3.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 13.3MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 50.4MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 60.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (

In [4]:
# import necessary GPT-2 model architecture and tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [5]:
%cd drive/My\ Drive/School\ Work/Fourth\ Year/Capstone\ Everything

/content/drive/My Drive/School Work/Fourth Year/Capstone Everything


In [6]:
def grab_random_prompts(data, numprompts, first_words=5, column='claims'):
    '''
    Inputs:
    numprompts int: Number of prompts we would want
    first_words int: How many first words would we want
    '''
    num_data = len(data)
    prompts = []
    while len(prompts) < numprompts:
        row = np.random.randint(0, num_data-1)
        try:
            words = data[column][row].split()
            firsts = " ".join(words[:first_words])
            prompts.append(firsts)
        except:
            #print("No claims")
            pass
    
#     for indx in rows:
#         words = data[column][indx].split()
#         firsts = " ".join(words[:first_words])
#         prompts.append(firsts)
    return prompts

OUTPUT_DIR = "./Models/Experiments/Abstracts/10epochs" # the model for abstract is stored here
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

tokenizer = GPT2Tokenizer.from_pretrained(OUTPUT_DIR)
model = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR)
model = model.to(device)
def generate(input_str, length=250, n=5):
    '''
    This is the main generation code using our model
    '''
    cur_ids = torch.tensor(tokenizer.encode(input_str)).unsqueeze(0).long().to(device)
    model.eval()
    with torch.no_grad():
        for i in range(length):
            outputs = model(cur_ids[:, -1024:], labels=cur_ids[:, -1024:])
            loss, logits = outputs[:2]
            softmax_logits = torch.softmax(logits[0,-1], dim=0)
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim=1)
        output_list = list(cur_ids.squeeze().to('cpu').numpy())
        output_text = tokenizer.decode(output_list)
        return output_text

def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [7]:
device

'cuda'

## Load Real Data and Label Them


In [8]:
data = pd.read_csv("./Models/Data/H04L.csv")
real_data = pd.DataFrame({"Abstract": data["abstract"]})
real_data["Label"] = 0
real_data = real_data[real_data['Abstract'].notnull()]

In [9]:
real_data # shows some of the real patent claims

Unnamed: 0,Abstract,Label
0,An organic light emitting diode display includ...,0
1,Provided is an electric element cooling module...,0
2,An organic light emitting diode display includ...,0
3,Provided is an electric element cooling module...,0
4,An organic light emitting diode display includ...,0
...,...,...
8942,A method including patterning a continuous fin...,0
8943,A method including patterning a continuous fin...,0
8944,A method including patterning a continuous fin...,0
8945,A method including patterning a continuous fin...,0


## Generate Fake Data and Label Them

In [None]:
# How many fake claims do you want to generate
# num_fake_claims = len(real_data)
# prompts = grab_random_prompts(data, num_fake_claims, first_words=5)
# fake_claims = []
# for i, prompt in enumerate(prompts):
#     print(f"Generating Prompt {i+1} ... Prompt: {prompt}")
#     generated_text = generate(prompt, n=8)
#     fake_claims.append(generated_text)
  

In [12]:
# write in chunks, and write to csv
'''
This code block is responsible for generating fake patent abstracts
I first deine the file paths where we will save the fake patent data
Then, I generate the fake patent prompts by sampling from the real abstracts
I save the prompts into a csv file.
Since I am running the generation on Colab and I'm generating thousands of fake patent abstracts,
there is a high frequency of interuption, I therefore need to save the prompts (to be accessed later),
and save the patent abstracts for every 100 that are generated.
The last_interupted variable stores where the generation was last interupted.
'''
import os.path
fake_data_file = "./Models/Experiments/Abstracts/10epochs/fake_patent_abstracts.csv"
prompt_file = "./Models/Experiments/Abstracts/10epochs/prompts.csv"
num_fake_claims = len(real_data)
if os.path.exists(prompt_file):
  prompts = pd.read_csv(prompt_file)
  prompts = prompts['Prompts'].tolist()
else:
  prompts = grab_random_prompts(data, num_fake_claims, first_words=5, column='abstract')
  # save prompts
  prompt_df = pd.DataFrame({"Prompts": prompts})
  prompt_df.to_csv(prompt_file)
fake_abstracts = []
last_interupted = 5400
for i, prompt in enumerate(prompts[last_interupted:]): # modify last_interupted if we got disconnected
    if i % 100 == 0 and i != 0: # for every 100 abstracts
      df = pd.DataFrame({"Abstract": fake_abstracts, "Label": 1})
      if os.path.exists(fake_data_file): # this means this csv is already created
        df.to_csv(fake_data_file, mode='a', header=False)
      else:
        df.to_csv(fake_data_file)
      fake_abstracts = []
    print(f"Generating Prompt {i+1} ... Prompt: {prompt}")
    generated_text = generate(prompt, n=8)
    fake_abstracts.append(generated_text)
# last batch (that doesn't go over 100)
df = pd.DataFrame({"Claims": fake_abstracts, "Label": 1})
df.to_csv(fake_data_file, mode='a', header=False)

Generating Prompt 1 ... Prompt: A solar cell includes a
Generating Prompt 2 ... Prompt: Disclosed herein is an organic
Generating Prompt 3 ... Prompt: A solid-state image pickup device
Generating Prompt 4 ... Prompt: A substrate support assembly for
Generating Prompt 5 ... Prompt: An organic light emitting display
Generating Prompt 6 ... Prompt: Disclosed is an image sensor.
Generating Prompt 7 ... Prompt: We describe a method for
Generating Prompt 8 ... Prompt: A method for fabricating semiconductor
Generating Prompt 9 ... Prompt: A method of manufacturing a
Generating Prompt 10 ... Prompt: A semiconductor device includes a
Generating Prompt 11 ... Prompt: A chip accommodation tray for
Generating Prompt 12 ... Prompt: A method of manufacturing a
Generating Prompt 13 ... Prompt: A photoelectric conversion device includes
Generating Prompt 14 ... Prompt: A group III-N nanowire is
Generating Prompt 15 ... Prompt: A three-dimensional memory device including
Generating Prompt 16 ... Prompt

In [13]:
fake_data = pd.read_csv("./Models/Experiments/Abstracts/10epochs/fake_patent_abstracts.csv")
fake_data = fake_data.loc[:, ~fake_data.columns.str.contains('^Unnamed')]
fake_data

Unnamed: 0,Abstract,Label
0,An organic light-emitting display including a ...,1
1,A method for transporting a cassette pod for c...,1
2,A semiconductor device includes a first word l...,1
3,Systems and methods that may be implemented to...,1
4,A structure containing a vertical light emitti...,1
...,...,...
8942,A semiconductor device includes a first word l...,1
8943,An organic light emitting device including: a ...,1
8944,The present invention relates to compounds of ...,1
8945,Quantum dot polymer composites for on-chip lig...,1


In [14]:
len(fake_data)

8947

In [15]:
fake_prompts = pd.read_csv("./Models/Experiments/Abstracts/10epochs/prompts.csv")

In [16]:
fake_data['Abstract'][0]

'An organic light-emitting display including a substrate, an insulating      layer on the substrate, the substrate and the insulating layer having an      opening therethrough penetrating, a pixel array on the insulating layer,      the pixel array including a plurality of pixels that surround the      opening, a first pixel adjacent to the opening from among the plurality      of pixels includes a pixel electrode layer, an intermediate layer on the      pixel electrode layer, and an opposite electrode layer on the      intermediate layer, and a stepped portion on the substrate and adjacent      to the opening, the stepped portion having an under-cut step, wherein the      intermediate layer including an organic emission layer, and wherein at      least one of the intermediate layer and the opposite electrode layer      extends toward the opening and is disconnected by the stepped portion. <EOS>\n<BOS> An organic light emitting diode and an organic light emitting display      panel, th

In [17]:
fake_prompts['Prompts'][0]

'An organic light-emitting display including'

In [18]:
frames = [real_data, fake_data]
full_data = pd.concat(frames)


full_data = full_data.sample(frac=1).reset_index(drop=True)


In [19]:
full_data


Unnamed: 0,Abstract,Label
0,An integrated circuit device includes: a first...,1
1,A hermetic capsule including a semiconductor/m...,0
2,An organic EL display device 1 includes a flex...,1
3,The present disclosure provides a package stru...,1
4,A digitally controlled oscillator (DCO) may in...,0
...,...,...
17889,"An organic electroluminescent device, a method...",1
17890,The object of the present invention is to make...,0
17891,Disclosed examples provide processes for fabri...,0
17892,An analog-to-digital converter includes a comp...,1


'Embodiments of the present disclosure include methods and      configurations for an image sensor capable of simultaneous integration of      electrons and holes. According to one example, the image sensor can process      signals emitted from holes split into mutually different sub-lenses that are      indistinguishable at multiple wavelengths. <EOS>\n<BOS> A semiconductor device is provided. The semiconductor device includes a      carrier. The carrier includes a first region. The first region includes      holes. The carrier further includes an oxygen exchange layer (OEL) between      the first region and the OEL. The oxygen exchange layer includes a first portion      of a trench that extends in a first direction crossing the horizontal      direction, and a second portion that is in a second direction crossing the      horizontal direction and extending in a third direction crossing the      horizontal direction. A distance between an edge of the first region and an edge      of 

In [20]:
full_data.to_csv("./Models/Experiments/Abstracts/10epochs/real_fake_abstracts.csv")