In [1]:
# Mount Google Drive at /content/drive so later cells can access files stored there.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# import from library
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import torch
# from fake_claims_generator import generate, grab_random_prompts

In [3]:
# Pin transformers to a known-working release. Later versions raised errors at the
# time of writing; an earlier attempt installed v3.1.0 straight from GitHub:
#!pip3 install git+https://github.com/huggingface/transformers@v3.1.0
!pip3 install transformers==3.3.1

Collecting transformers==3.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |▎                               | 10kB 20.6MB/s eta 0:00:01[K     |▋                               | 20kB 21.0MB/s eta 0:00:01[K     |█                               | 30kB 10.5MB/s eta 0:00:01[K     |█▎                              | 40kB 8.7MB/s eta 0:00:01[K     |█▌                              | 51kB 4.2MB/s eta 0:00:01[K     |█▉                              | 61kB 4.8MB/s eta 0:00:01[K     |██▏                             | 71kB 5.0MB/s eta 0:00:01[K     |██▌                             | 81kB 5.2MB/s eta 0:00:01[K     |██▉                             | 92kB 5.6MB/s eta 0:00:01[K     |███                             | 102kB 5.9MB/s eta 0:00:01[K     |███▍                            | 112kB 5.9MB/s eta 0:00:01[K     |███▊                            | 1

In [4]:
# import necessary GPT-2 model architecture and tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [5]:
%cd drive/My\ Drive/School\ Work/Fourth\ Year/Capstone\ Everything

/content/drive/My Drive/School Work/Fourth Year/Capstone Everything


In [6]:
def grab_random_prompts(data, numprompts, first_words=5, column='claims'):
    '''
    Build random prompts from the first few words of randomly chosen rows.

    Rows are drawn uniformly at random with replacement (via ``np.random``), so
    the same row may produce more than one prompt. Rows whose value is not a
    string (e.g. a missing entry) are skipped and another row is drawn.

    Inputs:
    data: table indexed by column name (e.g. a pandas DataFrame) holding the texts
    numprompts int: Number of prompts we would want
    first_words int: How many first words would we want
    column str: Name of the column in ``data`` to take the texts from

    Returns:
    list of str: ``numprompts`` prompts, each made of the first ``first_words``
    whitespace-separated words of a sampled text

    Raises:
    ValueError: if prompts are requested but ``column`` holds no text at all
    '''
    # Work on a plain list so rows are addressed by position, independent of
    # whatever index labels the table carries.
    texts = list(data[column])

    # Without at least one usable row the sampling loop below could never finish.
    if numprompts > 0 and not any(isinstance(text, str) for text in texts):
        raise ValueError(f"column {column!r} contains no text to build prompts from")

    prompts = []
    while len(prompts) < numprompts:
        # randint's upper bound is exclusive: passing len(texts) keeps the last
        # row reachable (len(texts) - 1 would silently exclude it).
        text = texts[np.random.randint(0, len(texts))]
        if not isinstance(text, str):
            continue  # missing / non-text entry: draw again
        prompts.append(" ".join(text.split()[:first_words]))
    return prompts

# Directory holding the fine-tuned abstract model (and its tokenizer files).
OUTPUT_DIR = "./Models/Experiments/Abstracts/3epochs"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load tokenizer and model from the same directory, then move the model to `device`.
tokenizer = GPT2Tokenizer.from_pretrained(OUTPUT_DIR)
model = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR).to(device)
def generate(input_str, length=250, n=5):
    '''
    This is the main generation code using our model.

    Starting from ``input_str``, repeatedly feeds the current token sequence to
    the module-level ``model`` and samples the next token with
    ``choose_from_top`` until ``length`` new tokens have been appended.

    Inputs:
    input_str str: Prompt text the generated output starts with
    length int: Number of tokens to append to the prompt
    n int: Each new token is sampled among the ``n`` most probable candidates

    Returns:
    str: The decoded prompt followed by the generated continuation

    Raises:
    ValueError: if ``input_str`` encodes to no tokens (nothing to condition on)
    '''
    prompt_ids = tokenizer.encode(input_str)
    if not prompt_ids:
        raise ValueError("input_str must encode to at least one token")

    cur_ids = torch.tensor(prompt_ids).unsqueeze(0).long().to(device)
    model.eval()
    with torch.no_grad():
        for _ in range(length):
            # Feed at most the last 1024 tokens (presumably the model's maximum
            # context length). Only the logits are needed, so no labels are
            # passed and no loss is computed.
            logits = model(cur_ids[:, -1024:])[0]
            # Distribution over the vocabulary for the token following the sequence.
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            next_ids = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_ids], dim=1)

    # Index the single batch row explicitly; this also works for a one-token result.
    output_list = cur_ids[0].tolist()
    return tokenizer.decode(output_list)

def choose_from_top(probs, n=5):
    '''
    Sample one index among the ``n`` highest entries of ``probs``.

    The ``n`` largest values are renormalized to sum to 1 and a single index is
    drawn from them with ``np.random.choice``.

    Inputs:
    probs 1-D array of float: Weight of every candidate index
    n int: How many of the top candidates to sample from; values larger than
           ``len(probs)`` are treated as ``len(probs)``

    Returns:
    int: The sampled index into ``probs``

    Raises:
    ValueError: if ``n`` is smaller than 1 or ``probs`` is empty
    '''
    probs = np.asarray(probs)
    if n < 1:
        raise ValueError("n must be at least 1")
    if probs.shape[0] == 0:
        raise ValueError("probs must not be empty")

    # argpartition raises when asked for more entries than exist, so clamp n.
    k = min(n, probs.shape[0])
    ind = np.argpartition(probs, -k)[-k:]
    # Normalize in float64 so the weights sum to 1 within np.random.choice's tolerance.
    top_prob = probs[ind].astype(np.float64)
    top_prob = top_prob / np.sum(top_prob)
    choice = np.random.choice(k, 1, p=top_prob)
    return int(ind[choice][0])

In [7]:
# Show which device was selected ('cuda' when a GPU is available, otherwise 'cpu').
device

'cuda'

## Load Real Data and Label Them


In [8]:
# Read the raw patent table, keep only its abstracts, mark them as real
# (Label 0) and drop the rows that have no abstract.
data = pd.read_csv("./Models/Data/H04L.csv")
real_data = (
    data[["abstract"]]
    .rename(columns={"abstract": "Abstract"})
    .assign(Label=0)
    .dropna(subset=["Abstract"])
)

In [9]:
real_data # shows some of the real patent abstracts

Unnamed: 0,Abstract,Label
0,An organic light emitting diode display includ...,0
1,Provided is an electric element cooling module...,0
2,An organic light emitting diode display includ...,0
3,Provided is an electric element cooling module...,0
4,An organic light emitting diode display includ...,0
...,...,...
8942,A method including patterning a continuous fin...,0
8943,A method including patterning a continuous fin...,0
8944,A method including patterning a continuous fin...,0
8945,A method including patterning a continuous fin...,0


## Generate Fake Data and Label Them

In [None]:
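# Superseded single-pass version, kept for reference only; the next cell generates
# the fake abstracts in chunks and saves them as it goes.
# How many fake claims do you want to generate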
# num_fake_claims = len(real_data)
# prompts = grab_random_prompts(data, num_fake_claims, first_words=5)
# fake_claims = []
# for i, prompt in enumerate(prompts):
#     print(f"Generating Prompt {i+1} ... Prompt: {prompt}")
#     generated_text = generate(prompt, n=8)
#     fake_claims.append(generated_text)
  

In [12]:
# Generate the fake patent abstracts in chunks and append them to a CSV.
#
# 1. Define the file paths where the prompts and the fake abstracts are saved.
# 2. Build the prompts by sampling the first words of real abstracts and save
#    them to a CSV, so that a later session reuses exactly the same prompts.
# 3. Generate one abstract per prompt, writing to disk every SAVE_EVERY abstracts.
#
# Generation runs on Colab and produces thousands of abstracts, so interruptions
# are frequent. `last_interupted` holds the number of prompts already processed.
# It is derived from the rows already saved, so simply re-running this cell
# resumes where the previous run stopped without appending duplicates.
import os.path

fake_data_file = "./Models/Experiments/Abstracts/3epochs/fake_patent_abstracts.csv"
prompt_file = "./Models/Experiments/Abstracts/3epochs/prompts.csv"
SAVE_EVERY = 100  # number of generated abstracts buffered before each write
num_fake_claims = len(real_data)


def _count_saved_abstracts(path):
    """Return how many abstracts are already stored in the CSV at ``path``."""
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return 0
    # Assumes the file starts with a header row, as written by _append_abstracts.
    return len(pd.read_csv(path))


def _append_abstracts(abstracts, path):
    """Append ``abstracts`` (Label 1) to ``path``, writing the header only for a new file."""
    if not abstracts:
        return
    batch = pd.DataFrame({"Abstract": abstracts, "Label": 1})
    is_new_file = not os.path.exists(path) or os.path.getsize(path) == 0
    # The index column is still written so new chunks match the layout of
    # chunks saved by earlier runs.
    batch.to_csv(path, mode='a', header=is_new_file)


if os.path.exists(prompt_file):
    prompts = pd.read_csv(prompt_file)['Prompts'].tolist()
else:
    prompts = grab_random_prompts(data, num_fake_claims, first_words=5, column='abstract')
    # save prompts
    prompt_df = pd.DataFrame({"Prompts": prompts})
    prompt_df.to_csv(prompt_file)

# Every saved row corresponds to one consumed prompt, in order.
last_interupted = _count_saved_abstracts(fake_data_file)

fake_abstracts = []
for i, prompt in enumerate(prompts[last_interupted:]):
    print(f"Generating Prompt {i+1} ... Prompt: {prompt}")
    generated_text = generate(prompt, n=8)
    fake_abstracts.append(generated_text)
    if len(fake_abstracts) >= SAVE_EVERY:
        _append_abstracts(fake_abstracts, fake_data_file)
        fake_abstracts = []

# last batch (fewer than SAVE_EVERY abstracts)
_append_abstracts(fake_abstracts, fake_data_file)
fake_abstracts = []

Generating Prompt 1 ... Prompt: A method of manufacturing a
Generating Prompt 2 ... Prompt: Systems and methods for detecting
Generating Prompt 3 ... Prompt: A light emitting diode is
Generating Prompt 4 ... Prompt: An integrated circuit device includes
Generating Prompt 5 ... Prompt: A data reading error is
Generating Prompt 6 ... Prompt: A stacked-die oscillator package includes
Generating Prompt 7 ... Prompt: A light emitting diode package
Generating Prompt 8 ... Prompt: A digitally controlled oscillator (DCO)
Generating Prompt 9 ... Prompt: A substrate carrier stack includes
Generating Prompt 10 ... Prompt: The disclosed apparatus may include
Generating Prompt 11 ... Prompt: A semiconductor structure includes a
Generating Prompt 12 ... Prompt: A dual-gate, self-aligned lateral double-diffused
Generating Prompt 13 ... Prompt: A method for manufacturing a
Generating Prompt 14 ... Prompt: Disclosed examples provide processes for
Generating Prompt 15 ... Prompt: An analog-to-digital co

In [10]:
# Load the generated abstracts, discarding any column whose name starts with
# "Unnamed" (presumably the index that was saved along with each chunk).
fake_data = (
    pd.read_csv("./Models/Experiments/Abstracts/3epochs/fake_patent_abstracts.csv")
    .pipe(lambda frame: frame.loc[:, ~frame.columns.str.contains('^Unnamed')])
)
fake_data

Unnamed: 0,Abstract,Label
0,Protective dielectrics are discussed generally...,1
1,The disclosed technology generally relates to ...,1
2,An ultraviolet light-emitting device including...,1
3,A HDR CTIA pixel which provides automatic gain...,1
4,Quantum dot polymer composites for inter-...,1
...,...,...
8942,An application specific integrated circuit (AS...,1
8943,An OLED device and a method of manufacturing t...,1
8944,Disclosed herein is an organic semiconduc...,1
8945,A data reading error is reduced. The data read...,1


In [11]:
# Number of generated abstracts that were loaded.
len(fake_data)

8947

In [12]:
# Load the saved prompts that were used to seed the generated abstracts.
fake_prompts = pd.read_csv("./Models/Experiments/Abstracts/3epochs/prompts.csv")

In [13]:
# Inspect the first generated abstract.
# NOTE(review): the sample output runs past an "<EOS>" marker into a second
# abstract; consider truncating generated text at "<EOS>" before using it.
fake_data['Abstract'][0]

'Protective dielectrics are discussed generally herein. In one or more exemplary embodiments, one or more epitaxial or      semiconductor-on-insulator-die (OEL) capacitive modules (150),     particularly suitable for RF filters, are formed on the order of nanometers or.     nm. In one or more exemplary embodiments, a III-V or V-N material may      be formed on the order of 1 nm. <EOS>\n<BOS> An organic light emitting diode and an organic light emitting display      panel, the organic light emitting diode including an anode disposed on a      base layer; a first organic light emitting layer disposed on the anode; a      cathode disposed on the first organic light emitting layer; and an      electron control layer disposed between the first organic light emitting      layer and the cathode, the electron control layer including ytterbium,      wherein the cathode includes a first inorganic compound layer contacting      the electron control layer to form a P-N junction with the electron'

In [14]:
# Inspect the first saved prompt, for comparison with the first generated abstract.
fake_prompts['Prompts'][0]

'Protective dielectrics are discussed generally'

In [15]:
# Combine the real (Label 0) and generated (Label 1) abstracts into one table.
frames = [real_data, fake_data]
full_data = pd.concat(frames)

# Shuffle the rows so real and fake abstracts are interleaved. The seed is fixed
# so that re-running the notebook produces the same row order.
SHUFFLE_SEED = 42
full_data = full_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)


In [16]:
# Preview the shuffled table of real and generated abstracts.
full_data


Unnamed: 0,Abstract,Label
0,An analog-to-digital converter is provided. Th...,0
1,An apparatus comprising: a module; a substrate...,0
2,Disclosed herein is an organic light-emitting ...,0
3,A piezoelectric linear actuator comprising a l...,0
4,A semiconductor device includes a first c...,1
...,...,...
17889,An organic light emitting diode display includ...,0
17890,A semiconductor device of an embodiment in...,1
17891,"A MEMS IR sensor, with a cavity in a substrate...",0
17892,A process for producing a light emitting unit ...,1


'Embodiments of the present disclosure include methods and      configurations for an image sensor capable of simultaneous integration of      electrons and holes. According to one example, the image sensor can process      signals emitted from holes split into mutually different sub-lenses that are      indistinguishable at multiple wavelengths. <EOS>\n<BOS> A semiconductor device is provided. The semiconductor device includes a      carrier. The carrier includes a first region. The first region includes      holes. The carrier further includes an oxygen exchange layer (OEL) between      the first region and the OEL. The oxygen exchange layer includes a first portion      of a trench that extends in a first direction crossing the horizontal      direction, and a second portion that is in a second direction crossing the      horizontal direction and extending in a third direction crossing the      horizontal direction. A distance between an edge of the first region and an edge      of 

In [17]:
# Save the shuffled, labelled table of real and generated abstracts.
# NOTE(review): the file name says "claims" although the rows hold abstracts, and
# the DataFrame index is written as an extra leading column (no index=False).
full_data.to_csv("./Models/Experiments/Abstracts/3epochs/real_fake_claims.csv")