In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# import from library
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import torch
# from fake_claims_generator import generate, grab_random_prompts

In [3]:
#!pip3 install git+https://github.com/huggingface/transformers@v3.1.0 # there were errors in the later versions, this version allows us to do our job for now
!pip3 install transformers==3.3.1

Collecting transformers==3.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 16.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 56.4MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 54.1MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (

In [4]:
# import necessary GPT-2 model architecture and tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [5]:
%cd drive/My\ Drive/School\ Work/Fourth\ Year/Capstone\ Everything

/content/drive/My Drive/School Work/Fourth Year/Capstone Everything


In [6]:
def grab_random_prompts(data, numprompts, first_words=5, column='claims'):
    '''
    Build prompts from the opening words of randomly chosen rows of `data`.

    Rows are drawn uniformly at random, with replacement, from the rows of
    `data[column]` that hold text with at least one word; missing or blank
    entries are never selected.

    Inputs:
    data: Table-like object (e.g. a pandas DataFrame) indexable as data[column]
    numprompts int: Number of prompts we would want
    first_words int: How many first words would we want
    column str: Name of the column the texts are read from

    Returns:
    list of str: `numprompts` prompts, each the first `first_words`
    whitespace-separated words of a sampled text joined by single spaces.
    An empty list is returned when `numprompts` is zero or negative.

    Raises:
    KeyError: If `column` is not present in `data` (raised by data[column]).
    ValueError: If prompts are requested but `column` holds no usable text.
    '''
    if numprompts <= 0:
        return []

    # Keep only real, non-blank strings. Filtering up front (instead of
    # retrying on any exception) means a missing column or an all-null column
    # fails loudly rather than looping forever.
    texts = [text for text in data[column] if isinstance(text, str) and text.strip()]
    if not texts:
        raise ValueError(f"column {column!r} contains no text to build prompts from")

    # The upper bound of np.random.randint is exclusive, so len(texts) is the
    # bound that makes every candidate (including the last one) reachable.
    picks = np.random.randint(0, len(texts), size=numprompts)
    return [" ".join(texts[i].split()[:first_words]) for i in picks]

# Directory where the model for abstracts (and its tokenizer files) is stored.
OUTPUT_DIR = "./Models/Experiments/Abstracts/20epochs"

# Use the GPU whenever torch can see one; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the tokenizer and the language model from OUTPUT_DIR, then move the
# model onto the selected device.
tokenizer = GPT2Tokenizer.from_pretrained(OUTPUT_DIR)
model = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR).to(device)
def generate(input_str, length=250, n=5):
    '''
    This is the main generation code using our model.

    Starting from `input_str`, one token at a time is sampled from the
    model's `n` most probable next tokens and appended to the sequence.

    Inputs:
    input_str str: Prompt text that seeds the generation
    length int: Number of new tokens to append to the prompt
    n int: Size of the top-n pool each new token is drawn from (see choose_from_top)

    Returns:
    str: The decoded prompt followed by the generated tokens

    Raises:
    ValueError: If `input_str` encodes to zero tokens
    '''
    prompt_ids = tokenizer.encode(input_str)
    if len(prompt_ids) == 0:
        # Fail here with a clear message instead of deep inside the model.
        raise ValueError("input_str must encode to at least one token")

    cur_ids = torch.tensor(prompt_ids).unsqueeze(0).long().to(device)
    model.eval()
    with torch.no_grad():
        for _ in range(length):
            # Only the most recent 1024 tokens are fed to the model.
            # No `labels` are passed: the loss they produce was never used, and
            # without them the first element of the output is the logits.
            logits = model(cur_ids[:, -1024:])[0]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

    # Index the batch dimension explicitly; squeeze() would also collapse a
    # one-token sequence into a scalar.
    output_list = cur_ids[0].to('cpu').tolist()
    return tokenizer.decode(output_list)

def choose_from_top(probs, n=5):
    '''
    Sample one index from the `n` largest entries of `probs`.

    The `n` largest values are renormalised to sum to 1 and a single index is
    drawn according to those weights.

    Inputs:
    probs 1-D array of float: Non-negative weights, one per candidate index
    n int: How many of the largest entries are eligible; capped at len(probs)

    Returns:
    int: Index into `probs` of the chosen entry

    Raises:
    ValueError: If `n` is smaller than 1 or `probs` is empty
    '''
    probs = np.asarray(probs)
    if n < 1:
        raise ValueError("n must be at least 1")
    if probs.size == 0:
        raise ValueError("probs must not be empty")

    # argpartition rejects a kth outside the array, so never ask for more
    # candidates than there are entries.
    n = min(n, probs.shape[0])
    ind = np.argpartition(probs, -n)[-n:]

    # Renormalise in float64 so the weights sum to 1 as closely as possible.
    top_prob = probs[ind].astype(np.float64)
    top_prob = top_prob / np.sum(top_prob)

    choice = np.random.choice(n, 1, p=top_prob)
    return int(ind[choice][0])

In [7]:
device

'cuda'

## Load Real Data and Label Them


In [8]:
# Read the patent table, then keep just its abstracts: rename the column to
# "Abstract", tag every row with Label 0, and drop rows without an abstract.
data = pd.read_csv("./Models/Data/H01L.csv")
real_data = (
    data[["abstract"]]
    .rename(columns={"abstract": "Abstract"})
    .assign(Label=0)
    .dropna(subset=["Abstract"])
)

In [9]:
real_data

Unnamed: 0,Abstract,Label
0,An organic light emitting diode display includ...,0
1,Provided is an electric element cooling module...,0
2,An organic light emitting diode display includ...,0
3,Provided is an electric element cooling module...,0
4,An organic light emitting diode display includ...,0
...,...,...
8942,A method including patterning a continuous fin...,0
8943,A method including patterning a continuous fin...,0
8944,A method including patterning a continuous fin...,0
8945,A method including patterning a continuous fin...,0


## Generate Fake Data and Label Them

In [None]:
# How many fake claims do you want to generate
# num_fake_claims = len(real_data)
# prompts = grab_random_prompts(data, num_fake_claims, first_words=5)
# fake_claims = []
# for i, prompt in enumerate(prompts):
#     print(f"Generating Prompt {i+1} ... Prompt: {prompt}")
#     generated_text = generate(prompt, n=8)
#     fake_claims.append(generated_text)
  

In [None]:
# write in chunks, and write to csv
#
# This code block is responsible for generating fake patent abstracts.
# I first define the file paths where we will save the fake patent data.
# Then, I generate the fake patent prompts by sampling from the real abstracts
# and save the prompts into a csv file.
# Since I am running the generation on Colab and I'm generating thousands of fake patent abstracts,
# there is a high frequency of interruption. I therefore need to save the prompts (to be accessed later)
# and save the patent abstracts for every 100 that are generated.
# The last_interupted variable stores where the generation was last interrupted. It is taken
# from the number of abstracts already saved, so re-running this cell resumes at the right
# prompt without skipping or duplicating any.
import os.path
fake_data_file = "./Models/Experiments/Abstracts/20epochs/fake_patent_abstracts.csv"
prompt_file = "./Models/Experiments/Abstracts/20epochs/prompts.csv"
save_every = 100  # number of generated abstracts per write to fake_data_file
num_fake_claims = len(real_data)
if os.path.exists(prompt_file):
    prompts = pd.read_csv(prompt_file)
    prompts = prompts['Prompts'].tolist()
else:
    prompts = grab_random_prompts(data, num_fake_claims, first_words=5, column='abstract')
    # save prompts
    prompt_df = pd.DataFrame({"Prompts": prompts})
    prompt_df.to_csv(prompt_file)


def _has_saved_abstracts():
    '''Return True when fake_data_file exists and is not empty.'''
    return os.path.exists(fake_data_file) and os.path.getsize(fake_data_file) > 0


def _append_fake_abstracts(batch):
    '''
    Append one batch of generated abstracts (labelled 1) to fake_data_file.

    The header row is written only when the file is being created, so the CSV
    always has exactly one header no matter which batch is written first.
    The DataFrame index is still written as the first column to keep the same
    column layout as batches saved earlier.
    '''
    if not batch:
        return
    df = pd.DataFrame({"Abstract": batch, "Label": 1})
    if _has_saved_abstracts():  # this means this csv is already created
        df.to_csv(fake_data_file, mode='a', header=False)
    else:
        df.to_csv(fake_data_file)


# Resume after the abstracts that are already on disk (0 on a fresh start).
last_interupted = len(pd.read_csv(fake_data_file)) if _has_saved_abstracts() else 0

fake_abstracts = []
for i, prompt in enumerate(prompts[last_interupted:]):
    print(f"Generating Prompt {i+1} ... Prompt: {prompt}")
    generated_text = generate(prompt, n=8)
    fake_abstracts.append(generated_text)
    if len(fake_abstracts) == save_every:  # for every 100 abstracts
        _append_fake_abstracts(fake_abstracts)
        fake_abstracts = []
# last batch (that doesn't go over 100)
_append_fake_abstracts(fake_abstracts)

Generating Prompt 1 ... Prompt: A method of fabricating a
Generating Prompt 2 ... Prompt: To provide a light-emitting element
Generating Prompt 3 ... Prompt: A light emitting diode (LED)
Generating Prompt 4 ... Prompt: A device includes a substrate
Generating Prompt 5 ... Prompt: The present invention relates to
Generating Prompt 6 ... Prompt: A silicon carbide semiconductor device,
Generating Prompt 7 ... Prompt: The present invention discloses a
Generating Prompt 8 ... Prompt: An assembly useable with a
Generating Prompt 9 ... Prompt: Disclosed are a backplane substrate,
Generating Prompt 10 ... Prompt: One aspect of the present
Generating Prompt 11 ... Prompt: The present application discloses a
Generating Prompt 12 ... Prompt: An organic light emitting device
Generating Prompt 13 ... Prompt: A stacked-die oscillator package includes
Generating Prompt 14 ... Prompt: An electronic apparatus that is
Generating Prompt 15 ... Prompt: According to example embodiments, a
Generating Prompt

In [10]:
# Load the generated abstracts and discard every column whose name starts with
# "Unnamed" (the index columns that were written alongside the data).
fake_data = (
    pd.read_csv("./Models/Experiments/Abstracts/20epochs/fake_patent_abstracts.csv")
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)
fake_data

Unnamed: 0,Abstract,Label
0,Vacuum annealing-based techniques for forming ...,1
1,A method for manufacturing a light emitting un...,1
2,Stacked semiconductor die assemblies with impr...,1
3,The present disclosure provides a biological f...,1
4,"According to one example, a device includes a ...",1
...,...,...
5095,An organic light-emitting diode may have trans...,1
5096,A deposition system and method includes a depo...,1
5097,A semiconductor package includes a semiconduct...,1
5098,A flexible substrate including a display area ...,1


In [11]:
len(fake_data)

5100

In [None]:
fake_prompts = pd.read_csv("./Models/Experiments/Abstracts/20epochs/prompts.csv")

In [None]:
fake_data['Abstract'][0]

'A solid-state image pickup device      includes a first semiconductor layer including a first photoelectric      converter, a second semiconductor layer including a second photoelectric      converter, and an intermediate layer between the first photoelectric      converter and the second photoelectric converter, and a pad electrode electrically      connects the first semiconductor layer to the second photoelectric      converter through the intermediate layer. A first resistive layer is over the      first semiconductor layer and a second resistive layer is under the second      semiconductor layer. A gate insulator and/or gate conductor coaxially      wraps completely around the first resistive layer and the second resistive      layer when it is in contact with the pad electrode. <EOS>\n<BOS> A light-emitting element having high external quantum efficiency is      provided. A light-emitting element having a long lifetime is provided. A      light-emitting element is provided which

In [None]:
fake_prompts['Prompts'][0]

'Protective dielectrics are discussed generally'

In [None]:
# Stack the real abstracts and the generated abstracts into a single frame.
frames = [real_data, fake_data]
full_data = pd.concat(frames)

# Shuffle all rows so the two sources are interleaved, then renumber from 0.
# The fixed random_state makes the shuffle, and the CSV saved from it,
# identical on every run instead of changing with each kernel restart.
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
full_data


Unnamed: 0,Abstract,Label
0,An imaging element includes a laminated plural...,0
1,A data reading error is reduced. A memory cell...,0
2,The present disclosure describes additives tha...,1
3,Provided are a pickup sensor and a biological ...,0
4,A display apparatus includes a first display a...,0
...,...,...
17889,"A semiconductor device includes a substrate, a...",0
17890,Low temperature epitaxial silicon deposition f...,0
17891,A photoelectric conversion element and an opti...,0
17892,The present disclosure discloses a light emitt...,0


'Embodiments of the present disclosure include methods and      configurations for an image sensor capable of simultaneous integration of      electrons and holes. According to one example, the image sensor can process      signals emitted from holes split into mutually different sub-lenses that are      indistinguishable at multiple wavelengths. <EOS>\n<BOS> A semiconductor device is provided. The semiconductor device includes a      carrier. The carrier includes a first region. The first region includes      holes. The carrier further includes an oxygen exchange layer (OEL) between      the first region and the OEL. The oxygen exchange layer includes a first portion      of a trench that extends in a first direction crossing the horizontal      direction, and a second portion that is in a second direction crossing the      horizontal direction and extending in a third direction crossing the      horizontal direction. A distance between an edge of the first region and an edge      of 

In [None]:
# Save the combined real + fake dataset. No `index=False` is passed, so the
# DataFrame index is written as the first, unnamed CSV column.
full_data.to_csv("./Models/Experiments/Abstracts/20epochs/real_fake_abstracts.csv")