In [10]:
import os

import numpy as np
import pandas as pd
import spacy
import torch
from torchtext import data
from torchtext.data import Field, BucketIterator


In [11]:
# Read the raw patent table from the CSV one directory above the notebook,
# then show its first five rows as the cell output.
patent_data = pd.read_csv(filepath_or_buffer='../H04L.csv')
patent_data.head(n=5)

Unnamed: 0,uid,url,claims,abstract,description,collection_time,write_date
0,10111,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,"9. The wrist-watch according to claim 8, wher...",One embodiment of the present invention provid...,TECHNICAL FIELD The present invention relate...,2020-08-03 08:59:43.554478,2020-08-03 08:59:43.486725
1,10088,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,"9. The wrist-watch according to claim 8, wher...",One embodiment of the present invention provid...,TECHNICAL FIELD The present invention relate...,2020-08-03 08:59:42.786863,2020-08-03 08:59:42.714765
2,713,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,9. The wiring layer laminate for a semiconduc...,One aspect of the present invention relates to...,CROSS-REFERENCE TO RELATED APPLICATIONS This...,2020-03-03 05:39:31.535402,2020-03-02 18:39:31.455487
3,695,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,9. The wiring layer laminate for a semiconduc...,One aspect of the present invention relates to...,CROSS-REFERENCE TO RELATED APPLICATIONS This...,2020-03-03 05:39:28.670518,2020-03-02 18:39:28.547115
4,13246,http://patft.uspto.gov//netacgi/nph-Parser?Sec...,9. The wafer support structure as recited in ...,An Electrostatic Chuck (ESC) in a chamber of a...,BACKGROUND 1. Field of the Invention The pr...,2020-08-03 09:28:07.124295,2020-08-03 09:28:07.076907


In [12]:
# Split a dataset into train, validation and test subsets.
# https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def train_valid_test_split(data, train, valid, random_state=None):
    '''
    Shuffle a pandas DataFrame and cut it into train, validation and test parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to split. It is not modified; the returned parts are slices of
        a shuffled copy and keep the original index labels.
    train : float
        Fraction of the rows (between 0 and 1) that goes into the training set.
    valid : float
        Fraction of the rows (between 0 and 1) that goes into the validation
        set. The remaining ``1 - train - valid`` becomes the test set.
    random_state : optional
        Forwarded unchanged to ``DataFrame.sample``. Pass an int to get the
        same shuffle on every call; the default ``None`` keeps the previous,
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(train_d, validate_d, test_d)`` DataFrames.

    Raises
    ------
    AssertionError
        If a fraction is negative or the two fractions sum to more than 1.
    '''
    assert train >= 0 and valid >= 0, "Train and validation proportions must not be negative"
    # Small tolerance so that fractions whose float sum lands a hair above 1 are still accepted.
    assert (train + valid) <= 1 + 1e-9, "Train and validation proportions must not add up to more than 1"

    n_rows = len(data)
    # Round before truncating: (0.7 + 0.2) * 100 evaluates to 89.99999999999999,
    # which a bare int() would cut down to 89 and shift one row into the test set.
    train_end = int(round(train * n_rows, 9))
    valid_end = min(int(round((train + valid) * n_rows, 9)), n_rows)

    shuffled = data.sample(frac=1, random_state=random_state)
    # Positional slicing instead of np.split, which goes through the deprecated
    # DataFrame.swapaxes code path on recent pandas versions.
    train_d = shuffled.iloc[:train_end]
    validate_d = shuffled.iloc[train_end:valid_end]
    test_d = shuffled.iloc[valid_end:]

    # Multiply first and round afterwards so float noise (e.g. 0.29 * 100) is not printed.
    test = max(1 - train - valid, 0)
    print(f"Proportion: \nTrain: {round(train*100, 2)}% \nValidation: {round(valid*100, 2)}%\nTest:{round(test*100, 2)}%")
    print(f"Total: \nTrain: {len(train_d)} \nValidation: {len(validate_d)}\nTest:{len(test_d)}")
    return train_d, validate_d, test_d
    

In [13]:
# The split shuffles the rows with DataFrame.sample and no explicit random_state,
# so it draws from NumPy's global generator. Seed it here so that the
# train/validation/test partition is the same on every run of the notebook.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
train_data, validate_data, test_data = train_valid_test_split(patent_data, 0.7, 0.2)

Proportion: 
Train: 70.0% 
Validation: 20.0%
Test:10.0%
Total: 
Train: 12812 
Validation: 3661
Test:1831


In [16]:
data_path = "../processed_datasets"
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
os.makedirs(data_path, exist_ok=True)
# NOTE(review): to_csv writes the DataFrame index as the first column by default;
# pass index=False if downstream readers should not see that extra column.
train_data.to_csv(data_path+'/train.csv')
validate_data.to_csv(data_path+'/validate.csv')
test_data.to_csv(data_path+'/test.csv')

In [6]:
# Load the English spaCy pipeline. The bare 'en' shortcut link was removed in
# spaCy v3, so try the full package name first and fall back to the legacy
# shortcut for older installations that only have the link.
try:
    spacy_eng = spacy.load('en_core_web_sm')
except OSError:
    spacy_eng = spacy.load('en')

In [7]:
def tokenizer_eng(text):
    """Return the text of every token the ``spacy_eng`` tokenizer yields for ``text``."""
    doc = spacy_eng.tokenizer(text)
    return [token.text for token in doc]

In [8]:
# Both text columns are configured identically: tokenised with tokenizer_eng,
# lower-cased, and given '<sos>' / '<eos>' as their init and end-of-sequence tokens.
text_field_options = dict(
    tokenize=tokenizer_eng,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)
abstract = Field(**text_field_options)
claims = Field(**text_field_options)