In [None]:
'''
Text Preprocessing for the patent classification task.
Uses the Patent Official Gazette HTML files (https://bulkdata.uspto.gov/data/patent/officialgazette/2021/) rather than the XML format.
Download one or more of the zip files; I used e-OG20210302_1484-1.zip.
A CSV file is generated containing the text of each patent as well as its top-level classification.
'''

In [2]:
from bs4 import BeautifulSoup, Doctype
import os
import pandas as pd
import re

In [3]:
## Processing all the patents in individual html files

In [4]:
# Accumulator for the scraped [page text, identifier] pairs, one entry per HTML file.
raw = list()
# Folder holding the individual patent pages of the unpacked Official Gazette issue.
# NOTE(review): machine-specific absolute path; the notebook header names
# e-OG20210302_1484-1.zip while this points at e-OG20210223_1483-4 -- confirm which issue is intended.
directory = 'C:/Users/liangx36/Downloads/e-OG20210223_1483-4/1483-4/OG/html/'

In [5]:
# Collect the visible text of every .html page found under `directory`.
# `raw` is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every document.
raw = []
for root, dirnames, filenames in os.walk(directory):
    for filename in filenames:
        if not filename.endswith('.html'):
            continue
        fname = os.path.join(root, filename)
        # Read bytes and let BeautifulSoup detect the page encoding; text-mode
        # open() would decode with the platform's locale encoding instead.
        with open(fname, 'rb') as handle:
            soup = BeautifulSoup(handle.read(), 'html.parser')
        # Every text fragment is followed by one space (including the last one),
        # exactly as the previous string concatenation produced.
        data = ''.join(string + ' ' for string in soup.stripped_strings)
        # Identifier = file-name part before the first '-', without a leading 'US'.
        # str.strip('US') treated 'US' as a character set and would also eat any
        # further U/S characters at either end of the identifier.
        patent_id = filename.split("-")[0]
        if patent_id.startswith('US'):
            patent_id = patent_id[len('US'):]
        raw.append([data, patent_id])

In [6]:
# One row per scraped page: the extracted text and the identifier parsed from its file name.
df = pd.DataFrame(
    data=raw,
    columns=['content', 'filename'],
)

In [7]:
## Processing the reference data from classification folder

In [8]:
# Empty frame fixing the column layout that the per-file reference tables are appended to.
final_ref = pd.DataFrame(
    columns=['Subclass', 'Subgroup', 'Patent', 'IPC'],
)
# Folder with the classification index pages of the same Official Gazette issue.
directory_class = 'C:/Users/liangx36/Downloads/e-OG20210223_1483-4/1483-4/OG/classification/'

In [9]:
# Build the patent -> section lookup from the '*Body.html' classification pages.
# Each page's table becomes one frame; all frames are concatenated once after the
# walk instead of growing `final_ref` inside the loop.
ref_frames = []
for root, dirnames, filenames in os.walk(directory_class):
    for filename in filenames:
        if not filename.endswith('Body.html'):
            continue
        # Skip the USPC pages only. The previous `break` abandoned every remaining
        # file of the folder as soon as one USPC page was met, so the result
        # depended on the (unspecified) order in which os.walk lists files.
        # NOTE(review): assumes the intent was to exclude USPC pages, not to stop early.
        if filename.startswith('uspcClassGroup'):
            continue
        # The section letter (A-H) is taken from the second '_'-separated token of
        # the file name; pages without one cannot be labelled and are skipped.
        name_parts = filename.split("_")
        section = re.search('[A-H]', name_parts[1]) if len(name_parts) > 1 else None
        if section is None:
            continue
        fname = os.path.join(root, filename)
        # Read bytes and let BeautifulSoup detect the page encoding; text-mode
        # open() would decode with the platform's locale encoding instead.
        with open(fname, 'rb') as handle:
            soup = BeautifulSoup(handle.read(), 'html.parser')
        table_body = soup.find('table')
        if table_body is None:
            continue
        # First table row supplies the column names, the remaining rows the data.
        reference = [
            [cell.text for cell in row.find_all('td')]
            for row in table_body.find_all('tr')
        ]
        if not reference:
            continue
        ref = pd.DataFrame(data=reference[1:], columns=reference[0])
        ref['IPC'] = section.group(0)
        ref_frames.append(ref)

# Start from the empty column layout of `final_ref` (dropping rows left over from a
# previous run of this cell) so that re-running does not duplicate the reference data.
final_ref = pd.concat([final_ref.iloc[:0], *ref_frames])

In [10]:
# Give the identifier column the same name as the 'Patent' key of the reference table.
df = df.rename({"filename": "Patent"}, axis="columns")

In [11]:
# Keep only patents present on both sides: scraped text and classification reference.
final_df = df.merge(final_ref, on="Patent", how="inner")

In [12]:
# Reduce the merged frame to the page text and its section label.
columns_to_keep = ['content', 'IPC']
final_df = final_df.loc[:, columns_to_keep]

In [13]:
# Write the text/label pairs to disk, leaving out the index column.
final_df.to_csv(path_or_buf='final_dataset.csv', index=False)

In [14]:
#### Some descriptive statistics for the dataset

In [15]:
## Average length (characters per scraped page in `df`)
# Computed with the vectorised string accessor so that the built-in `sum` is no
# longer shadowed by a notebook global, and so the result does not rely on the
# frame's index labels being 0..n-1 (df.content[i] looked rows up by label).
# float() keeps the displayed value a plain Python float; an empty frame yields nan
# instead of a ZeroDivisionError.
float(df.content.str.len().mean())

2045.5860727728984