In [1]:
# run the cleaning script to clean and format the raw xml data files
# NOTE(review): presumably the script writes its output to ../clean_data/,
# which a later cell reads from -- confirm in xml_cleaner.py

%run "../scripts/xml_cleaner.py"

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

# custom function for extracting data from xml and saving to pandas DataFrame
def patent_xml2df(xml_data):
    """Extract selected bibliographic fields from patent XML into a DataFrame.

    Each child of the XML root is treated as one patent, and the first child
    of that patent element is scanned for the fields listed below. One row is
    produced per patent; a field that is absent from the XML is simply missing
    from that row (NaN/None in the resulting frame).

    Columns produced (when present in the XML):
        grant_date, country, doc-number   -- first child of 'publication-reference'
        application_type                  -- 'appl-type' attribute of
                                             'application-reference'
        filing_date, application_number   -- first child of 'application-reference'
        one column per child of 'figures' -- named after the child's tag
        number-of-claims, invention-title

    Parameters
    ----------
    xml_data : str or bytes
        A complete XML document.

    Returns
    -------
    pandas.DataFrame
        One row per patent. Values are the raw strings found in the XML.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``xml_data`` is not well-formed XML.
    """
    root = ET.XML(xml_data)
    all_records = []
    for patent in root:
        record = {}
        # a childless patent element has no data to scan; it still yields a
        # row (with every field missing) instead of raising IndexError
        bibliographic_data = patent[0] if len(patent) else ()
        for element in bibliographic_data:
            if element.tag == 'publication-reference':
                # the fields live in the first child of the reference element;
                # skip the lookup when that child is absent
                for item in (element[0] if len(element) else ()):
                    if item.tag == 'date':
                        record['grant_date'] = item.text
                    elif item.tag in ('country', 'doc-number'):
                        record[item.tag] = item.text
            elif element.tag == 'application-reference':
                record['application_type'] = element.get('appl-type')
                for item in (element[0] if len(element) else ()):
                    if item.tag == 'date':
                        record['filing_date'] = item.text
                    elif item.tag == 'doc-number':
                        record['application_number'] = item.text
            elif element.tag == 'figures':
                for item in element:
                    record[item.tag] = item.text
            elif element.tag == 'invention-title':
                # .text only returns the text before the first nested tag, so
                # a title with inline markup (e.g. <i>...</i>) would be
                # truncated or None; itertext() collects the whole title.
                # An empty title is kept as None, as .text would report it.
                title = ''.join(element.itertext())
                record[element.tag] = title or None
            elif element.tag == 'number-of-claims':
                record[element.tag] = element.text
        all_records.append(record)
    return pd.DataFrame(all_records)

In [3]:
import glob

# use globbing to get all names of cleaned xml file paths;
# glob returns paths in arbitrary order, so sort them to keep the row order
# of the combined DataFrame reproducible between runs
clean_files = sorted(glob.glob('../clean_data/*'))


# loop over file names, extract data and concatenate into single DataFrame
xml_data_list = []
for clean_file in clean_files:
    # the context manager closes each file as soon as it has been read
    with open(clean_file) as xml_file:
        data = xml_file.read()
    df = patent_xml2df(data)
    xml_data_list.append(df)

# ignore_index=True renumbers the rows 0..n-1 across all files
patents_df = pd.concat(xml_data_list, ignore_index=True)
    

In [4]:
# convert both date columns (filing first, then grant) to datetime objects
for date_column in ('filing_date', 'grant_date'):
    patents_df[date_column] = pd.to_datetime(patents_df[date_column])

In [5]:
# summarize the combined frame: row count, per-column non-null counts and dtypes
patents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37764 entries, 0 to 37763
Data columns (total 10 columns):
application_number          37764 non-null object
application_type            37764 non-null object
country                     37764 non-null object
doc-number                  37764 non-null object
filing_date                 37764 non-null datetime64[ns]
grant_date                  37764 non-null datetime64[ns]
invention-title             37617 non-null object
number-of-claims            37764 non-null object
number-of-drawing-sheets    36501 non-null object
number-of-figures           36501 non-null object
dtypes: datetime64[ns](2), object(8)
memory usage: 2.9+ MB
