In [None]:
## V2: handles parsing of large files (1 GB) without overloading memory.

In [2]:
## Libraries
import glob
from pathlib import Path

# - to extract gz files
import gzip
import json

# - to convert json to dataframe
import pandas as pd

In [30]:
# Four independent accumulators, one per output table (patent records,
# CPC classifications, applicants, inventors). Each starts out empty.
patents_data, patents_classifications, patents_applicants, patents_inventors = [], [], [], []

In [33]:
# Glob pattern for the raw gzip-compressed patent exports.
# Forward slashes are used because they are accepted on Windows as well as
# POSIX; a backslash-separated pattern only matches anything on Windows.
path = '../data/raw/patents/*.gz'

# glob returns matches in arbitrary order; sort so files are processed in a
# reproducible sequence.
files = sorted(glob.glob(path))

In [34]:
#filename = "us-patent-2023-05-01-2023-06-25"
#filename = "us-patent-20230101-20230225"
#filename = "us-patent-20230226-20230430"
#filename = "us-patent-2023-06-26-2023-07-31"

#filename = 'us-patent-2022-03-01-2022-04-18' 2
#filename = 'us-patent-2022-01-01-2022-02-28' 1
#filename = 'us-patent-2022-09-17-2022-11-02'
#filename = 'us-patent-2022-04-19-2022-06-03'
#filename = 'us-patent-2022-06-04-2022-07-31'

#ext = ".jsonl.gz"
#folder = "../data/raw/patents/"

def _party_rows(patent, role):
    """Build one output row per party of the given role.

    Parameters
    ----------
    patent : dict
        One decoded JSON record; must contain 'lens_id', 'doc_key' and 'biblio'.
    role : str
        Key inside ``patent['biblio']['parties']`` ('applicants' or 'inventors').

    Returns
    -------
    list of dict
        Rows with 'lens_id', 'patent_id', 'residence' ('NA' when absent) and
        'name'. Empty when the record has no 'parties' section or no entry
        for ``role``, so a missing list never aborts the whole file.
    """
    parties = patent['biblio'].get('parties') or {}
    return [
        {
            'lens_id': patent['lens_id'],
            'patent_id': patent['doc_key'],
            'residence': party.get('residence', 'NA'),
            'name': party['extracted_name']['value'],
        }
        for party in parties.get(role) or []
    ]


# Output folder for the per-file partial datasets (loop-invariant).
# Create it up front so the first to_parquet call cannot fail on a fresh checkout.
path = "../data/processed/partial/"
Path(path).mkdir(parents=True, exist_ok=True)

for file in files:
    # Accumulators are rebuilt per input file so that only one file's worth
    # of records is held in memory at a time.
    patents_data = []
    patents_classifications = []
    patents_applicants = []
    patents_inventors = []

    # Decode as UTF-8: decoding as ASCII with errors="ignore" silently dropped
    # every non-ASCII character (accented names, symbols in titles/abstracts).
    # errors="replace" keeps the read best-effort on malformed bytes.
    with gzip.open(file, 'rt', encoding="utf-8", errors="replace") as f:
        # Stream the file one JSON line at a time instead of loading it whole.
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)

            patent = json.loads(line)
            biblio = patent['biblio']

            # Skip records without a title (missing, None, or an empty list).
            titles = biblio.get('invention_title')
            if not titles:
                continue

            # 'na' marks a missing abstract; also covers a None/empty value.
            abstracts = patent.get('abstract')
            abstract = abstracts[0]['text'] if abstracts else 'na'

            patents_data.append({
                'lens_id': patent['lens_id'],
                'jurisdiction': patent['jurisdiction'],
                'patent_id': patent['doc_key'],
                'date_published': patent['date_published'],
                'title': titles[0]['text'],
                'abstract': abstract
            })

            patents_applicants.extend(_party_rows(patent, 'applicants'))
            patents_inventors.extend(_party_rows(patent, 'inventors'))

            # One row per CPC classification symbol, when any are present.
            classifications_cpc = biblio.get('classifications_cpc') or {}
            for classification in classifications_cpc.get('classifications') or []:
                patents_classifications.append({
                    'lens_id': patent['lens_id'],
                    'patent_id': patent['doc_key'],
                    'classification': classification['symbol']
                })

    ## save data to parquet
    # Strip both extensions: "name.jsonl.gz" -> "name".
    filename = Path(Path(file).stem).stem

    pd.DataFrame(patents_data).to_parquet(path + filename + "_data.parquet", index=False)
    pd.DataFrame(patents_classifications).to_parquet(path + filename + "_classifications.parquet", index=False)
    pd.DataFrame(patents_applicants).to_parquet(path + filename + "_applicants.parquet", index=False)
    pd.DataFrame(patents_inventors).to_parquet(path + filename + "_inventors.parquet", index=False)

    # Drop the record lists before moving to the next file. The per-line
    # `del` statements were removed: `del(line)` raised NameError whenever an
    # input file contained no lines.
    del patents_data, patents_classifications, patents_applicants, patents_inventors


In [4]:
## Merge partial files into master datasets
# Master Datasets:
# data/processed/patents_applicants.parquet
# data/processed/patents_classifications.parquet
# data/processed/patents_data.parquet
# data/processed/patents_inventors.parquet

# Glob patterns selecting the per-file partial outputs for each master dataset.
# Forward slashes are accepted on Windows as well as POSIX; a backslash-separated
# pattern matches nothing on POSIX, which would yield empty master files.
data = '../data/processed/partial/*_data.parquet'
classifications = '../data/processed/partial/*_classifications.parquet'
inventors = '../data/processed/partial/*_inventors.parquet'
applicants = '../data/processed/partial/*_applicants.parquet'


In [5]:
# Merge every partial "*_data.parquet" file into the master patents dataset.
# Sorted so the row order of the master file is reproducible across runs.
files = sorted(glob.glob(data))
path = "../data/processed/"

# Read all partials, then concatenate once. Calling pd.concat inside the loop
# re-copies the accumulated frame for every file (quadratic in total rows).
frames = [pd.read_parquet(f) for f in files]
# pd.concat raises on an empty list, so fall back to an empty frame.
df_patent = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df_patent.to_parquet(path + "patents_data.parquet", index=False)
del frames, df_patent

In [None]:
# Merge every partial "*_classifications.parquet" file into the master dataset.
# Sorted so the row order of the master file is reproducible across runs.
files = sorted(glob.glob(classifications))
path = "../data/processed/"

# Read all partials, then concatenate once. Calling pd.concat inside the loop
# re-copies the accumulated frame for every file (quadratic in total rows).
frames = [pd.read_parquet(f) for f in files]
# pd.concat raises on an empty list, so fall back to an empty frame.
df_classifications = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df_classifications.to_parquet(path + "patents_classifications.parquet", index=False)
del frames, df_classifications

In [None]:
# Merge every partial "*_inventors.parquet" file into the master dataset.
# Sorted so the row order of the master file is reproducible across runs.
files = sorted(glob.glob(inventors))
path = "../data/processed/"

# Read all partials, then concatenate once. Calling pd.concat inside the loop
# re-copies the accumulated frame for every file (quadratic in total rows).
frames = [pd.read_parquet(f) for f in files]
# pd.concat raises on an empty list, so fall back to an empty frame.
df_inventors = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df_inventors.to_parquet(path + "patents_inventors.parquet", index=False)
del frames, df_inventors

In [None]:
# Merge every partial "*_applicants.parquet" file into the master dataset.
# Sorted so the row order of the master file is reproducible across runs.
files = sorted(glob.glob(applicants))
path = "../data/processed/"

# Read all partials, then concatenate once. Calling pd.concat inside the loop
# re-copies the accumulated frame for every file (quadratic in total rows).
frames = [pd.read_parquet(f) for f in files]
# pd.concat raises on an empty list, so fall back to an empty frame.
df_applicants = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df_applicants.to_parquet(path + "patents_applicants.parquet", index=False)
del frames, df_applicants

In [1]:
import pandas as pd

# Spot-check the merged master dataset. As the cell's final expression, the
# loaded frame is rendered by the notebook.
master_data_path = '../data/processed/patents_data.parquet'
pd.read_parquet(master_data_path)

Unnamed: 0,lens_id,jurisdiction,patent_id,date_published,title,abstract
0,005-139-527-315-761,AU,AU_2016369371_B2_20201224,2020-12-24,"Methods for the preparation of 1,3-benzodioxol...",The present invention relates to novel methods...
1,010-652-277-726-485,AU,AU_2015411377_B2_20201224,2020-12-24,Automatic blade control system for a motor grader,Disclosed is a method for automatically contro...
2,018-407-500-475-466,AU,AU_2019275507_B2_20201224,2020-12-24,PROTOCOL CONVERSION SYSTEM AND METHOD FOR A VE...,A method and system for converting protocols o...
3,031-647-115-365-483,AU,AU_2016204645_B2_20201224,2020-12-24,Furrow-opening tyre,Abstract Furrow-opening tyre A tyre (1) for a ...
4,028-391-194-640-273,AU,AU_2016348556_B2_20201224,2020-12-24,Submersible provided with means for manoeuvrin...,The invention relates to a submersible of the ...
...,...,...,...,...,...,...
1235696,196-336-563-801-248,US,US_11685430_B2_20230627,2023-06-27,Motor control device,A motor controlling ECU 202 includes an automa...
1235697,193-772-626-460-623,US,US_11687509_B2_20230627,2023-06-27,Computer implemented method for creating datab...,Computer implemented methods for storing ad ho...
1235698,193-177-759-894-058,US,US_11687006_B2_20230627,2023-06-27,Method of manufacturing photo masks,In a method of manufacturing a photo mask for ...
1235699,198-190-492-798-136,US,US_11690277_B2_20230627,2023-06-27,Method of p-type doping carbon nanotube,A method of p-type doping a carbon nanotube in...
