## This notebook sets out how to read files from google drive ##

In [None]:
## Install required libraries
!pip install pydrive

In [1]:
# Import libraries
# - standard library: to extract gz files, parse json and create local folders
import gzip
import json
import os

# - to convert json to dataframe
import pandas as pd

# - to read google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

### Authenticate session using Google Drive API credentials

In [3]:
## Before running: follow the steps outlined in
## https://pythonhosted.org/PyDrive/quickstart.html#authentication to set up client_secrets.json,
## then save the client_secrets.json file next to this notebook
## (NOTE(review): presumably the kernel's working directory is where it is looked up - confirm).

## Run to authenticate. The resulting `gauth` object is used below to create the GoogleDrive client.

gauth = GoogleAuth()
gauth.LocalWebserverAuth() # Creates local webserver and auto handles authentication.

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=1010998970874-55ji152nek39gamaen2di7dfhf1s16ms.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


#### Example 1: How to retrieve a list of files from a named folder

In [4]:
## Code to list the title and ID of every file in a Drive folder.
## The folder is identified by its ID: in this case, the folder data/raw/patents = '1JTzjr6SByYjwOZocX6pkVuUC3vYfkk7Q'
PATENTS_FOLDER_ID = '1JTzjr6SByYjwOZocX6pkVuUC3vYfkk7Q'  # change this to list a different folder

drive = GoogleDrive(gauth) # Create GoogleDrive instance with authenticated GoogleAuth instance

# 'trashed=false' keeps files that sit in the Drive bin out of the listing;
# without it, deleted files are returned alongside the live ones.
folder_query = "'%s' in parents and trashed=false" % PATENTS_FOLDER_ID

file_list = drive.ListFile({'q': folder_query}).GetList()
for file in file_list:
    print('title: %s, id: %s' % (file['title'], file['id']))

title: 2020-us-patent.jsonl.gz, id: 1Wlesl9kiYSHE80T6TZnY6s3G9MXsib_w
title: 2021-au-patent.jsonl.gz, id: 1mwnh0PvPnovwPl96ULZDulkMQ9RR69__
title: 2023-au-patent.jsonl.gz, id: 1H79jIWowBbYViFhrzgZbBJ9WPD4-o3Dj
title: 2022-us-patent.jsonl.gz, id: 1ss2nzylIczSfF8EUTP5_2HmXwWeZ1hRt
title: 2023-us-patent.jsonl.gz, id: 1awgN_UNVPPJ6DIikuQLW4A8WBKtD5_WA
title: 2021-us-patent.jsonl.gz, id: 18JTQ9wOfytCf-f7ByvSrmF83d-b-MufJ


#### Example 2: How to download a file from google drive

In [5]:
# Take the first entry of file_list and keep its title and ID for the following cells
first_file = file_list[0]
filename = first_file['title']
file_id = first_file['id']

In [None]:
# Initialize GoogleDriveFile instance with file id.
file = drive.CreateFile({'id': file_id})

# Make sure the local target folder exists; the download cannot be written into a missing folder.
download_dir = '../data/raw/'
os.makedirs(download_dir, exist_ok=True)

# Get the content of the file - this will download the file to your local computer.
# GetContentFile saves to disk and has no return value, so nothing is assigned here.
file.GetContentFile(download_dir + filename)

#### Example 3: How to extract gzip file using gzip python package

In [6]:
## Decompress the downloaded gzip file in binary mode, then convert the bytes to a utf8 string.
gz_path = '../data/raw/' + filename
with gzip.open(gz_path, mode='rb') as gz_file:
    file_content = str(gz_file.read(), 'utf8')


#### Example 4: Working with JSON Lines

In [7]:
## In JSON Lines every JSON object sits on its own line, unlike a JSON array.
## Iterating over all records needs a line-by-line reader; for the first record alone
## it is enough to cut the text at the first newline.

first_record = file_content.split('\n', 1)[0]

In [9]:
# Parse the JSON text held in the string 'first_record' using the json package.
# The cells below read fields from the result by key, e.g. patent_json['jurisdiction'].
patent_json = json.loads(first_record)

In [28]:
## Print a few sample fields from the json record
print(patent_json['jurisdiction'])
print(patent_json['abstract'])

## the residence status of the first applicant party
first_applicant = patent_json['biblio']['parties']['applicants'][0]
print(first_applicant['residence'])

US
[{'text': 'A method for improving the ion beam quality in an ion implanter is disclosed. In some ion implantation systems, contaminants from the ion source are extracted with the desired ions, introducing contaminants to the workpiece. These contaminants may be impurities in the ion source chamber. This problem is exacerbated when mass analysis of the extracted ion beam is not performed, and is further exaggerated when the desired feedgas includes a halogen. The introduction of a diluent gas in the ion chamber may reduce the deleterious effects of the halogen on the inner surfaces of the chamber, reducing contaminants in the extracted ion beam. In some embodiments, the diluent gas may be germane or silane.', 'lang': 'en'}]
US


In [16]:
# Display the whole parsed record to inspect its structure
patent_json

{'lens_id': '002-152-403-917-266',
 'jurisdiction': 'US',
 'doc_number': '10804075',
 'kind': 'B2',
 'date_published': '2020-10-13',
 'doc_key': 'US_10804075_B2_20201013',
 'docdb_id': 539082606,
 'lang': 'en',
 'biblio': {'publication_reference': {'jurisdiction': 'US',
   'doc_number': '10804075',
   'kind': 'B2',
   'date': '2020-10-13'},
  'application_reference': {'jurisdiction': 'US',
   'doc_number': '201615350685',
   'kind': 'A',
   'date': '2016-11-14'},
  'priority_claims': {'claims': [{'jurisdiction': 'US',
     'doc_number': '201615350685',
     'kind': 'A',
     'date': '2016-11-14',
     'sequence': 1},
    {'jurisdiction': 'US',
     'doc_number': '201314089916',
     'kind': 'A',
     'date': '2013-11-26',
     'sequence': 2},
    {'jurisdiction': 'US',
     'doc_number': '201361847776',
     'kind': 'P',
     'date': '2013-07-18',
     'sequence': 3}],
   'earliest_claim': {'date': '2013-07-18'}},
  'invention_title': [{'text': 'Method of improving ion beam quality in 