In [1]:
from google.cloud import storage
import os
import pandas as pd 
from google.cloud import bigquery
import sys 
import tempfile
# Google Cloud project id; passed as `project_id` to the pd.read_gbq call that runs the citation query.
PROJECT_ID = "patents-research-275923"
# Bucket name (presumably a Cloud Storage bucket, given the `storage` import — TODO confirm);
# not referenced by any of the cells visible in this part of the notebook.
BUCKET_NAME = "patents-research"

In [3]:
# Citation query (run with dialect='standard' via pd.read_gbq).
#
# Inner subquery `filteredIDs`: distinct (patent_id, date) pairs from the
# `application` table joined to `cpc_current` on patent_id, restricted to rows
# whose CPC subsection_id or group_id is in the lists below. Note that the
# application's `date` column is exposed under the alias `publication_date`.
#
# Outer query: joins `filteredIDs` to `uspatentcitation` on patent_id and keeps
# only citations whose `date` is strictly earlier than that aliased date.
# Output columns are (patent_id, citation_id), ordered by patent_id and then
# citation_id.
QUERY_CITATION = """
SELECT filteredIDs.patent_id AS patent_id, citation.citation_id
FROM
  `patents-public-data.patentsview.uspatentcitation` AS citation,
  (SELECT DISTINCT apps.patent_id, apps.date as publication_date
  FROM
    `patents-public-data.patentsview.application` AS apps,
    `patents-public-data.patentsview.cpc_current` AS cpc
  WHERE
    apps.patent_id=cpc.patent_id
    AND (cpc.subsection_id IN ('C05', 'C07', 'C08', 'C09', 'C11', 'C12', 'C13', 'C25', 'C40')
          OR cpc.group_id in ('A01G', 'A01H', 'A61K', 'A61P', 'A61Q', 'B01F', 'B01J', 'B81B', 'B82B', 'B82Y','G01N', 'G16H'))) AS filteredIDs
WHERE
  filteredIDs.patent_id = citation.patent_id AND citation.date < filteredIDs.publication_date
ORDER BY
  patent_id,
  citation.citation_id
"""

def load_data_from_file(df, dataset_id = "patents.backcited"):
    """Upload a DataFrame to a BigQuery table via a temporary CSV file.

    The frame is written (without its index) to a CSV inside a temporary
    directory and submitted as a load job that replaces any existing contents
    of the destination table. The function blocks until the job finishes and
    then prints the number of rows loaded.

    Args:
        df: pandas DataFrame to upload.
        dataset_id: Destination table id, despite the parameter name. It is
            handed unchanged to ``Client.load_table_from_file`` (e.g.
            ``"dataset.table"``).

    Raises:
        Whatever ``job.result()`` raises if the load job fails.
    """
    bigquery_client = bigquery.Client()

    job_config = bigquery.LoadJobConfig()
    # 'text/csv' is a MIME type, not a BigQuery source format; the API value is "CSV".
    job_config.source_format = bigquery.SourceFormat.CSV
    # df.to_csv always writes a header line. Skipping it explicitly keeps the
    # column names from being loaded as a data row when schema auto-detection
    # cannot tell the header apart from the data (e.g. all-string columns).
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Use a private temp directory instead of reopening a still-open
    # NamedTemporaryFile by name, which is not possible on Windows.
    with tempfile.TemporaryDirectory() as temp_dir:
        csv_path = os.path.join(temp_dir, "upload.csv")
        print("creating temp csv at {} ".format(csv_path))
        df.to_csv(csv_path, index=False)
        print("temp csv created")

        with open(csv_path, 'rb') as source_file:
            job = bigquery_client.load_table_from_file(source_file, dataset_id, job_config=job_config)

            job.result()  # Waits for job to complete

    print('Loaded {} rows into {}.'.format(job.output_rows, dataset_id))
    
    

In [None]:
# Run the citation query in BigQuery and materialise every result row as a DataFrame.
df = pd.read_gbq(
    QUERY_CITATION,
    project_id=PROJECT_ID,
    dialect="standard",
    progress_bar_type="tqdm",
)

In [None]:
# Group the citation rows once by citing patent and derive both summaries from it.
citations_by_patent = df.groupby("patent_id")

# Number of citation rows per patent.
dfcount = citations_by_patent["patent_id"].count().reset_index(name="count")

# All citation ids of a patent joined into one space-separated string.
df_ids = citations_by_patent["citation_id"].apply(" ".join).reset_index(name="citations")

In [None]:
# Align the per-patent count and citation string on patent_id, then fold both
# into a single token-delimited `backcited` column. The final frame has exactly
# two columns: patent_id and backcited.
result = (
    pd.concat(
        [dfcount.set_index("patent_id"), df_ids.set_index("patent_id")],
        axis=1,
        join="inner",
    )
    .reset_index(level=0)
    .assign(
        backcited=lambda merged: (
            "<start -backcitedcount>"
            + merged["count"].astype(str)
            + "<end -backcitedcount>"
            + "<start -backcited>"
            + merged["citations"]
            + "<end -backcited>"
        )
    )
    .drop(columns=["count", "citations"])
)


In [4]:
# Upload the formatted frame to load_data_from_file's default destination.
# `result` is built by the merge/format cell above, so that cell and the cells
# it depends on must be executed first; run on its own, this call raises
# NameError because `result` does not exist yet.
load_data_from_file(result)

NameError: name 'result' is not defined