## Query that merges cleaned_claims with all the other data

In [None]:
import os
import sys
import tempfile

import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from tqdm import tqdm

#%load_ext google.colab.data_table

# authenticate self
#from google.colab import auth
#auth.authenticate_user()
#print('Authenticated')



# First version of the merge query. It joins the cleaned claims, patent
# title/abstract and application date (exposed as publication_date) with
# per-patent citation counts, keeps only patents in the listed CPC
# subsections/groups, and keeps only application dates before 2014-01-01.
# NOTE(review): QUERY is not referenced by any cell visible in this notebook
# (the BigQuery call further down runs QUERY2) -- presumably kept for reference;
# confirm before deleting.
# NOTE(review): citation_5 is built from `b.patent_id = f.patent_id`, i.e. an
# inner join of the forward (f) and backward (b) counts, so a patent missing
# from either side falls through the LEFT JOIN and gets 0 for BOTH counts via
# IFNULL -- verify that this is intended.
QUERY="""
SELECT app.patent_id as patent_id, app.date as publication_date, filterData.bkwdCitations, filterData.fwrdCitations_5, patent.title, patent.abstract, claims
FROM
  (SELECT claims, CAST(patent_id AS STRING) as patent_id
   FROM `patents-research-275923.patents.cleaned_claims`) as claims,
  `patents-public-data.patentsview.patent` as patent,
  `patents-public-data.patentsview.application` as app,
  (SELECT DISTINCT cpc.patent_id, IFNULL(citation_5.bkwdCitations, 0) as bkwdCitations, IFNULL(citation_5.fwrdCitations_5, 0) as fwrdCitations_5
  FROM
    `patents-public-data.patentsview.cpc_current` AS cpc
    LEFT JOIN
    (SELECT  b.patent_id, b.bkwdCitations, f.fwrdCitations_5
      FROM 

        (SELECT 
          cited.citation_id as patent_id,
          IFNULL(COUNT(*),0) as fwrdCitations_5
          FROM 
          `patents-public-data.patentsview.uspatentcitation` AS cited,
          `patents-public-data.patentsview.application` AS apps
        WHERE
          apps.country = 'US'
          AND cited.citation_id = apps.patent_id 
          AND cited.date >= apps.date AND SAFE_CAST(cited.date AS DATE) <= DATE_ADD(SAFE_CAST(apps.date AS DATE), INTERVAL 5 YEAR) -- get in 5year interval 
         GROUP BY 
         cited.citation_id) AS f,

       (SELECT 
          cited.patent_id,
          IFNULL(COUNT(*),0) as bkwdCitations
          FROM 
          `patents-public-data.patentsview.uspatentcitation` AS cited,
          `patents-public-data.patentsview.application` AS apps
        WHERE
          apps.country = 'US'
          AND cited.patent_id = apps.patent_id 
          AND cited.date < apps.date -- get all backward citatoin count
         GROUP BY 
         cited.patent_id) AS b
      WHERE
      b.patent_id = f.patent_id AND b.bkwdCitations IS NOT NULL AND f.fwrdCitations_5 IS NOT NULL) AS citation_5 
      ON cpc.patent_id=citation_5.patent_id
      WHERE
       (cpc.subsection_id IN ('C05', 'C07', 'C08', 'C09', 'C11', 'C12', 'C13', 'C25', 'C40')
        OR cpc.group_id in ('A01G', 'A01H', 'A61K', 'A61P', 'A61Q', 'B01F', 'B01J', 'B81B', 'B82B', 'B82Y','G01N', 'G16H')))
  as filterData
  WHERE
  app.patent_id = filterData.patent_id AND app.patent_id = patent.id 
  AND SAFE_CAST(app.date AS DATE) < '2014-01-01' AND claims.patent_id  =app.patent_id
"""

# Query passed to pd.read_gbq further down in this notebook. It uses the same
# claims/patent/application join, CPC filter and pre-2014 cut-off as QUERY, with
# two differences:
#   * forward citations are counted using the citing patent's own date
#     (patent.date) falling within 5 years after the cited application's date;
#   * backward-citation data is taken from a LEFT JOIN against the project's
#     `backcited` table instead of being counted here.
# NOTE(review): the IFNULL default for bkwdCitations is an already-tokenized
# string ("<start -backcitedcount>0<end -backcitedcount>"), so
# backcited.backcited is presumably stored pre-wrapped in the same tokens --
# confirm against the table contents.
QUERY2 = """
SELECT db.patent_id, db.fwrdCitations_5, db.abstract , db.claims , db.title , db.publication_date , IFNULL(backcited.backcited, "<start -backcitedcount>0<end -backcitedcount>")  as bkwdCitations 
FROM (SELECT app.patent_id as patent_id, app.date as publication_date,filterData.fwrdCitations_5, patent.title, patent.abstract, claims
FROM
  
  (SELECT claims, CAST(patent_id AS STRING) as patent_id
   FROM `patents-research-275923.patents.cleaned_claims`) as claims,
  `patents-public-data.patentsview.patent` as patent,
  `patents-public-data.patentsview.application` as app,
  (SELECT DISTINCT cpc.patent_id, IFNULL(citation_5.fwrdCitations_5, 0) as fwrdCitations_5
  FROM
    `patents-public-data.patentsview.cpc_current` AS cpc
    LEFT JOIN
        
        -- This query gets the forward citation count
        (SELECT 
          cited.citation_id as patent_id, IFNULL(COUNT(*),0) as fwrdCitations_5
          FROM 
           `patents-public-data.patentsview.patent` AS patent,
          `patents-public-data.patentsview.uspatentcitation` AS cited,
          `patents-public-data.patentsview.application` AS application
          WHERE
            patent.id = cited.patent_id AND cited.citation_id = application.patent_id
            AND (patent.date >= application.date AND SAFE_CAST(patent.date AS DATE) <= DATE_ADD(SAFE_CAST(application.date AS DATE), INTERVAL 5 YEAR))
          GROUP BY
            cited.citation_id) AS citation_5
         
      ON cpc.patent_id=citation_5.patent_id
      WHERE
       (cpc.subsection_id IN ('C05', 'C07', 'C08', 'C09', 'C11', 'C12', 'C13', 'C25', 'C40')
        OR cpc.group_id in ('A01G', 'A01H', 'A61K', 'A61P', 'A61Q', 'B01F', 'B01J', 'B81B', 'B82B', 'B82Y','G01N', 'G16H')))
  as filterData
  WHERE
  app.patent_id = filterData.patent_id AND app.patent_id = patent.id 
  AND SAFE_CAST(app.date AS DATE) < '2014-01-01' AND claims.patent_id  =app.patent_id) as db
  LEFT JOIN `patents-research-275923.patents.backcited` as backcited ON CAST(backcited.patent_id AS STRING) = db.patent_id
"""
    
    
    
# --- Google Cloud locations --------------------------------------------------
# PROJECT_ID is handed to the BigQuery call and to bucket creation.
PROJECT_ID = "patents-research-275923"
CLOUD_BUCKET_NAME = "patents-research"
CLOUD_FILEPATH = "patent_research/data.tsv"

# --- Frame layout ------------------------------------------------------------
# Column order applied to the query result before tokenizing.
COLUMNS = [
    'patent_id',
    'fwrdCitations_5',
    'publication_date',
    'bkwdCitations',
    'title',
    'abstract',
    'claims',
]

# START_TOKENS / END_TOKENS are positionally aligned with VAR_COLUMNS: entry i
# wraps the values of VAR_COLUMNS[i].
START_TOKENS = [
    '<start -id>',
    '<start -date>',
    '<start -backcited>',
    '<start -title>',
    '<start -abstract>',
    '<start -claims>',
]
END_TOKENS = [
    '<end -id>',
    '<end -date>',
    '<end -backcited>',
    '<end -title>',
    '<end -abstract>',
    '<end -claims>',
]

# Columns that are tokenized and then concatenated into the "text" blob.
VAR_COLUMNS = [
    'patent_id',
    'publication_date',
    'bkwdCitations',
    'title',
    'abstract',
    'claims',
]



## Base Functions to push to google storage and concatenate data

In [None]:
#ctrl-
def addFieldTokens(df, start_tokens, end_tokens, col_names,
                   skip_cols=('claims', 'bkwdCitations')):
    '''
    Wrap the values of the selected columns in their start/end field tokens.

    For every column in ``col_names`` that is not listed in ``skip_cols``,
    each value is converted with ``str`` and replaced by
    ``start_tokens[i] + value + end_tokens[i]``, where ``i`` is the position
    of the column inside ``col_names``.

    Args:
        df: DataFrame to tokenize; it is modified in place.
        start_tokens: prefixes, positionally aligned with ``col_names``.
        end_tokens: suffixes, positionally aligned with ``col_names``.
        col_names: candidate columns, in token order.
        skip_cols: columns that are left untouched. The default keeps the
            two columns this function has always skipped.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: if a token list has no entry for a processed column.
        KeyError: if a processed column is missing from ``df``.
    '''
    for i, col in enumerate(col_names):
        if col in skip_cols:
            continue
        prefix, suffix = start_tokens[i], end_tokens[i]
        # str() is applied per value so non-string columns (ids, dates) can be
        # concatenated with the tokens.
        df[col] = df[col].apply(lambda value: prefix + str(value) + suffix)
    return df

def concatColumns(df, columns=None):
    '''
    Collapse the field-tokenized columns into a single ``text`` column.

    A new ``text`` column is created (overwriting any existing one) and the
    values of each column in ``columns`` are appended to it in order, with no
    separator. Each source column is dropped from ``df`` as soon as it has
    been appended, and a progress line is printed per column.

    Args:
        df: DataFrame to condense; it is modified in place.
        columns: names of the columns to concatenate, in output order.
            Defaults to the module-level ``VAR_COLUMNS``. Values are joined
            with ``+``, so they are expected to already be strings.

    Returns:
        None. The result is the mutated ``df``.

    Raises:
        KeyError: if one of ``columns`` is not present in ``df``.
    '''
    if columns is None:
        columns = VAR_COLUMNS
    columns = list(columns)
    total = len(columns)

    df["text"] = ""
    for done, col in enumerate(columns, start=1):
        df["text"] += df[col]
        df.drop([col], axis=1, inplace=True)
        print("{}/{} done".format(done, total))

def upload_df_toGoogle(data_frame, filepath="patent_research/data2.tsv"):
    """Write ``data_frame`` as a TSV file and upload it to the project bucket.

    The frame is serialized (tab-separated, without the index) to a temporary
    file, which is then uploaded to ``CLOUD_BUCKET_NAME`` under ``filepath``.
    The bucket is created in ``PROJECT_ID`` first if it does not exist yet.

    Args:
        data_frame: the pandas DataFrame to upload.
        filepath: destination object name inside the bucket.
    """
    client = storage.Client()

    # lookup_bucket returns None for a missing bucket instead of raising.
    bucket = client.lookup_bucket(CLOUD_BUCKET_NAME)
    if bucket is None:
        bucket = client.create_bucket(CLOUD_BUCKET_NAME, project=PROJECT_ID)

    with tempfile.NamedTemporaryFile(suffix=".tsv") as temp:
        # Serialize the argument, not whatever global `df` happens to exist.
        data_frame.to_csv(temp.name, index=False, sep='\t')

        # upload_from_filename takes a path; upload_from_file needs an open
        # file object and would fail when given temp.name.
        blob = bucket.blob(filepath)
        blob.upload_from_filename(temp.name, content_type='text/tab-separated-values')

    print("File data_frame uploaded to {}".format(filepath))

## Concat the Columns into Text blob and upload tsv to cloud

In [None]:

# Get data from big query
# Runs QUERY2 under PROJECT_ID with the standard SQL dialect and loads the
# whole result set into the DataFrame `df`, showing a tqdm progress bar.
# NOTE(review): pandas.read_gbq is deprecated in recent pandas releases in
# favour of pandas_gbq.read_gbq -- confirm against the pinned pandas version.
print('Executing big query')
df = pd.read_gbq(QUERY2, project_id=PROJECT_ID, dialect='standard', progress_bar_type='tqdm')
print('Finised big query')

In [None]:

# Unravel claims: every entry is indexed with the key 'claims', i.e. the value
# comes back wrapped in a mapping; keep only the inner value.
df["claims"] = df["claims"].apply(lambda record: record["claims"])

In [None]:
# Rearrange data schema
# Selects exactly the columns listed in COLUMNS, in that order; any other
# column returned by the query is discarded and a missing one raises KeyError.
print('Rearrange cols')
df = df[COLUMNS]
print('Finished Rearrange cols')

In [None]:
# Wrap the VAR_COLUMNS values in their start/end tokens. addFieldTokens leaves
# 'claims' and 'bkwdCitations' untouched, so those two keep whatever form the
# query/unravel steps produced.
print('Adding token delimiters')

df = addFieldTokens(df,START_TOKENS,END_TOKENS,VAR_COLUMNS)
print('Finished adding delimiters')


In [None]:
# Concatenate the tokenized fields into one space-separated "text" column.
# patent_id and publication_date are appended to the blob but also kept as
# standalone columns; every other source column is dropped once appended.
print('Conc colu,ns')
df["text"] = ""
for done, col in enumerate(VAR_COLUMNS, start=1):
    df["text"] += " " + df[col]
    if col not in ("patent_id", "publication_date"):
        df.drop([col], axis=1, inplace=True)
    print("{}/{} done".format(done, len(VAR_COLUMNS)))
print('Finished concatenating columns')

In [None]:
# Concat the columns into one text blob.
# The inline concatenation cell above already creates "text" and drops most of
# VAR_COLUMNS. Calling concatColumns afterwards would append patent_id and
# publication_date a second time and then raise KeyError on the first column
# that is already gone, so it only runs when no blob has been built yet.
print('Creating text blobs')
if "text" in df.columns:
    print('Text column already present; skipping concatColumns')
else:
    concatColumns(df)
print('Finished text blobs')

In [None]:
# Upload the final frame to Cloud Storage as a TSV.
# NOTE(review): writing to a gs:// URL relies on pandas' fsspec integration,
# which presumably needs gcsfs installed in this environment -- confirm.
print('Uploading to cloud storages')
#upload_df_toGoogle(df)

df.to_csv('gs://patents-research/patent_research/data_frwdcorrect.tsv', sep='\t',index=False)

#pd.read_csv('gs://patents-research/patent_research/data_frwdcorrect.tsv', sep='\t')


# Print the summary explicitly: a bare expression in the middle of a cell is
# evaluated but its result is never displayed.
print(df.fwrdCitations_5.describe(percentiles=[.25, .5, .75, .95, .99]))


print('Finished to cloud storages')