In [None]:
import pandas as pd
import numpy as np
import google.auth
from google.auth import compute_engine
from google.cloud import bigquery
from google.colab import auth

# Authenticate the Colab session (prompts for a login the first time it runs),
# then build the BigQuery client used by the cells below.

auth.authenticate_user()
project_id = 'data-engineering-prod'
client = bigquery.Client(project=project_id)

In [None]:
# Walk every dataset in the project (skipping those whose id starts with 'u_')
# and gather everything list_tables() returns for it into `views`.
print("Collecting data on all views in " + project_id + ". This may take some time...")
views = []
dataset_count = 0
for dataset in client.list_datasets():
  if not dataset.dataset_id.startswith('u_'):
    dataset_views = list(client.list_tables(dataset=dataset.reference))
    views.extend(dataset_views)
    dataset_count += 1

    # Per-dataset progress line.
    print(dataset.dataset_id + " contains {:d} views".format(len(dataset_views)))

print("\n" + project_id + " contains {:d} dataset".format(dataset_count))
print("\nDone!")

In [None]:
from datetime import datetime, timedelta
import pytz
import google.cloud.bigquery.job as job
from collections import Counter
from  __builtin__ import any as b_any
from google.cloud import storage
from timeit import default_timer as timer

### JOBS ANALYSIS ###
# Performs analysis on which objects are being used,
# how often and by whom.
#
# For every object in `views`, the audit-log export is searched for completed,
# non-dry-run query jobs whose SQL text matches the object's full_table_id.
# One summary row per object is appended to `jobs_data`:
#   [0] full_table_id   [1] dataset_id   [2] table_id
#   [3] first '_'-separated token of dataset_id
#   [4] number of matching jobs          [5] most frequent principal email
#   [6] True if any principal contains "serviceaccount.com"
#   [7] True if any principal contains "ovoenergy.com"
#   [8] date of the latest matching job as YYYYMMDD, or None
# Jobs that reported an error message or contain 'ORDER BY' are printed and
# collected in `error_data`.

# submit query as user in data-integration-prod
project_id = 'data-integration-prod'
client = bigquery.Client(project=project_id)
print("Starting jobs processing...")
start = timer()

# for each object
jobs_data = []
error_data = []
cost_data = []  # not populated in this cell

project_id_for_query = "data-engineering-prod"

# Sample the clock once so both suffix bounds come from the same instant.
# NOTE(review): this is local time; confirm the log shards are not UTC-dated.
now = datetime.now()
today = now.strftime('%Y%m%d')
startDate = (now - timedelta(days=1)).strftime('%Y%m%d')

# {0} = object id used as the regex, {1}/{2} = first/last shard suffix,
# {3} = project holding the audit-log export.
# NOTE(review): the error message is read from jobInsertRequest while the
# state filter reads jobInsertResponse - confirm that is intended.
query_template = """
      select protopayload_auditlog.authenticationInfo.principalEmail as email,
      timestamp as time,
      protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobName.location as location,
      protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobConfiguration.query.query as query,
      protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobStatus.error.message
      from `{3}.logs_bigquery.cloudaudit_googleapis_com_data_access_*`
      where REGEXP_CONTAINS(protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobConfiguration.query.query, "{0}") 
      and protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobConfiguration.dryRun is null
      and protopayload_auditlog.servicedata_v1_bigquery.jobInsertResponse.resource.jobStatus.state = "DONE"
      AND _TABLE_SUFFIX BETWEEN '{1}' and '{2}'
    """

for view in views:

  # we now want to use the logs dump to return jobs only for the specified view
  # BETWEEN needs (lower, upper): startDate first, then today. The previous
  # order (today, startDate) produced an empty range, so no shard ever matched.
  # NOTE(review): full_table_id is used verbatim as a regular expression, so
  # '.' matches any character and a name that is a prefix of another also
  # matches the longer one. Confirm its format agrees with how tables are
  # written in the logged SQL.
  query = query_template.format(view.full_table_id, startDate, today, project_id_for_query)
  query_job = client.query(query)

  # Fetch the result set once; every iteration over the job reads it again.
  rows = list(query_job)

  users = [row[0] for row in rows]
  user_counts = Counter(users)
  most_common_user = user_counts.most_common(1)[0][0] if user_counts else "No user data"

  # Latest non-null job timestamp, if there is one.
  query_times = [row[1] for row in rows if row[1] is not None]
  last_query = max(query_times) if query_times else None

  d = [view.full_table_id,
      view.dataset_id,
      view.table_id,
      view.dataset_id.split('_')[0],
      len(rows),
      most_common_user,
      # `user and ...` skips rows whose principal email is null.
      any(user and "serviceaccount.com" in user for user in user_counts),
      any(user and "ovoenergy.com" in user for user in user_counts),
      last_query.strftime("%Y%m%d") if last_query is not None else None]

  jobs_data.append(d)

  # if ["EU","US"] => multi-region (this will be every query basically so not high importance)
  # if ORDER BY => requires single node
  # any errors?
  # log the query
  for row in rows:
    has_order_by = 'ORDER BY' in row[3]
    if row[4] is not None or has_order_by:
      error = [view.full_table_id,
               row[2],
               row[4],
               row[3],
               has_order_by]
      error_data.append(error)
      print(error)

end = timer()
print("Jobs processing finished in {} seconds".format(end-start))

Starting jobs processing...


In [None]:
# Import the Google Cloud client library and JSON library
from google.cloud import storage
import json
from google.cloud import bigquery

# Project that is passed to both the storage and the BigQuery client below.
project_id = 'data-integration-prod'

# Handles for the analysis bucket and the JSON object inside it. Only the
# bucket lookup happens here; the blob is not downloaded in this cell.
storage_client = storage.Client(project=project_id)
bucket = storage_client.get_bucket("data-engineering-prod-bq-analysis")
blob = bucket.blob("data_engineering_prod_bq_jobs_analysis_30_days.json")

# File name only; it is neither read nor written in this cell.
json_file = "data_engineering_prod_bq_analysis_30_days_update_script.json"

# Read the whole results table and flatten it into a list of row lists.
client = bigquery.Client(project=project_id)
table = bigquery.TableReference.from_string("data-integration-prod.data_engineering_prod_bq_analysis.data_engineering_prod_bq_jobs_analysis_30_days")
row_iterator = client.list_rows(table)
dataframe = row_iterator.to_dataframe()
data = dataframe.values.tolist()

# Number of rows that were downloaded.
print(len(data))

# Merge this run's per-object statistics (jobs_data) into the downloaded rows.
# NOTE: nothing here uploads `data`; the reload steps further down in this
# cell are commented out.
def _merge_job_stats(rows, job_rows, run_date):
  """Fold `job_rows` into `rows` in place and return `rows`.

  Positions read from each job row:
    [0] full table id, [1] dataset id, [2] table id, [3] layer,
    [4] number of queries, [5] most common user,
    [6] service-account flag, [7] retail-account flag.

  Positions used in each table row:
    [0] last query, [1] service-account flag, [2] most common user,
    [3] number of queries, [4] layer, [5] retail-account flag,
    [6] full table id, [7] table id, [8] dataset id.
  NOTE(review): assumes the downloaded table's column order matches these
  positions - TODO confirm against the table schema.

  A job row whose full table id equals column 6 of one or more table rows
  updates those rows; otherwise exactly one new row is appended.
  """
  # Index the existing rows by full table id so each job row is looked up
  # once. A list per key keeps every row if the table holds duplicates.
  rows_by_table = {}
  for entry in rows:
    rows_by_table.setdefault(entry[6], []).append(entry)

  for job in job_rows:
    matches = rows_by_table.get(job[0])
    if matches:
      for entry in matches:
        # NOTE(review): as before, the date is bumped even when this run saw
        # zero queries (job[4] == 0) - confirm that is wanted.
        entry[0] = run_date   # update last query
        entry[3] += job[4]    # increase query count
        # Flags only ever switch from False to True. The old test
        # `job[6] == True & entry[1] == False` parsed as a chained comparison
        # and fired when both values were False.
        if job[6] and not entry[1]:   # service account
          entry[1] = True
        if job[7] and not entry[5]:   # retail account
          entry[5] = True
    else:
      new_row = [run_date,
                 job[6], # service account
                 job[5], # most common user
                 job[4], # number of queries
                 job[3], # layer
                 job[7], # retail account
                 job[0], # full table id
                 job[2], # table id
                 job[1]] # dataset id
      rows.append(new_row)
      # Register the new row so a repeated id updates it instead of appending.
      rows_by_table[job[0]] = [new_row]

  return rows


_merge_job_stats(data, jobs_data, today)


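# Write the data back to data-integration-prod.
# To update the results, the existing table is deleted and reloaded from the
# JSON file in the bucket. NOTE: every step below is currently commented out,
# so running this cell does not modify the table.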
#project_id = 'data-integration-prod'
#dataset_id = 'data_engineering_prod_bq_analysis'
#table_id = 'data_engineering_prod_bq_jobs_analysis_30_days'
#full_table_id = project_id + '.' + dataset_id + '.' + table_id
#print("Updating " + full_table_id)

#client = bigquery.Client(project=project_id)

# remove existing table
#client.delete_table(full_table_id, not_found_ok=True)

# write new table
#job_config = bigquery.LoadJobConfig(autodetect=True, 
#            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
#uri = "gs://data-engineering-prod-bq-analysis/data_engineering_prod_bq_jobs_analysis_30_days.json"
#load_job = client.load_table_from_uri(
#    uri, full_table_id, job_config=job_config)  # Make an API request.
#load_job.result() 

# check the number of rows loaded into the table is correct
#destination_table = client.get_table(full_table_id)
#print("Loaded {} rows.".format(destination_table.num_rows))
#assert(len(jobs_data) == destination_table.num_rows)