In [12]:
import pandas as pd
import numpy as np
import google.auth
from google.auth import compute_engine
from google.cloud import bigquery
from google.colab import auth

# Authenticate the Colab session first (this prompts for a login the first
# time), then build the BigQuery client bound to the project under analysis.
auth.authenticate_user()

project_id = 'data-engineering-prod'
client = bigquery.Client(project=project_id)

In [13]:
# List every dataset in the project (except those prefixed 'u_') and fetch the
# full metadata of each item that list_tables returns for it.
#
# Results:
#   views         -- list of the objects returned by client.get_table
#   dataset_count -- number of datasets that were scanned (skipped ones excluded)
print("Collecting data on all views in " + project_id + ". This may take some time...")
views = []
dataset_count = 0
for dataset in client.list_datasets():
  # 'u_' datasets are skipped -- presumably per-user scratch space; TODO confirm.
  if dataset.dataset_id.startswith('u_'):
    continue

  dataset_views = list(client.list_tables(dataset=dataset.reference))
  dataset_count += 1

  # The dataset reference is the same for every item, so build it once here
  # instead of once per table.
  dataset_ref = bigquery.DatasetReference(project_id, dataset.dataset_id)
  for view in dataset_views:
    try:
      views.append(client.get_table(dataset_ref.table(view.table_id)))
    except Exception as err:
      # Best effort: report the failure and carry on with the remaining items.
      # 'except ... as ...' is valid on both Python 2.6+ and Python 3, unlike
      # the comma form.
      print(err)

  print(dataset.dataset_id + " contains {:d} views".format(len(dataset_views)))

print("\n" + project_id + " contains {:d} dataset".format(dataset_count))
print("\nDone!")

Collecting data on all views in data-engineering-prod. This may take some time...

Collecting views for: landing_andromeda
landing_andromeda contains 109 views

Collecting views for: landing_andromeda_secure
landing_andromeda_secure contains 660 views

Collecting views for: landing_identity_secure
landing_identity_secure contains 5 views

Collecting views for: landing_orex_secure
landing_orex_secure contains 16 views

Collecting views for: landing_orion_migration_secure
landing_orion_migration_secure contains 51 views

Collecting views for: landing_orion_secure
landing_orion_secure contains 3 views

Collecting views for: landing_pace_secure
landing_pace_secure contains 1 views

Collecting views for: landing_payments_secure
landing_payments_secure contains 23 views

Collecting views for: product_andromeda_secure
product_andromeda_secure contains 200 views

Collecting views for: product_identity_secure
product_identity_secure contains 1 views

Collecting views for: product_orion
product_

In [None]:
import datetime
import pytz
import google.cloud.bigquery.job as job
from collections import Counter
from  __builtin__ import any as b_any
from google.cloud import storage

### JOBS ANALYSIS ###
# Performs analysis on which objects are being used,
# how often and by whom.
#
# For every entry in `views` this queries the exported audit log for jobs from
# the last 30 days whose SQL text mentions the view, then records:
#   jobs_data  -- one summary row per view (usage count, top user, last query date)
#   error_data -- one row per job that reported an error or contains ORDER BY
# Both lists are consumed by the export cells further down the notebook.

# submit query as user in data-integration-prod
project_id = 'data-integration-prod'
client = bigquery.Client(project=project_id)
print("Starting jobs processing...")

# for each object
jobs_data = []
error_data = []
cost_data = []  # declared here but not populated by this cell

project_id_for_query = "data-engineering-prod"

for view in views:

  # we now want to use the logs dump to return jobs only for the specified view
  # NOTE(review): the pattern is a regular expression, so each '.' matches any
  # character and a table name that is a prefix of another (foo / foo_bar) also
  # matches the longer name. Counts may therefore be over-estimated.
  query = """
      select protopayload_auditlog.authenticationInfo.principalEmail as email,
      timestamp as time,
      protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobName.location as location,
      protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobConfiguration.query.query as query,
      protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobStatus.error.message
      from andromeda-data-nonprod.test_log_export_eu.cloudaudit_googleapis_com_data_access_data_engineering_prod
      where REGEXP_CONTAINS(protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobConfiguration.query.query, "{0}.{1}.{2}") 
      and protopayload_auditlog.servicedata_v1_bigquery.jobInsertRequest.resource.jobConfiguration.dryRun is null
      and protopayload_auditlog.servicedata_v1_bigquery.jobInsertResponse.resource.jobStatus.state = "DONE"
      and timestamp BETWEEN TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL -30 DAY) AND CURRENT_TIMESTAMP()
    """.format(project_id_for_query, view.dataset_id, view.table_id)
  query_job = client.query(query)

  # Materialise the result once; everything below works on this list instead
  # of iterating the job again for every statistic.
  # Column order: 0=email, 1=time, 2=location, 3=query text, 4=error message.
  rows = list(query_job)

  # Real lists (not map objects) so that truthiness/len work on Python 3 too.
  users = [row[0] for row in rows]
  user_counts = Counter(users)
  most_common_user = user_counts.most_common()[0][0] if users else "No user data"
  # A missing email cannot be substring-tested, so leave it out of the checks.
  distinct_users = [user for user in user_counts if user is not None]

  # Most recent job timestamp, or None when the view was never queried.
  query_times = [row[1] for row in rows if row[1] is not None]
  last_query = max(query_times) if query_times else None

  d = [view.full_table_id,
      view.dataset_id,
      view.table_id,
      view.dataset_id.split('_')[0],
      len(rows),
      most_common_user,
      b_any("serviceaccount.com" in user for user in distinct_users),
      b_any("ovoenergy.com" in user for user in distinct_users),
      last_query.strftime("%Y%m%d") if last_query is not None else None]

  jobs_data.append(d)

  # if ["EU","US"] => multi-region (this will be every query basically so not high importance)
  # if ORDER BY => requires single node
  # any errors?
  # log the query
  # The loop variable is called `row`, not `job`, so it does not shadow the
  # `google.cloud.bigquery.job` module imported under that alias.
  for row in rows:
    location, sql_text, error_message = row[2], row[3], row[4]
    has_order_by = sql_text is not None and 'ORDER BY' in sql_text
    if error_message is not None or has_order_by:
      error = [view.full_table_id,
               location,
               error_message,
               sql_text,
               has_order_by]
      print(error)
      # Previously the row was only printed, which left error_data empty for
      # the error export cells.
      error_data.append(error)

print("Jobs processing finished.")

Starting jobs processing...


In [None]:
# Export the per-view usage summary (jobs_data) in three steps:
#   1. write it to a local newline-delimited JSON file,
#   2. upload that file to the analysis bucket,
#   3. drop and reload the BigQuery table from the uploaded file.

# write data to newline delimited json file
json_file = "data_engineering_prod_bq_jobs_analysis_30_days.json"
bucket_name = "data-engineering-prod-bq-analysis"
columns = ["full_table_id","dataset_id","table_id","layer","queries","most_common_user","any_service_account","any_retail_account","last_query"]
df = pd.DataFrame(jobs_data, columns=columns)
# bq requires newline delimited json so append line break
# The context manager closes the file even if serialising a row fails.
with open(json_file, "w") as json_out:
  for _, record in df.iterrows():
    record.to_json(json_out)
    json_out.write("\n")

# write the results to a gc bucket
print("Writing results to GC Bucket..")
client = storage.Client(project=project_id)
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(json_file)

# Upload the file that was just written; reusing json_file keeps the blob name
# and the local path from drifting apart.
with open(json_file, 'rb') as json_in:
  blob.upload_from_file(json_in)

# write data to data-integration-prod
# to update the results we will wipe the existing table and replace it
project_id = 'data-integration-prod'
dataset_id = 'data_engineering_prod_bq_analysis'
table_id = 'data_engineering_prod_bq_jobs_analysis_30_days'
full_table_id = project_id + '.' + dataset_id + '.' + table_id
print("Updating " + full_table_id)

client = bigquery.Client(project=project_id)

# remove existing table
client.delete_table(full_table_id, not_found_ok=True)

# write new table
job_config = bigquery.LoadJobConfig(autodetect=True,
            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
# Same value as before, now derived from the names used for the upload.
uri = "gs://{}/{}".format(bucket_name, json_file)
load_job = client.load_table_from_uri(
    uri, full_table_id, job_config=job_config)  # Make an API request.
load_job.result()  # block until the load job has finished

# check the number of rows loaded into the table is correct
destination_table = client.get_table(full_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
assert len(jobs_data) == destination_table.num_rows, \
    "row count mismatch between jobs_data and " + full_table_id

In [None]:
### ERROR DATA ###
# write data to newline delimited json file for errors
# Each entry of error_data becomes one JSON object on its own line.
json_file = "data_engineering_prod_bq_jobs_cost_analysis.json"
columns = ["table_id","location","error_message","query","order_by"]
df = pd.DataFrame(error_data, columns=columns)
# bq requires newline delimited json so append line break
# The context manager closes the file even if serialising a row fails, and the
# handle no longer reuses the builtin name `file`.
with open(json_file, "w") as json_out:
  for _, record in df.iterrows():
    record.to_json(json_out)
    json_out.write("\n")

In [None]:
from google.cloud import storage

# write the results to a gc bucket
# The blob is named after json_file and the same path is opened for upload, so
# the object name and the uploaded contents always refer to the same file.
client = storage.Client(project=project_id)
bucket = client.get_bucket("data-engineering-prod-bq-analysis")
blob = bucket.blob(json_file)

with open(json_file, 'rb') as json_in:
  blob.upload_from_file(json_in)

In [None]:
# Refresh the error table in data-integration-prod: the existing table is
# dropped and then re-created from the JSON file held in the analysis bucket.
project_id = 'data-integration-prod'
dataset_id = 'data_engineering_prod_bq_analysis'
table_id = 'data_engineering_prod_bq_jobs_cost_analysis'
full_table_id = '.'.join([project_id, dataset_id, table_id])
client = bigquery.Client(project=project_id)

# Drop the previous version; a missing table is not treated as an error.
client.delete_table(full_table_id, not_found_ok=True)

# Load the newline-delimited JSON with schema auto-detection.
uri = "gs://data-engineering-prod-bq-analysis/data_engineering_prod_bq_jobs_cost_analysis.json"
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)
load_job = client.load_table_from_uri(uri, full_table_id, job_config=job_config)
load_job.result()

# Sanity check: the table must hold exactly as many rows as error_data.
destination_table = client.get_table(full_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
assert len(error_data) == destination_table.num_rows