In [None]:
import pandas as pd
import numpy as np
import google.auth
from google.auth import compute_engine
from google.cloud import bigquery
from google.colab import auth

# Authenticate the Colab user (prompts for a login on first use) and then
# build the BigQuery client for the project whose tables/views are analysed.

auth.authenticate_user()
project_id = 'data-engineering-prod'
client = bigquery.Client(project=project_id)

In [None]:
# List every dataset in the project and fetch a Table object for each
# table/view it contains. Results accumulate in `views`, which the later
# cells read (view_query, location, num_bytes, ...).
print("Collecting data on all views in " + project_id + ". This may take some time...")
views = []
datasetCount = 0
for dataset in client.list_datasets():
  print("\nCollecting views for: " + dataset.dataset_id)
  datasetViews = list(client.list_tables(dataset=dataset.reference))
  datasetCount += 1

  # The dataset reference is identical for every table in this dataset,
  # so build it once instead of once per table.
  dataset_ref = bigquery.DatasetReference(project_id, dataset.dataset_id)

  for view in datasetViews:
    try:
      view_ref = dataset_ref.table(view.table_id)
      views.append(client.get_table(view_ref))
    except Exception as err:
      # Best effort: one unreadable table should not abort the whole scan,
      # but report which table failed so the gap in `views` is traceable.
      print("Failed to fetch {}.{}: {}".format(dataset.dataset_id, view.table_id, err))

  print(dataset.dataset_id + " contains {:d} views".format(len(datasetViews)))

print("\n" + project_id + " contains {:d} dataset".format(datasetCount))
print("\nDone!")

In [None]:
# Show the first collected table/view as a sample of the metadata gathered.
print("Example table/view and query:")
example_view = views[0]
for template, value in [("\nView: {}", example_view.full_table_id),
                        ("\nView Query:\n{}", example_view.view_query),
                        ("\nType: {}", example_view.table_type)]:
  print(template.format(value))

Example table/view and query:

View: data-engineering-prod:landing_andromeda.energy_contracts_consumption_charge_generated_v2

View Query:
#standardSQL
SELECT
  `kafkaData`,
  `unionRecord`,
  `_PARTITIONTIME` `PARTITIONTIME`
FROM
  `data-engineering-prod.auto_capture_v2.energy_contracts_consumption_charge_generated_v2`

Type: VIEW


In [108]:
# collect relevant data: one row (list) per table/view, in the column order
# expected by the DataFrame built in the next cell
data = []

# https://cloud.google.com/bigquery/docs/locations
# indicates "EU" & "US" are multi-regional locations
multi_regional = ["EU", "US"]

timestamp_format = '%Y-%m-%d %H:%M:%S'


def format_timestamp(value):
  """Render a datetime-like value as 'YYYY-MM-DD HH:MM:SS'.

  Returns None for None. Values without a `strftime` method are returned
  unchanged: the BigQuery client documents `Table.partition_expiration` as an
  integer number of milliseconds, so calling `.strftime` on it would raise.
  """
  if value is None:
    return None
  if hasattr(value, 'strftime'):
    return value.strftime(timestamp_format)
  return value


for view in views:
  # membership test already yields a bool
  multi = view.location in multi_regional

  d = [view.full_table_id,
       view.table_type,
       view.view_query,
       view.location,
       multi,
       view.num_bytes,
       format_timestamp(view.created),
       format_timestamp(view.modified),
       format_timestamp(view.expires),
       view.partitioning_type,
       format_timestamp(view.partition_expiration)]
  data.append(d)

In [109]:
# write data to newline delimited json file
json_file = "data_engineering_prod_bq_analysis.json"
columns = ["full_table_id","type","query","location","multi_regional","bytes","created", "last_modified","expiration","partitioning_type","partition_expiration"]
df = pd.DataFrame(data, columns=columns)
# bq requires newline delimited json: orient="records" with lines=True writes
# one JSON object per row, one row per line. pandas opens and closes the file
# itself, so no handle is left open if serialisation fails.
df.to_json(json_file, orient="records", lines=True)
df

Unnamed: 0,full_table_id,type,query,location,multi_regional,bytes,created,last_modified,expiration,partitioning_type,partition_expiration
0,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `kafkaData`,\n `union...",EU,True,0,2018-08-16 16:48:02,2018-10-01 23:45:33,,,
1,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `kafkaData`,\n `union...",EU,True,0,2018-08-16 16:48:08,2018-10-01 23:45:36,,,
2,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,True,0,2019-09-11 12:26:49,2019-09-11 12:31:46,,,
3,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,True,0,2019-11-25 10:53:00,2019-11-25 10:53:00,,,
4,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,True,0,2019-12-05 15:58:11,2019-12-05 15:58:11,,,
5,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `metadata`,\n `mpxn`,...",EU,True,0,2018-08-16 16:48:18,2018-10-01 23:45:42,,,
6,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `metadata`,\n `mpxn`,...",EU,True,0,2018-08-16 16:48:23,2018-10-01 23:45:45,,,
7,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,True,0,2018-08-16 16:48:28,2018-10-01 23:45:48,,,
8,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,True,0,2018-10-02 09:53:29,2018-10-02 09:53:29,,,
9,data-engineering-prod:landing_andromeda.energy...,VIEW,"#standardSQL\nSELECT\n `metadata`,\n `supply...",EU,True,0,2018-08-16 16:48:33,2018-10-01 23:45:51,,,


In [110]:
from google.cloud import storage

# write the results to a gc bucket
# NOTE: `client` is rebound to a Storage client here; the following cell
# rebinds it to a BigQuery client before using it again.
client = storage.Client(project=project_id)
bucket = client.get_bucket("data-engineering-prod-bq-analysis")
blob = bucket.blob(json_file)

# Upload the file named by `json_file` (the same name used for the blob)
# rather than repeating the literal, so the two cannot drift apart.
with open(json_file, 'rb') as json_handle:
  blob.upload_from_file(json_handle)

In [111]:
# Publish the table/view analysis to data-integration-prod.
# The destination table is dropped and then re-created from the file in GCS,
# so each run fully replaces the previous results.
project_id = 'data-integration-prod'
dataset_id = 'data_engineering_prod_bq_analysis'
table_id = 'data_engineering_prod_bq_analysis'
full_table_id = '.'.join([project_id, dataset_id, table_id])
client = bigquery.Client(project=project_id)

# drop the previous results (no error if the table does not exist yet)
client.delete_table(full_table_id, not_found_ok=True)

# load the newline delimited json from the bucket, letting BigQuery infer the schema
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)
uri = "gs://data-engineering-prod-bq-analysis/data_engineering_prod_bq_analysis.json"
load_job = client.load_table_from_uri(uri, full_table_id, job_config=job_config)
load_job.result()  # block until the load job has finished

# sanity check: every collected table/view must have produced exactly one row
destination_table = client.get_table(full_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
assert len(views) == destination_table.num_rows

Loaded 1646 rows.


In [None]:
import datetime
import pytz
import google.cloud.bigquery.job as job
from collections import Counter

### JOBS ANALYSIS ###
# Performs analysis on which objects are being used,
# how often and by whom.

project_id = 'data-integration-prod' # this will need to be updated to 'data-engineering-prod'
print("Starting jobs processing...")

# Bind the BigQuery client explicitly so this cell does not depend on whichever
# client (BigQuery or Storage) an earlier cell last assigned to `client`.
client = bigquery.Client(project=project_id)

# Window cut-offs, all measured from a single "now" so they are consistent.
now = datetime.datetime.now(pytz.UTC)
thirty_days = now - datetime.timedelta(30)
sixty_days = now - datetime.timedelta(60)
ninety_days = now - datetime.timedelta(90)

# NOTE(review): list_jobs is called without `all_users`; per the client docs
# that defaults to the caller's own jobs only, which would make the
# "most common user" columns trivial - confirm whether all_users=True is wanted.
jobs = list(client.list_jobs(project=project_id, min_creation_time=ninety_days))

# Only query jobs are analysed. This does not depend on the view, so it is
# computed once; real lists (not filter/map iterators) are used throughout
# because the results are measured with len() and traversed more than once.
queryJobs = [j for j in jobs if isinstance(j, job.QueryJob)]


def created_after(query_jobs, cutoff):
  """Return the jobs in `query_jobs` created strictly after `cutoff`."""
  return [q for q in query_jobs if q.created > cutoff]


def most_common_user(query_jobs):
  """Return the most frequent user_email among `query_jobs`, or "No user data" if empty."""
  emails = [q.user_email for q in query_jobs]
  if not emails:
    return "No user data"
  return Counter(emails).most_common(1)[0][0]


# for each object
  # how many jobs are for that object: count
  # which users are querying that object: users[]
  # when was the last query on that object: datetime
# => [type, table_id, layer, counts per window, top user per window, datetime]
jobsData = []
for view in views:
  # A job is attributed to a view when the view's table_id occurs anywhere in
  # the SQL text (plain substring match). `or ""` guards against a job whose
  # query text is missing.
  queryJobsForView = [q for q in queryJobs if view.table_id in (q.query or "")]

  queries_thirty_days = created_after(queryJobsForView, thirty_days)
  queries_sixty_days = created_after(queryJobsForView, sixty_days)
  queries_ninety_days = created_after(queryJobsForView, ninety_days)

  # most recent query job for this view, if any
  lastQuery = max(queryJobsForView, key=lambda q: q.created) if queryJobsForView else None

  d = ["QueryJob",
       view.full_table_id,
       view.dataset_id.split('_')[0],
       len(queries_thirty_days),
       len(queries_sixty_days),
       len(queries_ninety_days),
       most_common_user(queries_thirty_days),
       most_common_user(queries_sixty_days),
       most_common_user(queries_ninety_days),
       lastQuery.created.strftime('%Y-%m-%d %H:%M:%S') if lastQuery is not None else None]

  jobsData.append(d)

# TODO: an equivalent analysis for load jobs (job.LoadJob, matched on the job's
# destination table) was sketched here but never enabled - not sure it is required.

print("Jobs processing finished.")

In [None]:
# write data to newline delimited json file
json_file = "data_engineering_prod_bq_jobs_analysis.json"
columns = ["type","full_table_id","layer","queries_30_days","queries_60_days","queries_90_days","most_common_user_30","most_common_user_60","most_common_user_90","last_query_time"]
df = pd.DataFrame(jobsData, columns=columns)
# bq requires newline delimited json: orient="records" with lines=True writes
# one JSON object per row, one row per line. pandas opens and closes the file
# itself, so no handle is left open if serialisation fails.
df.to_json(json_file, orient="records", lines=True)
df

In [None]:
from google.cloud import storage

# write the results to a gc bucket
# NOTE: `client` is rebound to a Storage client here; the following cell
# rebinds it to a BigQuery client before using it again.
client = storage.Client(project=project_id)
bucket = client.get_bucket("data-engineering-prod-bq-analysis")
blob = bucket.blob(json_file)

# Upload the file named by `json_file` (the same name used for the blob)
# rather than repeating the literal, so the two cannot drift apart.
with open(json_file, 'rb') as json_handle:
  blob.upload_from_file(json_handle)

In [None]:
# Publish the jobs analysis to data-integration-prod.
# The destination table is dropped and then re-created from the file in GCS,
# so each run fully replaces the previous results.
project_id = 'data-integration-prod'
dataset_id = 'data_engineering_prod_bq_analysis'
table_id = 'data_engineering_prod_bq_jobs_analysis'
full_table_id = '.'.join([project_id, dataset_id, table_id])
client = bigquery.Client(project=project_id)

# drop the previous results (no error if the table does not exist yet)
client.delete_table(full_table_id, not_found_ok=True)

# load the newline delimited json from the bucket, letting BigQuery infer the schema
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)
uri = "gs://data-engineering-prod-bq-analysis/data_engineering_prod_bq_jobs_analysis.json"
load_job = client.load_table_from_uri(uri, full_table_id, job_config=job_config)
load_job.result()  # block until the load job has finished

# sanity check: every analysed table/view must have produced exactly one row
destination_table = client.get_table(full_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
assert len(jobsData) == destination_table.num_rows