In [None]:
import re

import google.auth
import numpy as np
import pandas as pd
from google.auth import compute_engine
from google.cloud import bigquery
from google.cloud import storage
from google.colab import auth

# Authenticate the Colab session; the user is asked to log in the first time
# this runs.
auth.authenticate_user()

# BigQuery client bound to the project whose tables/views are analysed below.
project_id = 'data-engineering-prod'
client = bigquery.Client(project=project_id)

In [None]:
# List every dataset in the project and fetch the full metadata of each
# table/view it contains. The resulting `views` list is what the later cells
# read (view_query, num_bytes, expires, ...).
print("Collecting data on all views in " + project_id + ". This may take some time...")
views = []
dataset_count = 0
for dataset in client.list_datasets():
  print("\nCollecting views for: " + dataset.dataset_id)
  dataset_views = list(client.list_tables(dataset=dataset.reference))
  dataset_count += 1
  for view in dataset_views:
    try:
      # dataset.reference is the same DatasetReference already passed to
      # list_tables() above, so there is no need to rebuild it per table.
      view_ref = dataset.reference.table(view.table_id)
      views.append(client.get_table(view_ref))
    except Exception as err:
      # Best effort: report the failure and carry on with the remaining
      # tables so one failing table does not abort the whole scan.
      # ("except Exception as err" is valid on Python 2.6+ and Python 3; the
      # previous comma form is a syntax error on Python 3.)
      print(err)

  print(dataset.dataset_id + " contains {:d} views".format(len(dataset_views)))

print("\n" + project_id + " contains {:d} dataset".format(dataset_count))
print("\nDone!")

In [None]:
# Sanity check: print the id, defining query and type of the first collected
# table/view.
example = views[0]
print("Example table/view and query:")
for template, value in (
    ("\nView: {}", example.full_table_id),
    ("\nView Query:\n{}", example.view_query),
    ("\nType: {}", example.table_type),
):
  print(template.format(value))

Example table/view and query:

View: data-engineering-prod:landing_andromeda.energy_contracts_consumption_charge_generated_v2

View Query:
#standardSQL
SELECT
  `kafkaData`,
  `unionRecord`,
  `_PARTITIONTIME` `PARTITIONTIME`
FROM
  `data-engineering-prod.auto_capture_v2.energy_contracts_consumption_charge_generated_v2`

Type: VIEW


In [None]:
### COST ANALYSIS ###
# This section is for the cost analysis of existing tables and views in the project
#
# https://cloud.google.com/bigquery/pricing
#
# - cost of storage (bytes)
# - region/multi
# - partitioned
# - expiration

# https://cloud.google.com/bigquery/docs/locations
# indicates "EU" & "US" are multi-regional locations
multi_regional = ["EU", "US"]

# SQL keywords are case-insensitive and may be separated by any whitespace
# (including newlines), so match them with word-bounded, case-insensitive
# patterns instead of exact upper-case substrings.
ORDER_BY_PATTERN = re.compile(r'\bORDER\s+BY\b', re.IGNORECASE)
JOIN_PATTERN = re.compile(r'\bJOIN\b', re.IGNORECASE)


def _format_timestamp(value):
  """Format a datetime-like value as 'YYYY-MM-DD HH:MM:SS'.

  Returns None for None. Values without a strftime() method are returned
  unchanged: google-cloud-bigquery documents Table.partition_expiration as an
  integer number of milliseconds, and calling strftime() on it would raise
  AttributeError.
  """
  if value is None:
    return None
  if hasattr(value, 'strftime'):
    return value.strftime('%Y-%m-%d %H:%M:%S')
  return value


def _query_uses(pattern, query):
  """Return True if `query` is not None and contains a match for `pattern`."""
  return query is not None and pattern.search(query) is not None


# collect relevant data: one positional row per table/view, in the column
# order expected by the DataFrame built in the next cell.
cost_data = []
for view in views:
  cost_data.append([
      view.full_table_id,
      view.location,
      view.location in multi_regional,
      view.num_bytes,
      _format_timestamp(view.expires),
      view.partitioning_type,
      _format_timestamp(view.partition_expiration),
      _query_uses(ORDER_BY_PATTERN, view.view_query),
      _query_uses(JOIN_PATTERN, view.view_query),
  ])
    

In [None]:
# write data to newline delimited json file
json_file = "data_engineering_prod_bq_cost_analysis.json"
# NOTE(review): "parition_expiration" is misspelled, but it becomes a column
# name of the destination table, so it is left unchanged here to avoid
# breaking existing consumers of that table.
columns = ["full_table_id","location","multi","bytes","expiration","partition","parition_expiration","order_by","join"]
df = pd.DataFrame(cost_data, columns=columns)
# bq requires newline delimited json, so each row is written as one JSON
# object followed by a line break. The context manager closes the file even
# if serialising a row raises, and avoids shadowing the name `file`.
with open(json_file, "w") as ndjson_file:
  for _, record in df.iterrows():
    record.to_json(ndjson_file)
    ndjson_file.write("\n")
df

In [None]:
# write the results to a gc bucket
# A dedicated name is used for the Storage client so that `client` keeps
# referring to the BigQuery client; re-running an earlier cell after this one
# therefore still works.
storage_client = storage.Client(project=project_id)
bucket = storage_client.get_bucket("data-engineering-prod-bq-analysis")
blob = bucket.blob(json_file)

# Upload the file that was just written. json_file is reused instead of
# repeating the literal so the local file and the blob name cannot drift apart.
with open(json_file, 'rb') as ndjson_file:
  blob.upload_from_file(ndjson_file)

In [None]:
# write data to data-integration-prod
# to update the results the existing table is replaced by the newly loaded data
#
# NOTE(review): this cell rebinds `project_id` and `client` to the destination
# project. Re-running an earlier cell afterwards would target
# data-integration-prod instead of data-engineering-prod; restart the kernel
# and run all cells in order to refresh the analysis.
project_id = 'data-integration-prod'
dataset_id = 'data_engineering_prod_bq_analysis'
table_id = 'data_engineering_prod_bq_cost_analysis'
full_table_id = project_id + '.' + dataset_id + '.' + table_id
client = bigquery.Client(project=project_id)

# write new table
# WRITE_TRUNCATE makes the load job itself overwrite any existing table, so
# there is no separate delete step and a failed load does not leave the
# destination table missing.
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
uri = "gs://data-engineering-prod-bq-analysis/data_engineering_prod_bq_cost_analysis.json"
load_job = client.load_table_from_uri(
    uri, full_table_id, job_config=job_config)  # Make an API request.
load_job.result()  # Block until the load job has finished.

# check the number of rows loaded into the table is correct
destination_table = client.get_table(full_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
assert len(cost_data) == destination_table.num_rows, (
    "Row count mismatch: collected {} rows but {} contains {}".format(
        len(cost_data), full_table_id, destination_table.num_rows))