In [53]:
import pandas as pd
import numpy as np
import google.auth
from google.auth import compute_engine
from google.cloud import bigquery
from google.colab import auth

# Authenticate this Colab session with Google Cloud (prompts for an
# interactive login the first time it runs), then build the BigQuery client
# that the following cells use to inspect the project.
auth.authenticate_user()

project_id = 'data-engineering-prod'
client = bigquery.Client(project=project_id)

In [54]:
# Walk every dataset in the project and report how many tables/views each one
# holds, followed by the total number of datasets seen.
datasets = []
for dataset in client.list_datasets():
    views = list(client.list_tables(dataset=dataset.reference))
    datasets.append(views)

    view_count_suffix = " contains {:d} views".format(len(views))
    print(dataset.dataset_id + view_count_suffix)

dataset_count_suffix = " contains {:d} datasets".format(len(datasets))
print("\n" + project_id + dataset_count_suffix)

landing_andromeda contains 107 views
landing_andromeda_secure contains 660 views
landing_identity_secure contains 4 views
landing_orex_secure contains 16 views
landing_orion_migration_secure contains 51 views
landing_orion_secure contains 3 views
landing_pace_secure contains 1 views
landing_payments_secure contains 23 views
product_andromeda_secure contains 197 views
product_identity_secure contains 1 views
product_orion contains 31 views
product_orion_secure contains 100 views
product_payments_secure contains 1 views
raw_andromeda_secure contains 418 views
raw_identity_secure contains 2 views
raw_orex_secure contains 11 views
raw_orion_migration_secure contains 13 views
raw_orion_secure contains 1 views
raw_pace_secure contains 1 views
raw_payments_secure contains 5 views

data-engineering-prod contains 20 datasets


In [55]:
# Collect the full table resource for every table/view in every dataset.
# The results accumulate in `views`, which the later cells read from.
print("Collecting data on all views in " + project_id + ". This may take some time...\n")
views = []

for dataset in client.list_datasets():
    print("Collecting views for: " + dataset.dataset_id)

    # The dataset reference is identical for every table in this dataset, so
    # build it once instead of once per table.
    dataset_ref = bigquery.DatasetReference(project_id, dataset.dataset_id)

    for view in client.list_tables(dataset=dataset.reference):
        view_ref = dataset_ref.table(view.table_id)
        try:
            # get_table fetches the complete resource (query text, size,
            # timestamps, ...) that the export cells below rely on.
            views.append(client.get_table(view_ref))
        except Exception as err:
            # Best effort: a single unreadable table should not abort the whole
            # scan, but say which one was skipped so the gap is visible.
            # (`except Exception as err` is valid on Python 2.6+ and Python 3;
            # the older `except Exception, err` form is a SyntaxError on 3.)
            print("Skipping {}.{}: {}".format(dataset.dataset_id, view.table_id, err))

print("\nDone!")

Collecting data on all views in data-engineering-prod. This may take some time...

Collecting views for: landing_andromeda
Collecting views for: landing_andromeda_secure
Collecting views for: landing_identity_secure
Collecting views for: landing_orex_secure
Collecting views for: landing_orion_migration_secure
Collecting views for: landing_orion_secure
Collecting views for: landing_pace_secure
Collecting views for: landing_payments_secure
Collecting views for: product_andromeda_secure
Collecting views for: product_identity_secure
Collecting views for: product_orion
Collecting views for: product_orion_secure
Collecting views for: product_payments_secure
Collecting views for: raw_andromeda_secure
Collecting views for: raw_identity_secure
Collecting views for: raw_orex_secure
Collecting views for: raw_orion_migration_secure
Collecting views for: raw_orion_secure
Collecting views for: raw_pace_secure
Collecting views for: raw_payments_secure

Done!


In [56]:
# Show one collected entry as a sanity check: its fully qualified id, the SQL
# behind it, and whether it is a table or a view.
print("Example table/view and query:")
example_view = views[0]
print("\nView: {}".format(example_view.full_table_id))
print("\nView Query:\n{}".format(example_view.view_query))
print("\nType: {}".format(example_view.table_type))

Example table/view and query:

View: data-engineering-prod:landing_andromeda.energy_contracts_consumption_charge_generated_v2

View Query:
#standardSQL
SELECT
  `kafkaData`,
  `unionRecord`,
  `_PARTITIONTIME` `PARTITIONTIME`
FROM
  `data-engineering-prod.auto_capture_v2.energy_contracts_consumption_charge_generated_v2`

Type: VIEW


In [58]:
# Flatten each collected table/view into a plain list of the fields we want to
# export. The order here must match the column names given to the DataFrame.
timestamp_format = '%Y-%m-%d %H:%M:%S'

data = []
for view in views:
    last_modified = view.modified.strftime(timestamp_format)
    created = view.created.strftime(timestamp_format)
    data.append([
        view.dataset_id,
        view.table_id,
        view.table_type,
        view.view_query,
        view.location,
        view.num_bytes,
        last_modified,
        created,
        view.self_link,
    ])

In [63]:
# Write the collected view metadata to a newline-delimited JSON file.
json_file = "data_engineering_prod_bq_analysis.json"
columns = ["dataset","table","type","query","location","bytes","last_modified","created","link"]
df = pd.DataFrame(data, columns=columns)

# BigQuery requires newline-delimited JSON: one object per line, so each row is
# serialised on its own and followed by a line break.
# The `with` block guarantees the file is flushed and closed even if
# serialising a row raises, and avoids shadowing the name `file`.
with open(json_file, "w") as json_out:
    for _, row in df.iterrows():
        row.to_json(json_out)
        json_out.write("\n")

df

Unnamed: 0,dataset,table,type,query,location,bytes,last_modified,created,link
0,landing_andromeda,energy_contracts_consumption_charge_generated_v2,VIEW,"#standardSQL\nSELECT\n `kafkaData`,\n `union...",EU,0,2018-10-01 23:45:33,2018-08-16 16:48:02,https://www.googleapis.com/bigquery/v2/project...
1,landing_andromeda,energy_contracts_consumption_charge_generated_v3,VIEW,"#standardSQL\nSELECT\n `kafkaData`,\n `union...",EU,0,2018-10-01 23:45:36,2018-08-16 16:48:08,https://www.googleapis.com/bigquery/v2/project...
2,landing_andromeda,energy_contracts_consumption_charge_generated_v6,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,0,2019-09-11 12:31:46,2019-09-11 12:26:49,https://www.googleapis.com/bigquery/v2/project...
3,landing_andromeda,energy_contracts_consumption_charge_generated_v7,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,0,2019-11-25 10:53:00,2019-11-25 10:53:00,https://www.googleapis.com/bigquery/v2/project...
4,landing_andromeda,energy_contracts_consumption_charge_generated_v8,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,0,2019-12-05 15:58:11,2019-12-05 15:58:11,https://www.googleapis.com/bigquery/v2/project...
5,landing_andromeda,energy_contracts_contract_created_v1,VIEW,"#standardSQL\nSELECT\n `metadata`,\n `mpxn`,...",EU,0,2018-10-01 23:45:42,2018-08-16 16:48:18,https://www.googleapis.com/bigquery/v2/project...
6,landing_andromeda,energy_contracts_contract_created_v2,VIEW,"#standardSQL\nSELECT\n `metadata`,\n `mpxn`,...",EU,0,2018-10-01 23:45:45,2018-08-16 16:48:23,https://www.googleapis.com/bigquery/v2/project...
7,landing_andromeda,energy_contracts_contract_v1,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,0,2018-10-01 23:45:48,2018-08-16 16:48:28,https://www.googleapis.com/bigquery/v2/project...
8,landing_andromeda,energy_contracts_contract_v2,VIEW,"#standardSQL\nSELECT\n `unionRecord`,\n `kaf...",EU,0,2018-10-02 09:53:29,2018-10-02 09:53:29,https://www.googleapis.com/bigquery/v2/project...
9,landing_andromeda,energy_contracts_electricity_consumption_charg...,VIEW,"#standardSQL\nSELECT\n `metadata`,\n `supply...",EU,0,2018-10-01 23:45:51,2018-08-16 16:48:33,https://www.googleapis.com/bigquery/v2/project...


In [60]:
from google.cloud import storage

# Upload the newline-delimited JSON file to a Cloud Storage bucket so that
# BigQuery can load it from a gs:// URI.
# A dedicated name is used for the storage client: rebinding `client` (the
# BigQuery client used by the cells above) to a different kind of object would
# make those cells fail if they were re-run afterwards.
storage_client = storage.Client(project=project_id)
bucket = storage_client.get_bucket("data-engineering-prod-bq-analysis")

# The object is named after the local file, and the same `json_file` variable
# is used to open it, so the two can never drift apart.
blob = bucket.blob(json_file)

with open(json_file, 'rb') as json_in:
    blob.upload_from_file(json_in)

In [61]:
# Publish the analysis to a table in data-integration-prod, replacing whatever
# the table held before.
# NOTE: this cell rebinds `project_id` and `client` to the destination project;
# re-run the setup cell before re-running any of the cells above.
project_id = 'data-integration-prod'
dataset_id = 'data_engineering_prod_bq_analysis'
table_id = 'data_engineering_prod_bq_analysis'
full_table_id = project_id + '.' + dataset_id + '.' + table_id
client = bigquery.Client(project=project_id)

# Replace the table contents as part of the load job itself (WRITE_TRUNCATE)
# rather than deleting the table first: with delete-then-load, a failed load
# would leave no table at all, whereas a failed truncating load leaves the
# previous contents in place.
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
uri = "gs://data-engineering-prod-bq-analysis/data_engineering_prod_bq_analysis.json"
load_job = client.load_table_from_uri(
    uri, full_table_id, job_config=job_config)  # Make an API request.
load_job.result()  # Wait for the load to finish; raises if the job failed.

# Check that the number of rows loaded matches the number of views collected.
destination_table = client.get_table(full_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
assert len(views) == destination_table.num_rows, (
    "Row count mismatch: collected {} views but table has {} rows".format(
        len(views), destination_table.num_rows))

Loaded 1646 rows.
