In [31]:
import pandas as pd
import google.auth
from google.auth import compute_engine
from google.cloud import bigquery
from google.colab import auth
import google.cloud.bigquery.job as job
from datetime import datetime, timedelta
from collections import Counter
from  __builtin__ import any as b_any
from google.cloud import storage
from timeit import default_timer as timer

# Create the BQ client. This will ask you to log in the first time.
auth.authenticate_user()

# Download list of kafka topics to see if any objects are orphaned.
table = bigquery.TableReference.from_string("data-integration-prod.data_engineering_prod_bq_analysis.orion_kafka_topics")
client = bigquery.Client(project="data-integration-prod")
dataframe = client.list_rows(table).to_dataframe()
kafka_topics = dataframe.values.tolist()
# Materialise the first column of every row as a real list. A bare map() is a
# one-shot iterator on Python 3: the first `in` test would drain it and every
# later object would be reported as having no topic.
topics_only = [t[0] for t in kafka_topics]
# Set copy used for the per-object membership test below (no linear scan per object).
topic_lookup = set(topics_only)

# list datasets and number of views/tables
print("Collecting data on all objects. This may take some time...")
# Both accumulators are reset here so re-running the cell does not append duplicates.
data = []          # one row per object: [full_table_id, dataset_id, layer, type, created, kafka_topic_exists]
dataset_info = []  # one row per dataset: [dataset_id, number of objects listed]
dataset_count = 0

# orion teams
orion_datasets = ["orion","orex","payment","identity","comms","flows","sre","auto_capture",
                  "tardis","bast","smint","payments","jaws","onboarding","pace"]

project_id = 'data-engineering-prod'
client = bigquery.Client(project=project_id)

# Keep only datasets whose id contains one of the orion names and does not
# start with the 'u_' prefix.
matching_datasets = [
  candidate for candidate in client.list_datasets()
  if any(name in candidate.dataset_id for name in orion_datasets)
  and not candidate.dataset_id.startswith('u_')
]

for dataset in matching_datasets:
  dataset_views = list(client.list_tables(dataset=dataset.reference))
  dataset_count += 1
  for view in dataset_views:
    # if the dataset is auto_capture* only the object name will indicate if for orion
    if "auto_capture" in view.dataset_id and not any(name in view.table_id for name in orion_datasets):
      continue
    # check if the object aligns with a kafka topic
    kafka_topic_exists = view.table_id in topic_lookup
    # store the information; the "layer" column is the dataset id up to its first underscore
    data.append([
      view.full_table_id,
      view.dataset_id,
      view.dataset_id.split('_')[0],
      view.table_type,
      view.created.strftime('%Y-%m-%d %H:%M:%S'),
      kafka_topic_exists,
    ])

  # NOTE(review): this count is len(dataset_views), so for auto_capture* datasets
  # it also includes the objects skipped above — confirm that is intended.
  print(dataset.dataset_id + " contains {:d} views".format(len(dataset_views)))
  dataset_info.append([dataset.dataset_id, len(dataset_views)])

print("\n" + project_id + " contains {:d} dataset".format(dataset_count))
print("\nDone!")

In [33]:
from google.cloud import storage

# UPLOAD VIEWS
# Writes the per-object rows collected in `data` to a newline delimited json
# file, uploads that file to a gc bucket, and reloads the BigQuery table from it.
json_file = "data_engineering_prod_bq_analysis.json"
columns = ["full_table_id","dataset_id","layer","type","created","kafka_topic_exists"] #"query","location","multi_regional","bytes","created", "last_modified"]
df = pd.DataFrame(data, columns=columns)
# bq requires newline delimited json so append line break
# `with` guarantees the handle is closed even if serialising a row fails.
with open(json_file, "w") as json_out:
  for _, record in df.iterrows():
    record.to_json(json_out)
    json_out.write("\n")

# write the results to a gc bucket
# The project is named explicitly instead of reading `project_id`: this cell
# reassigns `project_id` further down, so a re-run would otherwise build the
# storage client with a different project than a fresh run does.
bucket_name = "data-engineering-prod-bq-analysis"
client = storage.Client(project='data-engineering-prod')
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(json_file)

# Upload the file that was just written (same name, no second literal to drift).
with open(json_file, 'rb') as json_in:
  blob.upload_from_file(json_in)

# write data to data-integration-prod
# to update the results we will wipe the existing table and replace it
project_id = 'data-integration-prod'
dataset_id = 'data_engineering_prod_bq_analysis'
table_id = 'data_engineering_prod_bq_analysis'
full_table_id = project_id + '.' + dataset_id + '.' + table_id
client = bigquery.Client(project=project_id)

# remove existing table
client.delete_table(full_table_id, not_found_ok=True)

# write new table
job_config = bigquery.LoadJobConfig(autodetect=True,
            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
# Derived from the bucket and file name used for the upload above.
uri = "gs://{}/{}".format(bucket_name, json_file)
load_job = client.load_table_from_uri(
    uri, full_table_id, job_config=job_config)  # Make an API request.
load_job.result()  # block until the load job has finished

# check the number of rows loaded into the table is correct
destination_table = client.get_table(full_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
assert len(data) == destination_table.num_rows, \
    "expected {} rows in {}, found {}".format(len(data), full_table_id, destination_table.num_rows)

In [36]:
from google.cloud import storage

# UPLOAD DATASET INFO
# Writes the per-dataset rows collected in `dataset_info` to a newline delimited
# json file, uploads that file to a gc bucket, and reloads the BigQuery table from it.
json_file = "data_engineering_prod_bq_dataset_analysis.json"
columns = ["dataset_id","number_of_objects"]
df = pd.DataFrame(dataset_info, columns=columns)
# bq requires newline delimited json so append line break
# `with` guarantees the handle is closed even if serialising a row fails.
with open(json_file, "w") as json_out:
  for _, record in df.iterrows():
    record.to_json(json_out)
    json_out.write("\n")

# write the results to a gc bucket
# The project is named explicitly instead of reading `project_id`, whose value
# at this point depends on which cells were executed before this one.
# NOTE(review): assumes the bucket belongs to data-engineering-prod, as its
# name suggests — confirm.
bucket_name = "data-engineering-prod-bq-analysis"
client = storage.Client(project='data-engineering-prod')
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(json_file)

# Upload the file that was just written (same name, no second literal to drift).
with open(json_file, 'rb') as json_in:
  blob.upload_from_file(json_in)

# write data to data-integration-prod
# to update the results we will wipe the existing table and replace it
project_id = 'data-integration-prod'
dataset_id = 'data_engineering_prod_bq_analysis'
table_id = 'data_engineering_prod_bq_dataset_analysis'
full_table_id = project_id + '.' + dataset_id + '.' + table_id
client = bigquery.Client(project=project_id)

# remove existing table
client.delete_table(full_table_id, not_found_ok=True)

# write new table
job_config = bigquery.LoadJobConfig(autodetect=True,
            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
# Derived from the bucket and file name used for the upload above.
uri = "gs://{}/{}".format(bucket_name, json_file)
load_job = client.load_table_from_uri(
    uri, full_table_id, job_config=job_config)  # Make an API request.
load_job.result()  # block until the load job has finished

# check the number of rows loaded into the table is correct
destination_table = client.get_table(full_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))
assert len(dataset_info) == destination_table.num_rows, \
    "expected {} rows in {}, found {}".format(len(dataset_info), full_table_id, destination_table.num_rows)

Loaded 281 rows.
