In [7]:
from google.cloud import bigquery

# Define your projects and datasets
project_dataset_1 = 'umt-msba.wedge_transactions.transArchive_*'
project_dataset_2 = 'adamh-wedge-project.wedge_transactions.transArchive_*'

# Extract project IDs
project_id_1 = project_dataset_1.split('.')[0]
project_id_2 = project_dataset_2.split('.')[0]

# Initialize two BigQuery clients with different credentials
client_1 = bigquery.Client.from_service_account_json('service_account.json', project=project_id_1)
client_2 = bigquery.Client.from_service_account_json('adamh-wedge-project-6f59b14d0763.json', project=project_id_2)

In [8]:
queries = [
    # Total Rows
    "SELECT count(*) as cnt FROM `{}`",

    # January/Oct 2012 Rows
    """
    SELECT EXTRACT(Year from datetime) as Yr,
           EXTRACT(Month from datetime) as Mo,
           count(*)
    FROM `{}`
    GROUP BY Yr, Mo
    HAVING Yr = 2012 
    ORDER BY Yr, Mo
    """,

    # Rows by Month
    """
    SELECT EXTRACT(Month from datetime) as Mo,
           count(*) as cnt
    FROM `{}`
    WHERE EXTRACT(Year from datetime) <= 2016
    GROUP BY Mo
    ORDER BY cnt asc
    """,

        # Rows by Month
    """
    SELECT EXTRACT(Month from datetime) as Mo,
           count(*) as cnt
    FROM `{}`
    WHERE EXTRACT(Year from datetime) <= 2016
    GROUP BY Mo
    ORDER BY cnt desc
    """,

    # Null Counts
    """
    SELECT COUNTIF(trans_subtype is NULL) as Null_TS,
           COUNTIF(datetime is NULL) as Null_DT,
           COUNTIF(local IS NULL) as Null_Local,
           COUNTIF(card_no IS NULL) as Null_CN
    FROM `{}`
    """,

    # High volume cards
    """
    SELECT card_no, 
           COUNT(*) as cnt
    FROM `{}`
    WHERE card_no != 3
    GROUP BY card_no
    ORDER BY cnt desc
    LIMIT 10
    """,

    # 18736 Rows
    """
    SELECT card_no, 
           COUNT(*) as cnt
    FROM `{}`
    WHERE card_no = 18736
    GROUP BY card_no
    """,

    # Popular Products
    """
    SELECT LOWER(Description), 
           COUNT(*) as cnt
    FROM `{}`
    WHERE trans_type = "I" AND
          Description != "Discount"
    GROUP BY LOWER(Description)
    ORDER BY cnt desc
    LIMIT 10
    """,

    # Single-record items
    """
    SELECT 
        COUNT(DISTINCT Description) as SingleRecordItems
    FROM (
        SELECT LOWER(Description) as Description,
               COUNT(*) as cnt
        FROM `{}`
        WHERE trans_type = "I" AND Description != "Discount"
        GROUP BY Description
        HAVING cnt = 1
    )
    """,

    # Owner Fractions by Year
    """
    SELECT
      Y,
      OwnerRows,
      NonOwnerRows,
      ROUND(OwnerRows/(OwnerRows+NonOwnerRows),4) AS OwnerFrac
    FROM (
      SELECT
        EXTRACT(Year FROM datetime) AS Y,
        COUNTIF(card_no != 3) AS OwnerRows,
        COUNTIF(card_no = 3) AS NonOwnerRows
      FROM `{}`
      GROUP BY Y
    )
    ORDER BY Y
    """
]

In [9]:
import pandas as pd
import numpy as np


# Run each query for the first project and dataset and store the results in a DataFrame
results_1 = []
for query in queries:
    query = query.format(project_dataset_1)
    query_job = client_1.query(query)
    results = query_job.result()
    results_1.append(pd.DataFrame([dict(row) for row in results]))

# Run each query for the second project and dataset and store the results in a DataFrame
results_2 = []
for query in queries:
    query = query.format(project_dataset_2)
    query_job = client_2.query(query)
    results = query_job.result()
    results_2.append(pd.DataFrame([dict(row) for row in results]))

# Create a list to store the final results
final_results = []

# Loop over the results
for r1, r2 in zip(results_1, results_2):
    # Only attempt comparison on numerical columns
    numerical_columns = r1.select_dtypes(include=[np.number]).columns.tolist()
    
    # Create an empty DataFrame for comparison
    comparison = pd.DataFrame()
    
    for column in numerical_columns:
        # Add a new column for comparison: (query 1 - query 2) / query 1
        comparison[column + '_comparison'] = (r1[column] - r2[column]) / r1[column]
    
    # Concatenate the results from both projects and the comparison into a single DataFrame
    final_result = pd.concat([r1.add_suffix('_1'), r2.add_suffix('_2'), comparison], axis=1)
    
    # Append the final result to the list
    final_results.append(final_result)

# Now, final_results contains a DataFrame for each query with side by side results and the comparison column
# You can display them one by one
for result in final_results:
    display(result)


Unnamed: 0,cnt_1,cnt_2,cnt_comparison
0,85760139,85760124,1.749064e-07


Unnamed: 0,Yr_1,Mo_1,f0__1,Yr_2,Mo_2,f0__2,Yr_comparison,Mo_comparison,f0__comparison
0,2012,1,1070907,2012,1,1070907,0.0,0.0,0.0
1,2012,2,1034261,2012,2,1034261,0.0,0.0,0.0
2,2012,3,1130248,2012,3,1130248,0.0,0.0,0.0
3,2012,4,1135000,2012,4,1135000,0.0,0.0,0.0
4,2012,5,1132624,2012,5,1132624,0.0,0.0,0.0
5,2012,6,1053912,2012,6,1053912,0.0,0.0,0.0
6,2012,7,1042457,2012,7,1042457,0.0,0.0,0.0
7,2012,8,1029592,2012,8,1029592,0.0,0.0,0.0
8,2012,9,1044436,2012,9,1044436,0.0,0.0,0.0
9,2012,10,1042287,2012,10,1042287,0.0,0.0,0.0


Unnamed: 0,Mo_1,cnt_1,Mo_2,cnt_2,Mo_comparison,cnt_comparison
0,2,6556770,2,6556769,0.0,1.525141e-07
1,12,6740733,12,6740731,0.0,2.967036e-07
2,11,6955365,11,6955363,0.0,2.875478e-07
3,9,6975855,9,6975854,0.0,1.433516e-07
4,8,7008748,8,7008747,0.0,1.426788e-07
5,6,7012157,6,7012156,0.0,1.426095e-07
6,7,7013013,7,7013012,0.0,1.425921e-07
7,1,7056762,1,7056761,0.0,1.417081e-07
8,10,7085752,10,7085751,0.0,1.411283e-07
9,4,7417388,4,7417387,0.0,1.348183e-07


Unnamed: 0,Mo_1,cnt_1,Mo_2,cnt_2,Mo_comparison,cnt_comparison
0,5,7578372,5,7578371,0.0,1.319545e-07
1,3,7422483,3,7422482,0.0,1.347258e-07
2,4,7417388,4,7417387,0.0,1.348183e-07
3,10,7085752,10,7085751,0.0,1.411283e-07
4,1,7056762,1,7056761,0.0,1.417081e-07
5,7,7013013,7,7013012,0.0,1.425921e-07
6,6,7012157,6,7012156,0.0,1.426095e-07
7,8,7008748,8,7008747,0.0,1.426788e-07
8,9,6975855,9,6975854,0.0,1.433516e-07
9,11,6955365,11,6955363,0.0,2.875478e-07


Unnamed: 0,Null_TS_1,Null_DT_1,Null_Local_1,Null_CN_1,Null_TS_2,Null_DT_2,Null_Local_2,Null_CN_2,Null_TS_comparison,Null_DT_comparison,Null_Local_comparison,Null_CN_comparison
0,7123792,0,234843,0,7123776,0,234839,0,2e-06,,1.7e-05,


Unnamed: 0,card_no_1,cnt_1,card_no_2,cnt_2,card_no_comparison,cnt_comparison
0,12539.0,2161167,12539.0,2161161,0.0,2.776278e-06
1,10499.0,1013697,10499.0,1013696,0.0,9.864881e-07
2,14140.0,942230,14140.0,942226,0.0,4.245248e-06
3,20074.0,643460,20074.0,643454,0.0,9.324589e-06
4,14987.0,460630,14987.0,460625,0.0,1.08547e-05
5,10504.0,164864,10504.0,164864,0.0,0.0
6,21517.0,133006,21517.0,133006,0.0,0.0
7,23170.0,99276,23170.0,99276,0.0,0.0
8,19750.0,65876,19750.0,65875,0.0,1.518004e-05
9,15876.0,59053,15876.0,59052,0.0,1.693394e-05


Unnamed: 0,card_no_1,cnt_1,card_no_2,cnt_2,card_no_comparison,cnt_comparison
0,18736.0,12153,18736.0,12153,0.0,0.0


Unnamed: 0,f0__1,cnt_1,f0__2,cnt_2,cnt_comparison
0,banana organic,908639,banana organic,908637,2e-06
1,green patch redemption,572473,green patch redemption,572472,2e-06
2,wedge cookie,510140,wedge cookie,510140,0.0
3,avocado hass organic,456771,avocado hass organic,456771,0.0
4,broccoli organic,344657,broccoli organic,344657,0.0
5,celery organic,254479,celery organic,254479,0.0
6,citrus lemon organic,246058,citrus lemon organic,246058,0.0
7,salad mix organic,225088,salad mix organic,225088,0.0
8,pepper bell red organic,224162,pepper bell red organic,224162,0.0
9,spinach bulk organic,209826,spinach bulk organic,209826,0.0


Unnamed: 0,SingleRecordItems_1,SingleRecordItems_2,SingleRecordItems_comparison
0,2769,2741,0.010112


Unnamed: 0,Y_1,OwnerRows_1,NonOwnerRows_1,OwnerFrac_1,Y_2,OwnerRows_2,NonOwnerRows_2,OwnerFrac_2,Y_comparison,OwnerRows_comparison,NonOwnerRows_comparison,OwnerFrac_comparison
0,2010,9005825,3128483,0.7422,2010,9005825,3128483,0.7422,0.0,0.0,0.0,0.0
1,2011,8980792,3200841,0.7372,2011,8980792,3200841,0.7372,0.0,0.0,0.0,0.0
2,2012,9442984,3287078,0.7418,2012,9442984,3287078,0.7418,0.0,0.0,0.0,0.0
3,2013,9214660,3103828,0.748,2013,9214660,3103828,0.748,0.0,0.0,0.0,0.0
4,2014,9237995,2932261,0.7591,2014,9237995,2932261,0.7591,0.0,0.0,0.0,0.0
5,2015,9217601,3183007,0.7433,2015,9217692,3182914,0.7433,0.0,-9.872417e-06,2.9e-05,0.0
6,2016,8166872,2721171,0.7501,2016,8166864,2721167,0.7501,0.0,9.795672e-07,1e-06,0.0
7,2017,703766,232975,0.7513,2017,703765,232975,0.7513,0.0,1.420927e-06,0.0,0.0
