In [1]:
#### import global modules
import os
import sys
import pandas as pd
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery
import numpy as np
# Set global vars
# Project root = everything in the working directory before the first 'notebooks'
# substring. NOTE(review): if the working directory does not contain 'notebooks',
# this resolves to the working directory itself — confirm that is acceptable.
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_queries = pth_project / 'core' / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Put the project root first on sys.path so the `core` package imports below resolve.
sys.path.insert(0, str(pth_project))
# Read the config inside a context manager so the file handle is closed right away.
with pth_creds.open() as config_file:
    d_project_config = safe_load(config_file)
# d_params = safe_load((pth_project / 'core' / 'parameters' / 'common.yaml').open())['data_extract']

# import local modules
from core.utils.gcp import connect_bq_services
# from core.etl.extract import extract_bq_data, extract_pr_codes, format_conv_df, filter_convs

# Build the BigQuery client for the project named under 'gcp-project-name' in the config
bq_client = connect_bq_services(d_project_config['gcp-project-name'])
# Let pandas render up to 100 rows before truncating displayed frames
pd.set_option('display.max_rows', 100)

In [2]:
def extract_bq_data(bq_client, sql=None, pth_query=None):
    """Run a query through ``bq_client`` and return the result as a DataFrame.

    Parameters
    ----------
    bq_client
        Object exposing ``query(sql)`` whose return value has ``to_dataframe()``.
    sql : str, optional
        Query text. Takes precedence over ``pth_query`` when both are given.
    pth_query : str or os.PathLike, optional
        Path of a file holding the query text; only read when ``sql`` is None.

    Returns
    -------
    The value returned by ``to_dataframe()`` for the executed query.

    Raises
    ------
    ValueError
        If neither ``sql`` nor ``pth_query`` is provided.
    """
    if sql is None:
        if pth_query is None:
            raise ValueError('`sql` or `pth_query` should be set')
        # Wrapping in Path lets callers pass a plain string path as well as a Path.
        sql = Path(pth_query).read_text()
    return bq_client.query(sql).to_dataframe()

In [3]:
# BigQuery SQL: one row per (customer_id, best_practices_id) pair. QUALIFY keeps
# only the row with the most recent last_updt_ts within each pair, and the result
# is ordered by customer_id, best_practices_id. The create_ts date filter inside
# the query is commented out, so no date cut-off is applied.
Best_Practices_Query='''

select customer_id,best_practices_id,best_practices_ind,date(dt_last_calculate_utc) as last_date_calculate
from `cio-datahub-enterprise-pr-183a.src_adc.bq_customer_best_practice`
-- where DATE(create_ts)<='2022-08-31'
QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id,best_practices_id ORDER BY last_updt_ts DESC) = 1
order by customer_id,best_practices_id


'''

In [4]:
# Execute the best-practices query and hold the result in long format
# (one row per customer / best-practice pair).
Best_Practices_DF = extract_bq_data(
    bq_client=bq_client,
    sql=Best_Practices_Query,
)

In [5]:
Best_Practices_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672642 entries, 0 to 3672641
Data columns (total 4 columns):
 #   Column               Dtype  
---  ------               -----  
 0   customer_id          Int64  
 1   best_practices_id    Int64  
 2   best_practices_ind   boolean
 3   last_date_calculate  dbdate 
dtypes: Int64(2), boolean(1), dbdate(1)
memory usage: 98.1 MB


In [6]:
Best_Practices_DF.head()

Unnamed: 0,customer_id,best_practices_id,best_practices_ind,last_date_calculate
0,287850,1,True,2022-08-14
1,287850,2,False,2022-08-14
2,287850,3,False,2022-08-14
3,287850,4,False,2022-08-14
4,287850,5,True,2022-08-14


In [7]:
# Best_Practices_DF['customer_id'].value_counts()

In [8]:
# Best_Practices_DF_True=Best_Practices_DF[Best_Practices_DF['best_practices_ind']==True]

In [9]:
# Best_Practices_DF_True['customer_id'].value_counts()

In [10]:
# Vectorised 0/1 encoding of the nullable boolean indicator. Missing indicators are
# counted as 0 (not met); a per-row `1 if x == True else 0` raises TypeError on
# pd.NA because `pd.NA == True` has no truth value, and is slow on millions of rows.
Best_Practices_DF['Best_practices_flag'] = (
    Best_Practices_DF['best_practices_ind'].fillna(False).astype('int64')
)

In [11]:
Best_Practices_DF.head()

Unnamed: 0,customer_id,best_practices_id,best_practices_ind,last_date_calculate,Best_practices_flag
0,287850,1,True,2022-08-14,1
1,287850,2,False,2022-08-14,0
2,287850,3,False,2022-08-14,0
3,287850,4,False,2022-08-14,0
4,287850,5,True,2022-08-14,1


In [12]:
# Long -> wide: one row per customer_id, one column per best_practices_id,
# cells holding the 0/1 Best_practices_flag. reset_index turns customer_id
# back into an ordinary column.
Best_Practices_DF_wide = (
    Best_Practices_DF
    .pivot(
        index='customer_id',
        columns='best_practices_id',
        values='Best_practices_flag',
    )
    .reset_index()
)


In [13]:
Best_Practices_DF_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612107 entries, 0 to 612106
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   customer_id  612107 non-null  Int64
 1   1            612107 non-null  int64
 2   2            612107 non-null  int64
 3   3            612107 non-null  int64
 4   4            612107 non-null  int64
 5   5            612107 non-null  int64
 6   6            612107 non-null  int64
dtypes: Int64(1), int64(6)
memory usage: 33.3 MB


In [14]:
# Name each pivoted column after the best-practice id it actually holds instead of
# assigning a fixed positional list, so a missing or extra id cannot shift labels.
# String-named columns ('customer_id', or names already prefixed on a re-run) are
# left untouched. The 'Best_partices_' spelling is kept on purpose: these names are
# the column names of the table loaded to BigQuery at the end of the notebook.
Best_Practices_DF_wide.columns = [
    col if isinstance(col, str) else f'Best_partices_{col}'
    for col in Best_Practices_DF_wide.columns
]

In [15]:
Best_Practices_DF_wide.head()

Unnamed: 0,customer_id,Best_partices_1,Best_partices_2,Best_partices_3,Best_partices_4,Best_partices_5,Best_partices_6
0,287850,1,0,0,0,1,0
1,292877,1,1,0,0,0,0
2,299100,1,1,0,0,1,1
3,355703,1,1,0,1,1,1
4,359052,1,1,0,0,1,1


In [16]:
Best_Practices_DF_wide['Best_partices_1'].value_counts()

1    519533
0     92574
Name: Best_partices_1, dtype: int64

In [17]:
# Replace any missing cell in the wide frame with 0.
Best_Practices_DF_wide = Best_Practices_DF_wide.fillna(0)

In [18]:
Best_Practices_DF_wide['Best_partices_1'].value_counts()

1    519533
0     92574
Name: Best_partices_1, dtype: int64

In [19]:
Best_Practices_DF_wide['Best_partices_3'].value_counts()

0    425436
1    186671
Name: Best_partices_3, dtype: int64

In [20]:
Best_Practices_DF_wide.head()

Unnamed: 0,customer_id,Best_partices_1,Best_partices_2,Best_partices_3,Best_partices_4,Best_partices_5,Best_partices_6
0,287850,1,0,0,0,1,0
1,292877,1,1,0,0,0,0
2,299100,1,1,0,0,1,1
3,355703,1,1,0,1,1,1
4,359052,1,1,0,0,1,1


In [21]:
# Per-customer total across the six best-practice flag columns.
cols_to_sum = [f'Best_partices_{idx}' for idx in range(1, 7)]
Best_Practices_DF_wide['Best_Practice_All'] = (
    Best_Practices_DF_wide.loc[:, cols_to_sum].sum(axis='columns')
)

In [22]:
Best_Practices_DF_wide['Best_Practice_All'].value_counts(normalize=True)*100

4    28.229215
5    21.841606
3    15.352054
2    11.631463
0     9.583292
6     6.742612
1     6.619758
Name: Best_Practice_All, dtype: float64

In [23]:
# Order customers from the highest to the lowest Best_Practice_All total.
Best_Practices_DF_wide = Best_Practices_DF_wide.sort_values(
    by='Best_Practice_All',
    ascending=False,
)

In [24]:
Best_Practices_DF_wide.head()

Unnamed: 0,customer_id,Best_partices_1,Best_partices_2,Best_partices_3,Best_partices_4,Best_partices_5,Best_partices_6,Best_Practice_All
226310,9733900,1,1,1,1,1,1,6
70328,6732112,1,1,1,1,1,1,6
555897,13867410,1,1,1,1,1,1,6
555895,13867396,1,1,1,1,1,1,6
281920,10486393,1,1,1,1,1,1,6


In [25]:
# 1 when the customer's total equals the number of summed flag columns (i.e. every
# best practice is met), else 0. Comparing against len(cols_to_sum) avoids a
# hard-coded 6, and the vectorised comparison replaces a per-row Python lambda.
Best_Practices_DF_wide['Best_Practice_All_flag'] = (
    Best_Practices_DF_wide['Best_Practice_All'].eq(len(cols_to_sum)).astype('int64')
)

In [26]:
Best_Practices_DF_wide.head()

Unnamed: 0,customer_id,Best_partices_1,Best_partices_2,Best_partices_3,Best_partices_4,Best_partices_5,Best_partices_6,Best_Practice_All,Best_Practice_All_flag
226310,9733900,1,1,1,1,1,1,6,1
70328,6732112,1,1,1,1,1,1,6,1
555897,13867410,1,1,1,1,1,1,6,1
555895,13867396,1,1,1,1,1,1,6,1
281920,10486393,1,1,1,1,1,1,6,1


In [27]:
Best_Practices_DF_wide.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 612107 entries, 226310 to 612106
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype
---  ------                  --------------   -----
 0   customer_id             612107 non-null  Int64
 1   Best_partices_1         612107 non-null  int64
 2   Best_partices_2         612107 non-null  int64
 3   Best_partices_3         612107 non-null  int64
 4   Best_partices_4         612107 non-null  int64
 5   Best_partices_5         612107 non-null  int64
 6   Best_partices_6         612107 non-null  int64
 7   Best_Practice_All       612107 non-null  int64
 8   Best_Practice_All_flag  612107 non-null  int64
dtypes: Int64(1), int64(8)
memory usage: 47.3 MB


In [28]:
# Load the wide per-customer frame into BigQuery, replacing the table contents
# (WRITE_TRUNCATE).
config = bigquery.job.LoadJobConfig()

# config._properties['timePartitioning'] = {'field': 'Month_Year'}
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

Table_BQ = 'ADC_updated.Best_Practices_Customers_latest'

# NOTE(review): assuming bq_client is a google.cloud.bigquery Client, this call only
# starts a load job and returns the job object (not a table). result() blocks until
# the job finishes and raises if it failed, so a broken load cannot pass unnoticed.
bq_table_instance = bq_client.load_table_from_dataframe(
    Best_Practices_DF_wide,
    Table_BQ,
    job_config=config,
)
bq_table_instance.result()