In [None]:
#### import global modules
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery
import gc

# Set global vars
# Project root is whatever precedes the first 'notebooks' substring of the working directory.
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_queries = pth_project / 'core' / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Put the project root first on sys.path so the `core` package below can be imported.
sys.path.insert(0, str(pth_project))
# Read the project config inside a context manager so the file handle is closed after parsing.
with pth_creds.open() as config_file:
    d_project_config = safe_load(config_file)
# d_params = safe_load((pth_project / 'core' / 'parameters' / 'common.yaml').open())['data_extract']

# import local modules (must come after the sys.path change above)
from core.utils.gcp import connect_bq_services
# from core.etl.extract import extract_bq_data, extract_pr_codes, format_conv_df, filter_convs

# Connect to google services
bq_client = connect_bq_services(d_project_config['gcp-project-name'])
# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100

In [None]:
def extract_bq_data(bq_client, sql=None, pth_query=None):
    """Run a query through ``bq_client`` and return the result as a DataFrame.

    Args:
        bq_client: Object exposing ``query(sql)`` whose return value has a
            ``to_dataframe()`` method (e.g. the client created in the setup cell).
        sql: Query text. When given it takes precedence over ``pth_query``.
        pth_query: Location of a file holding the query text. Either an object
            with a ``read_text()`` method (such as ``pathlib.Path``) or a plain
            ``str`` path. Only used when ``sql`` is None.

    Returns:
        Whatever ``bq_client.query(sql).to_dataframe()`` returns.

    Raises:
        ValueError: If neither ``sql`` nor ``pth_query`` is provided.
    """
    if sql is None:
        if pth_query is None:
            raise ValueError('`sql` or `pth_query` should be set')
        # Accept plain string paths as well as Path-like objects.
        if isinstance(pth_query, str):
            pth_query = Path(pth_query)
        sql = pth_query.read_text()
    return bq_client.query(sql).to_dataframe()

In [None]:
# BigQuery SQL for the SMHM order vs. bill-payment extract.
#
# CTEs, in order of definition:
#   original          - earliest processed 'Add' activity timestamp per product instance
#                       (prod_typ_cd 'SMHM', actvn_ts on/after 2023-01-01).
#   details           - processed SMHM 'Add' activity rows, with the billing account cast to INT as BAN.
#   channel_refs      - channel-org text for the highest chnl_org_key per chnl_org_id;
#                       defined but NOT referenced by the final select.
#   Bill_payment_data - one row per oth_billg_acct_num with a 0/1 flag per calendar month of
#                       bill_dt, Jan 2023 through Mar 2024 (<Mon>_<YYYY>_Paid).
#   province_info     - distinct (BAN, province) pairs for SMHM 'Add' orders activated since 2023-01-01.
#   ECID_data         - product-instance status per BAN from yesterday's snapshot;
#                       its join in the final select is commented out.
#   Order_data        - `details` rows matching each instance's first add (`original`), plus province.
#
# The final statement is `select *` over Order_data LEFT JOIN Bill_payment_data, so both
# sides contribute a column named BAN.
# NOTE(review): later cells read a column called `BAN_1`, presumably the name the second
# BAN column gets in the result - confirm.
# NOTE(review): province_info is DISTINCT on (BAN, province), so a BAN with more than one
# province would produce more than one Order_data row - confirm this is intended.
Query='''





with original
as 
(SELECT 
min(dly_ord_itm_actvy_ts) as first_add,
bus_prod_instnc_id
 FROM `cio-datahub-enterprise-pr-183a.ent_cust_ord_actvy.bq_dly_wln_ord_item_actvy` WHERE ord_act_typ_cd = 'Add' and prod_typ_cd in ('SMHM') and ord_act_stat_cd = 'Processed' and bus_prod_instnc_id is not null
 and date(actvn_ts) >= '2023-01-01' 

 group by bus_prod_instnc_id) /* Fetch original order date for the product instance */

,details as
(
SELECT 
dly_ord_itm_actvy_ts,
actvn_ts,
cast (bus_billg_acct_num as INT) as BAN,
bus_prod_instnc_id,
 FROM `cio-datahub-enterprise-pr-183a.ent_cust_ord_actvy.bq_dly_wln_ord_item_actvy` WHERE ord_act_typ_cd = 'Add' and prod_typ_cd in ('SMHM') and ord_act_stat_cd = 'Processed'
and date(actvn_ts) >= '2023-01-01'  
 
 )
 
 
 
  /* Gather channel details */

,channel_refs as (


with
latest_update as (
SELECT
chnl_org_id,
max(chnl_org_key) as latest_key FROM `cio-datahub-enterprise-pr-183a.ent_sls_chnl.bq_channel_org_dim`
group by chnl_org_id

)

select distinct
t1.chnl_org_id,
chnl_org_txt from `cio-datahub-enterprise-pr-183a.ent_sls_chnl.bq_channel_org_dim` t1 inner join latest_update on latest_key = chnl_org_key

)

,Bill_payment_data as (


SELECT
  oth_billg_acct_num AS BAN,
  --min(bill_dt) as first_payment_date,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 1 THEN 1 ELSE 0 END) AS Jan_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 2 THEN 1 ELSE 0 END) AS Feb_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 3 THEN 1 ELSE 0 END) AS Mar_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 4 THEN 1 ELSE 0 END) AS Apr_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 5 THEN 1 ELSE 0 END) AS May_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 6 THEN 1 ELSE 0 END) AS Jun_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 7 THEN 1 ELSE 0 END) AS Jul_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 8 THEN 1 ELSE 0 END) AS Aug_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 9 THEN 1 ELSE 0 END) AS Sep_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 10 THEN 1 ELSE 0 END) AS Oct_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 11 THEN 1 ELSE 0 END) AS Nov_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2023 AND EXTRACT(MONTH FROM bill_dt) = 12 THEN 1 ELSE 0 END) AS Dec_2023_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2024 AND EXTRACT(MONTH FROM bill_dt) = 1 THEN 1 ELSE 0 END) AS Jan_2024_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2024 AND EXTRACT(MONTH FROM bill_dt) = 2 THEN 1 ELSE 0 END) AS Feb_2024_Paid,
  MAX(CASE WHEN EXTRACT(YEAR FROM bill_dt) = 2024 AND EXTRACT(MONTH FROM bill_dt) = 3 THEN 1 ELSE 0 END) AS Mar_2024_Paid
FROM
  `cio-datahub-enterprise-pr-183a.ent_cust_bill.bq_wln_pymt_dtl`
WHERE
  bill_dt BETWEEN '2023-01-01' AND '2024-03-31'
GROUP BY
  oth_billg_acct_num



)


,province_info as (


select
      distinct
      cast (bill_account_number as INT) as BAN
      ,province as Province_Code
      -- ,rank() over (partition by bill_account_number order by activation_dt desc) as rank_
from `bi-srv-hsmsd-3c-pr-ca2cd4.hsmsd_3c_rpt_dataset.bq_rpt_chnl_order_ffh_dtl_view`
where activation_dt >= '2023-01-01'
and bill_account_number is not null
and province is not null
and product_family in ('SMHM')
and action_type = 'Add'
order by BAN

)


, ECID_data as (

select 
distinct CAST(bacct_bus_bacct_num AS INT) as BAN,
--cust_bus_cust_id as ECID,
pi_prod_instnc_stat_cd as BAN_status
--,pi_prod_instnc_typ_cd,pi_prod_instnc_stat_ts,prod_instnc_ts,pi_cntrct_start_ts as contract_start_date
--,pi_cntrct_end_ts as contract_end_date
from `cio-datahub-enterprise-pr-183a.ent_cust_cust.bq_prod_instnc_snpsht` 
WHERE DATE(prod_instnc_ts) = date(current_date()-1) 
and pi_prod_instnc_typ_cd ='SMHM' #Serice type
and bus_prod_instnc_src_id = 1001 #BANs that are for home services
--and pi_prod_instnc_stat_cd in ('A')
and  consldt_cust_typ_cd = 'R'
--QUALIFY ROW_NUMBER() OVER (PARTITION BY bacct_bus_bacct_num ORDER BY pi_prod_instnc_stat_ts DESC) = 1


)

,Order_data as (

select
date(details.dly_ord_itm_actvy_ts) as Order_date,
date(details.actvn_ts) as Activation_date,
date(DATE_TRUNC(details.actvn_ts, MONTH)) AS Activation_month,
details.BAN as BAN,
details.bus_prod_instnc_id as Prod_Instnc_ID,
prov.Province_code,
current_date() as Table_Update_date
from details 
left join province_info prov  
on details.BAN=prov.BAN
inner join original on original.bus_prod_instnc_id = details.bus_prod_instnc_id and original.first_add = details.dly_ord_itm_actvy_ts 

)

select * from 
Order_data a
left join  Bill_payment_data b
on a.BAN=b.BAN
-- left join ECID_data c
-- on a.BAN=c.BAN
order by Activation_date

'''

In [None]:
# Execute the SQL held in `Query` and keep the result as the working DataFrame.
Sales_Bill_DF = extract_bq_data(bq_client=bq_client, sql=Query)

In [None]:
# Print the column names, dtypes and non-null counts of the extract.
Sales_Bill_DF.info()

In [None]:
# Percentage of missing values in each column.
Sales_Bill_DF.isna().sum().div(len(Sales_Bill_DF)).mul(100)

In [None]:
# Preview the first five rows of the extract.
Sales_Bill_DF.head(n=5)

In [None]:
# Sales_Bill_DF_grouped=Sales_Bill_DF.groupby('BAN').agg({'Order_date': 'min', 'first_payment_date': 'min', 'Table_Update_date': 'min'})

In [None]:
# Sales_Bill_DF_grouped.head()

In [None]:
# config= bigquery.job.LoadJobConfig()

# # config._properties['timePartitioning'] = {'field': 'Month_Year'}
# config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

# Table_BQ = 'pras-pr-223186.pras_pr_dataset.Sales_Order_Bill_payment_data'

# bq_table_instance= bq_client.load_table_from_dataframe(Sales_Bill_DF_grouped, Table_BQ,job_config=config)

In [None]:
# Replace missing BAN_1 values with the sentinel 0 (presumably rows with no billing match - confirm).
# Assign the filled column back rather than calling fillna(inplace=True) on the selected
# column: that chained in-place form is deprecated in pandas 2.x and leaves the DataFrame
# unchanged under Copy-on-Write.
Sales_Bill_DF['BAN_1'] = Sales_Bill_DF['BAN_1'].fillna(0)
# Sales_Bill_DF['BAN_2'].fillna(0,inplace=True)
# Sales_Bill_DF['ECID'].fillna(0,inplace=True)
# Sales_Bill_DF['BAN_status'].fillna('None',inplace=True)


In [None]:
# Flag rows whose BAN_1 is missing or already set to the sentinel 0.
# Treating missing values as 0 here keeps the indicator correct even if the earlier
# fill step did not modify the frame; once BAN_1 is filled the result is unchanged.
Sales_Bill_DF['Never_Paid_Indicator'] = Sales_Bill_DF['BAN_1'].fillna(0).eq(0).astype(int)


In [None]:
# Names of the monthly payment-flag columns (every column ending in '_Paid').
paid_columns = list(filter(lambda name: name.endswith('_Paid'), Sales_Bill_DF.columns))

# Missing payment flags become 0.
Sales_Bill_DF[paid_columns] = Sales_Bill_DF[paid_columns].fillna(value=0)

In [None]:
# Percentage of missing values in each column after the fills above.
Sales_Bill_DF.isna().sum().div(len(Sales_Bill_DF)).mul(100)

In [None]:
# Preview the first five rows after the fills above.
Sales_Bill_DF.head(n=5)

In [None]:


# Ensure 'Activation_month' is in datetime format
Sales_Bill_DF['Activation_month'] = pd.to_datetime(Sales_Bill_DF['Activation_month'])


def get_future_month_col(activation_month, months_ahead):
    """Return the '<Mon>_<YYYY>_Paid' column name N months after an activation month.

    Args:
        activation_month: Timestamp of the activation month; may be NaT/None.
        months_ahead: Number of calendar months to add.

    Returns:
        The column name as a string (e.g. 'Feb_2023_Paid'), or None when
        ``activation_month`` is missing.
    """
    if pd.isna(activation_month):
        return None
    future_month = activation_month + pd.DateOffset(months=months_ahead)
    # NOTE(review): '%b' is locale-dependent; assumes an English locale so the
    # abbreviations match the Jan/Feb/... column names - confirm.
    return f"{future_month.strftime('%b_%Y')}_Paid"


# For 1..4 months after activation, copy the matching month's payment flag into a new column.
# A row gets 0 when its target month has no '<Mon>_<YYYY>_Paid' column in the frame.
for i in range(1, 5):
    # Name of the payment-flag column each row should read for this offset
    future_month_cols = Sales_Bill_DF['Activation_month'].apply(get_future_month_col, months_ahead=i)

    # Work positionally, one target column at a time, instead of looking up each row
    # with .at[j, col]: that treated the enumerate counter as an index label and was
    # only correct on a default RangeIndex.
    paid_after = np.zeros(len(Sales_Bill_DF), dtype=int)
    for col in future_month_cols.dropna().unique():
        if col not in Sales_Bill_DF.columns:
            continue
        rows_for_col = future_month_cols.eq(col).to_numpy()
        # Missing flag values count as not paid.
        was_paid = Sales_Bill_DF[col].eq(1).fillna(False).to_numpy(dtype=bool)
        paid_after[rows_for_col & was_paid] = 1

    Sales_Bill_DF[f'{i}_Months_After_Activation'] = paid_after
# Display the updated DataFrame
# Sales_Bill_DF.head()

In [None]:
# Preview the first five rows, now including the *_Months_After_Activation columns.
Sales_Bill_DF.head(n=5)

In [None]:
# Row count per province code (missing province codes are not counted).
Sales_Bill_DF.loc[:, 'Province_code'].value_counts(dropna=True)

In [None]:
# Export subset: every row whose province is not AB or BC (rows with a missing
# province are kept as well), restricted to the reporting columns.
export_columns = [
    'Activation_month',
    'BAN',
    'Province_code',
    '1_Months_After_Activation',
    '2_Months_After_Activation',
    '3_Months_After_Activation',
    '4_Months_After_Activation',
    'Never_Paid_Indicator',
]
outside_ab_bc = ~Sales_Bill_DF['Province_code'].isin(['AB', 'BC'])
Sales_Bill_DF_sub = Sales_Bill_DF.loc[outside_ab_bc, export_columns]


In [None]:
# Write the subset to a CSV in the current working directory, without the index column.
Sales_Bill_DF_sub.to_csv(path_or_buf='Sales_Billing_Data_Jan2023_Mar2024.csv', index=False)