In [None]:
#### import global modules
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery
import gc

# Set global vars
# Project root: the part of the current working directory path that precedes 'notebooks'.
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_queries = pth_project / 'core' / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Put the project root first on sys.path so the `core` package below can be imported.
sys.path.insert(0, str(pth_project))
# read_text() opens and closes the file itself, so no file handle is left open
# (the previous `safe_load(pth_creds.open())` never closed the handle).
d_project_config = safe_load(pth_creds.read_text())
# d_params = safe_load((pth_project / 'core' / 'parameters' / 'common.yaml').open())['data_extract']

# import local modules
from core.utils.gcp import connect_bq_services
# from core.etl.extract import extract_bq_data, extract_pr_codes, format_conv_df, filter_convs

# Connect to google services
bq_client = connect_bq_services(d_project_config['gcp-project-name'])
pd.options.display.max_rows = 100

In [None]:
def extract_bq_data(bq_client, sql=None, pth_query=None):
    """Run a query through ``bq_client`` and return the result of ``to_dataframe()``.

    Parameters
    ----------
    bq_client
        Object exposing ``query(sql)`` whose return value has ``to_dataframe()``.
    sql : str, optional
        Query text. Used as-is when given; ``pth_query`` is then ignored.
    pth_query : str or pathlib.Path, optional
        Path of a file holding the query text. Only read when ``sql`` is None.

    Returns
    -------
    Whatever ``bq_client.query(sql).to_dataframe()`` returns.

    Raises
    ------
    ValueError
        If both ``sql`` and ``pth_query`` are None.
    """
    if sql is None:
        if pth_query is None:
            raise ValueError('`sql` or `pth_query` should be set')
        # Path() leaves Path inputs unchanged and lets callers pass a plain string path.
        sql = Path(pth_query).read_text()
    return bq_client.query(sql).to_dataframe()

In [None]:
# Sales-order extract for product type 'SMHM', restricted to processed 'Add' activities with a
# timestamp from 2022-01-01 (inclusive) to 2024-02-01 (exclusive).
#   original     - earliest 'Add' timestamp per bus_prod_instnc_id (null ids excluded)
#   details      - order attributes for rows matching the same activity/product/date filters
#   channel_refs - chnl_org_txt taken from the row with the highest chnl_org_key per chnl_org_id
# The final select keeps the `details` rows whose timestamp equals the earliest one for the same
# product instance, left-joins the channel text, and drops rows whose bi_chnl_tag_cd is 'UNKNOWN'.
# Rows with a NULL bi_chnl_tag_cd are dropped as well, because `NULL <> 'UNKNOWN'` is not true in SQL.
Sales_order_Query='''


with original
as 
(SELECT 
min(dly_ord_itm_actvy_ts) as first_add,
bus_prod_instnc_id
 FROM `cio-datahub-enterprise-pr-183a.ent_cust_ord_actvy.bq_dly_wln_ord_item_actvy` WHERE ord_act_typ_cd = 'Add' and prod_typ_cd in ('SMHM') and ord_act_stat_cd = 'Processed' and bus_prod_instnc_id is not null
 and date(dly_ord_itm_actvy_ts) >= '2022-01-01' and date(dly_ord_itm_actvy_ts) < '2024-02-01'

 group by bus_prod_instnc_id) /* Fetch original order date for the product instance */

,details as
(
SELECT 
dly_ord_itm_actvy_ts,
bus_billg_acct_num,
bus_prod_instnc_id,
prod_nm,
munic_nm,
prov_state_cd,
bi_chnl_tag_cd,
chnl_org_id,
prod_typ_cd,
src_sls_rep_cd,
src_typ_cd,
src_usr_chnl_txt,
txn_sub_typ_txt,
SLS_ACTVY_TXT
 FROM `cio-datahub-enterprise-pr-183a.ent_cust_ord_actvy.bq_dly_wln_ord_item_actvy` WHERE ord_act_typ_cd = 'Add' and prod_typ_cd in ('SMHM') and ord_act_stat_cd = 'Processed'
and date(dly_ord_itm_actvy_ts) >= '2022-01-01' and date(dly_ord_itm_actvy_ts) < '2024-02-01'
 
 
 )
 
 
 
  /* Gather channel details */

,channel_refs as (


with
latest_update as (
SELECT
chnl_org_id,
max(chnl_org_key) as latest_key FROM `cio-datahub-enterprise-pr-183a.ent_sls_chnl.bq_channel_org_dim`
group by chnl_org_id

)

select distinct
t1.chnl_org_id,
chnl_org_txt from `cio-datahub-enterprise-pr-183a.ent_sls_chnl.bq_channel_org_dim` t1 inner join latest_update on latest_key = chnl_org_key

)

select
date(details.dly_ord_itm_actvy_ts) as Order_date,
details.bus_billg_acct_num as BAN,
details.bus_prod_instnc_id as Prod_Instnc_ID,
details.prod_nm as Prod_nm,
details.munic_nm,
details.prov_state_cd,
details.bi_chnl_tag_cd as Sales_Channel_tag,
details.chnl_org_id,
channel_refs.chnl_org_txt,
details.prod_typ_cd as Prod_Type,
details.src_sls_rep_cd as Sales_Agent_ID,
details.src_typ_cd,
details.src_usr_chnl_txt,
details.txn_sub_typ_txt,
details.SLS_ACTVY_TXT
 from details inner join original on original.bus_prod_instnc_id = details.bus_prod_instnc_id and original.first_add = details.dly_ord_itm_actvy_ts  /* Inner join them to prevent duplicates */
 left join channel_refs on details.chnl_org_id = channel_refs.chnl_org_id /* left joining to get channel dealer names */
 where bi_chnl_tag_cd <> 'UNKNOWN' 
--  and date(details.dly_ord_itm_actvy_ts) >= '2023-01-01' and date(details.dly_ord_itm_actvy_ts) < '2024-02-01'
 order by details.dly_ord_itm_actvy_ts



'''

In [None]:
# Run Sales_order_Query through the shared helper and keep the result.
DF_sales_Order=extract_bq_data(bq_client, sql=Sales_order_Query)

In [None]:
# Column names, dtypes and non-null counts of the sales-order extract.
DF_sales_Order.info()

In [None]:
# Preview the first 10 rows of the sales-order extract.
DF_sales_Order.head(10)

In [None]:
# Upload the sales-order extract to BigQuery, replacing any existing table contents.
config = bigquery.job.LoadJobConfig()
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

# NOTE(review): the table name says Jan2021, but Sales_order_Query filters from 2022-01-01, and
# Base_Sales_data_query later reads `Sales_Order_SMHM_Jan2023_Jan2024` rather than this table.
# Confirm which table name is intended.
Table_BQ = 'divg-team-v03-pr-de558a.shs_invol_churn.Sales_Order_SMHM_Jan2021_Jan2024'

bq_table_instance = bq_client.load_table_from_dataframe(DF_sales_Order, Table_BQ, job_config=config)
# load_table_from_dataframe only starts the load job; result() blocks until it has finished and
# raises if the load failed, so later cells never read a half-written or missing table.
bq_table_instance.result()

In [None]:
# ADC usage for two hard-coded snapshot dates (2023-02-01 and 2023-03-01).
#   ADC_base           - every distinct ban in the daily feature store (no date filter)
#   ADC_data_Feb_2023  - per ban: constant flag 1 plus the average login_consistency and
#                        Arming_Consistency for daily_snapshot_start_date = '2023-02-01'
#   ADC_data_Mar_2023  - same columns for daily_snapshot_start_date = '2023-03-01'
# The final select left-joins both monthly CTEs onto ADC_base by BAN.
Query_ADC_Usage_Data= '''


with ADC_base as (

SELECT distinct
    ban as BAN,   FROM
    `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`



)

, ADC_data_Feb_2023 as 

(

SELECT distinct
    ban as BAN,
    1 as Data_flag_Feb_2023,
    avg(login_consistency) as Login_consistency_Feb_2023 ,
    avg(Arming_Consistency) as Arming_consistency_Feb_2023,
   FROM
    `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`
  WHERE
daily_snapshot_start_date='2023-02-01'
  group by ban
    
)

, ADC_data_Mar_2023 as 

(

SELECT distinct
    ban as BAN,
    1 as Data_flag_Mar_2023,
    avg(login_consistency) as Login_consistency_Mar_2023 ,
    avg(Arming_Consistency) as Arming_consistency_Mar_2023,
   FROM
    `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`
  WHERE
daily_snapshot_start_date='2023-03-01'
  group by ban
    
)

select * from
ADC_base a 
LEFT JOIN ADC_data_Feb_2023 b
on a.BAN=b.BAN
LEFT JOIN ADC_data_Mar_2023 c
on a.BAN=c.BAN

'''

In [None]:
# Run Query_ADC_Usage_Data through the shared helper and keep the result.
ADC_usage_DF=extract_bq_data(bq_client, sql=Query_ADC_Usage_Data)

In [None]:
# Column names, dtypes and non-null counts of the two-month ADC usage extract.
ADC_usage_DF.info()

In [None]:
# Preview the first 10 rows of the two-month ADC usage extract.
ADC_usage_DF.head(10)

In [None]:
from datetime import date, timedelta

# Initialize a BigQuery client
# client = bq_client

# Initialize a BigQuery client
# client = bigquery.Client()

def generate_month_start_dates(start_year, start_month, end_year, end_month):
    """List the first day of each month from the start month through the end month, inclusive.

    Returns a list of ``datetime.date`` objects in chronological order; the list is empty
    when the start month lies after the end month.
    """
    month_starts = []
    year, month = start_year, start_month
    # Tuple comparison orders by year first and by month second.
    while (year, month) <= (end_year, end_month):
        month_starts.append(date(year, month, 1))
        if month >= 12:
            # Roll over from December to January of the following year.
            year, month = year + 1, 1
        else:
            month += 1
    return month_starts

def generate_query_with_month_suffix(snapshot_date):
    """Build the wide-format ADC usage query for one snapshot date.

    The output columns other than BAN are suffixed with the snapshot's ``%b_%Y`` label, and the
    query filters the feature store on ``daily_snapshot_start_date = snapshot_date``.
    Rows are grouped by ban and Segment.
    """
    suffix = snapshot_date.strftime("%b_%Y")  # e.g., "Feb_2023"
    return f"""
    SELECT DISTINCT
        ban AS BAN,
        Segment AS Segment_{suffix},
        1 AS Data_flag_{suffix},
        AVG(login_consistency) AS Login_consistency_{suffix},
        AVG(Arming_Consistency) AS Arming_consistency_{suffix}, 
        MAX(best_practice_all) AS Best_practices_{suffix}
    FROM
        `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`
    WHERE
        daily_snapshot_start_date = '{snapshot_date}'
    GROUP BY
        ban, Segment
    """

dates_1 = generate_month_start_dates(2023, 2, 2023, 11)

# Accumulator for the wide table; stays None until the first month has been fetched.
final_df = None

# One query per month; each month's columns are outer-joined onto the accumulator by 'BAN'.
for snapshot_date in dates_1:
    print(snapshot_date)
    query = generate_query_with_month_suffix(snapshot_date)
    monthly_df = bq_client.query(query).to_dataframe()
    final_df = (
        monthly_df
        if final_df is None
        else pd.merge(final_df, monthly_df, on='BAN', how='outer', suffixes=('', '_dup'))
    )

# A '_dup' suffix is only added to a column of the newly merged month whose name already exists
# in final_df; handle or rename such columns as needed.



In [None]:
# Preview the first rows of the wide (one column set per month) ADC usage table.
final_df.head()

In [None]:
# Column names, dtypes and non-null counts of the wide ADC usage table.
final_df.info()

In [None]:
# Percentage of missing values per column of the wide ADC usage table.
final_df.isna().sum().mul(100).div(final_df.shape[0])

In [None]:
# Upload the wide ADC usage table to BigQuery, replacing any existing table contents.
config = bigquery.job.LoadJobConfig()
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

Table_BQ = 'divg-team-v03-pr-de558a.shs_invol_churn.ADC_Usage_data_Feb2023_Nov2023'

bq_table_instance = bq_client.load_table_from_dataframe(final_df, Table_BQ, job_config=config)
# load_table_from_dataframe only starts the load job; result() blocks until it has finished and
# raises if the load failed. This matters here because a later cell deletes final_df.
bq_table_instance.result()

In [None]:
# Drop the name for the wide table. Re-running this cell without rebuilding final_df raises NameError.
del final_df

In [None]:
from datetime import date, timedelta
import pandas as pd
from google.cloud import bigquery

# Alias for the BigQuery client created in the setup cell (no new client is constructed here).
client = bq_client

def generate_month_start_dates(start_year, start_month, end_year, end_month):
    """List the first day of each month from the start month through the end month, inclusive.

    Returns a list of ``datetime.date`` objects in chronological order; the list is empty
    when the start month lies after the end month.
    NOTE: a function with this name and the same logic is already defined in an earlier cell.
    """
    month_starts = []
    year, month = start_year, start_month
    # Tuple comparison orders by year first and by month second.
    while (year, month) <= (end_year, end_month):
        month_starts.append(date(year, month, 1))
        if month >= 12:
            # Roll over from December to January of the following year.
            year, month = year + 1, 1
        else:
            month += 1
    return month_starts

def generate_query_with_month_suffix(snapshot_date):
    """Build the long-format ADC usage query for one snapshot date.

    Column names are the same for every month; the snapshot's ``%b_%Y`` label is emitted as the
    ``Month_name`` value instead. Rows are grouped by daily_snapshot_start_date, ban and Segment.
    NOTE: this redefines (and therefore shadows) the wide-format function of the same name
    defined in an earlier cell.
    """
    label = snapshot_date.strftime("%b_%Y")  # e.g., "Feb_2023"
    return f"""
    SELECT DISTINCT
        date(daily_snapshot_start_date) as Month_start_date,
        ban AS BAN,
        1 AS Data_flag,
        AVG(login_consistency) AS Login_consistency,
        AVG(Arming_Consistency) AS Arming_consistency, 
        MAX(best_practice_all) AS Best_practices,
        '{label}' AS Month_name,
        Segment 
        FROM
        `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`
    WHERE
        daily_snapshot_start_date = '{snapshot_date}'
    GROUP BY
        daily_snapshot_start_date,ban,Segment
    """

dates_1 = generate_month_start_dates(2023, 2, 2024, 1)

# Collect one DataFrame per month and concatenate once at the end. Concatenating inside the loop
# re-copies all previously fetched rows on every iteration and starts from an empty seed frame.
monthly_frames = []
for snapshot_date in dates_1:
    query = generate_query_with_month_suffix(snapshot_date)
    monthly_df_long = client.query(query).to_dataframe()
    monthly_frames.append(monthly_df_long)

# pd.concat raises on an empty list, so fall back to an empty frame when no month was queried.
final_df_long = pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()



In [None]:
# Column names, dtypes and non-null counts of the long (one row set per month) ADC usage table.
final_df_long.info()

In [None]:
# Preview the first rows of the long ADC usage table.
final_df_long.head()

In [None]:
# Percentage of missing values per column of the long ADC usage table.
final_df_long.isna().sum().mul(100).div(final_df_long.shape[0])

In [None]:
# Upload the long ADC usage table to BigQuery, replacing any existing table contents.
config = bigquery.job.LoadJobConfig()
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

Table_BQ = 'divg-team-v03-pr-de558a.shs_invol_churn.ADC_Usage_data_Feb2023_Jan2024_long'

bq_table_instance = bq_client.load_table_from_dataframe(final_df_long, Table_BQ, job_config=config)
# load_table_from_dataframe only starts the load job; result() blocks until it has finished and
# raises if the load failed. Base_Sales_data_query reads this table, so it must be fully written first.
bq_table_instance.result()

In [None]:
# Base churn data for the 2023-09-01 snapshot, enriched with sales-order attributes (matched on
# billing account and product instance) and with the ADC usage rows for month start 2023-08-01.
#
# The month filter on table `c` sits in the ON clause of its LEFT JOIN. As a WHERE predicate it
# rejected every base row without a matching usage row (c.Month_start_date is NULL for those),
# which silently turned the LEFT JOIN into an inner join.
#
# NOTE(review): this reads `Sales_Order_SMHM_Jan2023_Jan2024`, while the upload cell in this
# notebook writes `Sales_Order_SMHM_Jan2021_Jan2024`. Confirm which table is intended.
Base_Sales_data_query='''



with interim_data as (
select a.*,b.Sales_Channel_tag, b.chnl_org_id, b.chnl_org_txt,b.src_usr_chnl_txt,b.Order_date,b.Sales_Agent_ID,c.*
from `divg-team-v03-pr-de558a.shs_invol_churn.shs_invol_churn_base_data` a

left join `divg-team-v03-pr-de558a.shs_invol_churn.Sales_Order_SMHM_Jan2023_Jan2024` b
on a.Bus_Billing_Account_Num=b.BAN and a.Bus_Prod_Instnc_Id=cast(b.Prod_Instnc_ID as integer)


left join `divg-team-v03-pr-de558a.shs_invol_churn.ADC_Usage_data_Feb2023_Jan2024_long` c
on a.Bus_Billing_Account_Num=c.BAN 
and c.Month_start_date="2023-08-01"

)


select * from interim_data
where Base_Snapshot_Date="2023-09-01"
order by Base_Snapshot_Date,Bus_Billing_Account_Num

'''

In [None]:
# Run Base_Sales_data_query through the shared helper and keep the result.
Base_DF=extract_bq_data(bq_client, sql=Base_Sales_data_query)

In [None]:
# Column names, dtypes and non-null counts of the joined base table.
Base_DF.info()

In [None]:
# Preview the first rows of the joined base table.
Base_DF.head()

In [None]:
# Row count per Order_date (value_counts leaves out missing values by default).
Base_DF['Order_date'].value_counts()

In [None]:
# Full per-column summary; verbose=True lists every column even when the frame is wide.
Base_DF.info(verbose=True)

In [None]:
# Percentage of missing values per column of the joined base table.
Base_DF.isna().sum().mul(100).div(Base_DF.shape[0])

In [None]:
# Label rows without a Segment explicitly. Assign the result back to the column: calling
# fillna(..., inplace=True) on the Base_DF['Segment'] selection is chained assignment, which is
# deprecated in pandas 2.x and leaves Base_DF unchanged once Copy-on-Write is enabled.
Base_DF['Segment'] = Base_DF['Segment'].fillna('No_Segment')

In [None]:
# Rows with an Order_date are labelled 'Order_in_last12_months'; rows without one 'Older_Order'.
Base_DF['date_status'] = np.where(
    Base_DF['Order_date'].notna(), 'Order_in_last12_months', 'Older_Order'
)


In [None]:
# Export the base table without the index. The relative path resolves against the current
# working directory, not against pth_data.
Base_DF.to_csv('Invol_churn_Analysis_Sep2023.csv',index=False)

In [None]:
# Keep only the rows labelled 'Order_in_last12_months'.
Base_DF_subset = Base_DF.loc[Base_DF['date_status'].eq('Order_in_last12_months')]

In [None]:
# Summary per (Channel_Category, Security_Plan_Group): distinct Bus_Billing_Account_Num count,
# mean of `churn` as a percentage, sum of `invol`, and mean of `invol` as a percentage
# (percentages rounded to 2 decimals).
(
    Base_DF_subset
    .groupby(['Channel_Category', 'Security_Plan_Group'])
    .agg(
        Customer_count=('Bus_Billing_Account_Num', 'nunique'),
        Churn_rate=('churn', lambda x: round(x.mean() * 100, 2)),
        Invol_Churn=('invol', lambda x: x.sum()),
        Invol_Churn_rate=('invol', lambda x: round(x.mean() * 100, 2)),
    )
    .reset_index()
)