In [157]:
#### import global modules
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery
import gc

# Set global vars
# The project root is whatever precedes the first 'notebooks' segment of the
# current working directory; if 'notebooks' is absent the whole cwd is used.
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_queries = pth_project / 'core' / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Put the project root first on sys.path so the `core` package imported below resolves.
sys.path.insert(0, str(pth_project))
# Load the project configuration inside a context manager so the file handle
# is closed as soon as the YAML has been parsed.
with pth_creds.open() as f_config:
    d_project_config = safe_load(f_config)
# d_params = safe_load((pth_project / 'core' / 'parameters' / 'common.yaml').open())['data_extract']

# import local modules
# (must come after the sys.path.insert above)
from core.utils.gcp import connect_bq_services
# from core.etl.extract import extract_bq_data, extract_pr_codes, format_conv_df, filter_convs

# Connect to google services
bq_client = connect_bq_services(d_project_config['gcp-project-name'])
# Show up to 100 rows when pandas objects are displayed.
pd.options.display.max_rows = 100

In [158]:
def extract_bq_data(bq_client, sql=None, pth_query=None):
    """Run a query through `bq_client` and return the result as a DataFrame.

    The query text is `sql` when it is given; otherwise it is read from
    `pth_query`. When both are supplied, `sql` wins and the file is not read.

    Args:
        bq_client: object exposing `query(sql)`, whose return value exposes
            `to_dataframe()`.
        sql: query text, or None.
        pth_query: object exposing `read_text()` (e.g. a `pathlib.Path`), or None.

    Returns:
        The value returned by `bq_client.query(...).to_dataframe()`.

    Raises:
        ValueError: if both `sql` and `pth_query` are None.
    """
    if sql is None:
        if pth_query is None:
            raise ValueError('`sql` or `pth_query` should be set')
        # Fall back to the query stored on disk.
        sql = pth_query.read_text()
    return bq_client.query(sql).to_dataframe()

In [214]:
# BigQuery script that assembles the churn-model training set for the snapshot
# window set by the DECLARE statements (2022-09-01 .. 2022-11-30).
# The final SELECT LEFT JOINs every CTE below onto Customer_Base:
#   ADC_BP           - latest best-practice flag per (customer_id, best_practices_id),
#                      pivoted to one column per id 1..6, plus their sum
#   ADC_arming       - arm/disarm command totals, days with activity, and that day
#                      count as a percentage of the distinct dates in the window
#   ADC_login        - login totals, days with a login, and the same percentage measure
#   cte_product_mix  - per-BAN product counts on the window end date, restricted to
#                      status 'A' and customer type 'R'
#   ARPU_Calculation - per-BAN average of the monthly sums of rid_unit_chrg_amt and
#                      tot_inv_amt, rounded to whole numbers
#   Deact_list       - SMHM_Churn = 1 for customers whose parsed ACTIVITY_DATE is after
#                      churn_dt_snpsht_start and on or before churn_dt_snpsht_end
# NOTE(review): `churn_dt_snpsht_end_1` and the top-level `date_sql` CTE
# (eligibility_date) do not appear to be referenced anywhere below - confirm and remove.
# NOTE(review): the final SELECT uses `.*` on every joined CTE, so join keys are
# repeated in the result (the recorded DF.head() output shows a `ban_2` column) -
# consider explicit column lists.
# NOTE(review): Best_Practice_All adds the six pivot columns with `+`; presumably it is
# NULL whenever a customer has no row for one of the ids - confirm whether IFNULL is wanted.
Query='''


DECLARE
  start_dt_snpsht DATE DEFAULT '2022-09-01';
DECLARE
  end_dt_snpsht DATE DEFAULT '2022-11-30';
DECLARE
  churn_dt_snpsht_start DATE DEFAULT '2022-11-30';
DECLARE
  churn_dt_snpsht_end DATE DEFAULT '2023-01-15';  
DECLARE
  churn_dt_snpsht_end_1 DATE DEFAULT '2023-02-28';  

WITH date_sql as (

SELECT 
    DATE_TRUNC(DATE_SUB(start_dt_snpsht, INTERVAL 3 month), month) AS eligibility_date 


)


,Customer_Base as (select * from `divgpras-pr-579355.SHS.SHS_churn_model_base_data_sep_nov_2022`)

, ADC_BP as ( 
select *,(Best_Practices_1+Best_Practices_2+Best_Practices_3+Best_Practices_4+Best_Practices_5+Best_Practices_6) as Best_Practice_All
from 

(
  -- #1 from_item
  SELECT 
    customer_id,
    best_practices_id,
     case when best_practices_ind=True then 1 else 0 end as Best_practices_flag,

  FROM `cio-datahub-enterprise-pr-183a.src_adc.bq_customer_best_practice`
  where DATE(dt_last_calculate_utc) <= end_dt_snpsht
  QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id,best_practices_id ORDER BY last_updt_ts DESC) = 1
)
PIVOT
(
  -- #2 aggregate
  sum(Best_practices_flag) AS Best_Practices
  -- #3 pivot_column
  FOR best_practices_id in (1,2,3,4,5,6)
)

)


,ADC_arming AS (

With date_sql as (
select  count(distinct date(date)) as count_of_dates from `cio-datahub-enterprise-pr-183a.src_adc.bq_aggregate_daily_arming_commands`
where date(date)<= end_dt_snpsht and date(date)>= start_dt_snpsht

)

select 
    id_cust as customer_id
    ,sum(count_arm_commands) as sum_arm_commands
    ,sum(count_disarm_commands) as sum_disarm_commands
    ,count (distinct date(date)) as number_days_arming_disarming
    , (select count_of_dates from date_sql ) as count_of_dates_arming
    , round(count (distinct date(date))*100/(select max(count_of_dates) from date_sql),0) as arming_consistency
    , round(sum(count_arm_commands)/(count (distinct date(date))),0) as Avg_arm_per_day
    ,round(sum(count_disarm_commands)/(count (distinct date(date))),0) as Avg_disarm_per_day
from `cio-datahub-enterprise-pr-183a.src_adc.bq_aggregate_daily_arming_commands`
where date(date)<= end_dt_snpsht and date(date)>= start_dt_snpsht
group by customer_id
order by number_days_arming_disarming desc

)


, ADC_login as (

with date_sql as (
select  count(distinct date(login_dt_utc)) as count_of_dates from `cio-datahub-enterprise-pr-183a.src_adc.bq_customer_daily_logins`
where date(login_dt_utc)<= end_dt_snpsht and date(login_dt_utc)>= start_dt_snpsht

)

select customer_id,sum(login_count) as sum_login_count,count (distinct date(login_dt_utc)) as number_of_login_days, (select count_of_dates from date_sql ) as count_of_dates_login, round(count (distinct date(login_dt_utc))*100/(select max(count_of_dates) from date_sql),0) as login_consistency, round(sum(login_count)/(count (distinct date(login_dt_utc))),0) as Avg_login_per_day
from `cio-datahub-enterprise-pr-183a.src_adc.bq_customer_daily_logins`
where date(login_dt_utc)<= end_dt_snpsht and date(login_dt_utc)>= start_dt_snpsht
group by customer_id
order by login_consistency desc



)


, cte_product_mix AS (
  
SELECT distinct ffh_prod.bacct_bus_bacct_num AS ban,
         COUNT(DISTINCT CASE WHEN ffh_prod.pi_prod_instnc_typ_cd IN ('SING', 'HSIC', 'TTV', 'SMHM', 'STV', 'DIIC','C') THEN ffh_prod.pi_prod_instnc_typ_cd ELSE NULL END) AS product_mix_all,
         COUNT(DISTINCT CASE WHEN ffh_prod.pi_prod_instnc_typ_cd = 'HSIC' THEN ffh_prod.bus_prod_instnc_id ELSE NULL END) AS hsic_count,
         COUNT(DISTINCT CASE WHEN ffh_prod.pi_prod_instnc_typ_cd = 'SING' THEN ffh_prod.bus_prod_instnc_id ELSE NULL END) AS sing_count,
         COUNT(DISTINCT CASE WHEN ffh_prod.pi_prod_instnc_typ_cd = 'SMHM' THEN ffh_prod.bus_prod_instnc_id ELSE NULL END) AS shs_count,
         COUNT(DISTINCT CASE WHEN ffh_prod.pi_prod_instnc_typ_cd = 'TTV'  THEN ffh_prod.bus_prod_instnc_id ELSE NULL END) AS ttv_count,
         COUNT(DISTINCT CASE WHEN ffh_prod.pi_prod_instnc_typ_cd = 'STV'  THEN ffh_prod.bus_prod_instnc_id ELSE NULL END) AS stv_count,
         COUNT(DISTINCT CASE WHEN ffh_prod.pi_prod_instnc_typ_cd = 'DIIC' THEN ffh_prod.bus_prod_instnc_id ELSE NULL END) AS diic_count   
         FROM `cio-datahub-enterprise-pr-183a.ent_cust_cust.bq_prod_instnc_snpsht` AS ffh_prod
   
   WHERE DATE(ffh_prod.prod_instnc_ts) = end_dt_snpsht
     AND ffh_prod.pi_prod_instnc_stat_cd = 'A' --Active Products
     AND ffh_prod.consldt_cust_typ_cd = 'R' --Regular (not Business)
     AND ffh_prod.pi_prod_instnc_typ_cd IN 
         (
           'DIIC', --Dialup
           'HSIC', --High Speed
           'SING', --Home Phone
           'SMHM', --Smart Home
           'STV',  --Satelite
           'TTV'   --TV
         )
GROUP BY ffh_prod.bacct_bus_bacct_num
--QUALIFY ROW_NUMBER() OVER (PARTITION BY ffh_prod.bacct_bus_bacct_num ORDER BY ffh_prod.prod_instnc_ts ) = 1
order by ffh_prod.bacct_bus_bacct_num
)



, ARPU_Calculation as(

with ARPU_full as (
SELECT
  distinct billg_acct_num AS ban,
  EXTRACT(YEAR
  FROM
    bill.bill_dt)||'-'||LPAD(CAST(EXTRACT(MONTH
      FROM
        bill.bill_dt) AS STRING), 2, '0') AS bill_year_month,
  sum(bill.rid_unit_chrg_amt) as ARPU_SMHM,
  sum(tot_inv_amt) as ARPU_FFH
  FROM
  `cio-datahub-enterprise-pr-183a.ent_cust_bill.bq_wln_inv_sum_view` bill
WHERE
  DATE(bill.bill_dt)>=  start_dt_snpsht
  and DATE(bill.bill_dt)<= end_dt_snpsht
  group by ban,bill_year_month
)

select ban, round(avg(ARPU_SMHM),0) as Avg_SMHM_ARPU,round(avg(ARPU_FFH),0) as Avg_FFH_ARPU 
from ARPU_full group by ban

)


,Deact_list as (

select distinct Telus_Cust_ID,SMHM_Churn from 
(select  distinct cast (CUST_ID as string) as Telus_Cust_ID, 1 as SMHM_Churn, PARSE_DATE('%d%h%Y',SUBSTR(ACTIVITY_DATE, 0,9)) as churn_date from `divgpras-pr-579355.SHS.SMHM_Deact_Dec2022_Feb2023`)
where churn_date> churn_dt_snpsht_start and churn_date<=churn_dt_snpsht_end


)


select cb.*,bp.*,arm.* ,lg.*,prod_mix.*,ARPU.*,Deacts.*
from Customer_Base cb
left join ADC_BP bp
on cb.customer_id=bp.customer_id
left join ADC_arming arm
on cb.customer_id=arm.customer_id
left join ADC_login lg
on cb.customer_id=lg.customer_id
left join cte_product_mix as prod_mix
on cb.BAN=prod_mix.ban
left join ARPU_Calculation as ARPU
on cb.BAN=ARPU.ban
left join Deact_list as Deacts
on cb.dealer_customer_id=Deacts.Telus_Cust_ID

'''

In [215]:
# Run the BigQuery script held in `Query` and pull the whole result into a DataFrame.
DF=extract_bq_data(bq_client, sql=Query)

In [216]:
# Column names, dtypes and non-null counts of the frame as extracted.
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334134 entries, 0 to 334133
Data columns (total 50 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   customer_id                   334134 non-null  Int64  
 1   dealer_customer_id            334134 non-null  object 
 2   primary_login_id              334134 non-null  Int64  
 3   dealer_name                   334134 non-null  object 
 4   join_date                     334134 non-null  object 
 5   account_type_name             334134 non-null  object 
 6   customer_type_name            334134 non-null  object 
 7   Package                       334134 non-null  object 
 8   Tenure_months                 334134 non-null  Int64  
 9   BAN                           334134 non-null  Int64  
 10  cust_bus_cust_id              334134 non-null  object 
 11  contract_start_date           302317 non-null  object 
 12  contract_end_date             334134 non-nul

In [217]:
# Preview the first rows of the joined result.
# NOTE(review): the recorded output contains customer and account identifiers -
# clear it before sharing the notebook.
DF.head()

Unnamed: 0,customer_id,dealer_customer_id,primary_login_id,dealer_name,join_date,account_type_name,customer_type_name,Package,Tenure_months,BAN,...,sing_count,shs_count,ttv_count,stv_count,diic_count,ban_2,Avg_SMHM_ARPU,Avg_FFH_ARPU,Telus_Cust_ID,SMHM_Churn
0,14385714,104590057,16997698,TELUS Communications Inc.,2022-08-20,Security System,Customer,Monitored,3,605350400,...,0,1,0,0,1,605350400.0,77.0,99.0,,
1,14984117,98278824,17728197,TELUS Communications Inc.,2022-11-26,Security System,Customer,Monitored,0,605476776,...,0,1,1,0,1,605476776.0,,0.0,,
2,14471868,104652253,17104084,TELUS Communications Inc.,2022-09-02,Awareness and Automation,Customer,Smart_Automation_Plus,2,605368361,...,0,1,0,0,1,605368361.0,40.0,22.0,,
3,14689989,97051914,17370225,TELUS Communications Inc.,2022-10-31,Awareness and Automation,Customer,Smart_Automation_Plus,1,603232489,...,0,1,0,0,2,603232489.0,2.0,51.0,97051914.0,1.0
4,14973580,94533615,17715846,TELUS Communications Inc.,2022-11-23,Standalone,Customer,Smart_Camera,0,605542528,...,0,1,0,0,1,,,,,


In [218]:
# Rows with no match in Deact_list come back from the LEFT JOIN with a null
# SMHM_Churn; label them 0.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on the
# DF['SMHM_Churn'] selection is a chained in-place update, which stops modifying DF
# once pandas Copy-on-Write is active.
DF['SMHM_Churn'] = DF['SMHM_Churn'].fillna(0)

In [219]:
# DF['churn_date'].fillna(0,inplace=True)

In [220]:
# Row count per SMHM_Churn value (1 = matched in Deact_list, 0 = filled in above).
DF['SMHM_Churn'].value_counts()

0    330112
1      4022
Name: SMHM_Churn, dtype: Int64

In [221]:
# DF['churn_date'].value_counts()

In [222]:
# DF['churn_date'].min()

In [223]:
# Same split expressed as a percentage of all rows.
DF['SMHM_Churn'].value_counts(normalize=True)*100

0    98.796291
1     1.203709
Name: SMHM_Churn, dtype: Float64

In [226]:
# Distribution of shs_count, the per-BAN count of distinct 'SMHM' product
# instances computed in cte_product_mix.
DF['shs_count'].value_counts()

1     331751
2       2294
3         77
4          7
5          2
7          1
6          1
11         1
Name: shs_count, dtype: Int64

In [227]:
# Occurrences of each BAN. In the recorded run the result length equals the
# row count, i.e. one row per BAN.
DF['BAN'].value_counts()

605350400    1
234918205    1
605542528    1
601209640    1
600770200    1
            ..
605150017    1
605167085    1
201823326    1
604998543    1
604337462    1
Name: BAN, Length: 334134, dtype: Int64

In [228]:
# Drop fully identical rows. The index is not reset here, and in the recorded run
# the row count is unchanged (334134 before and after).
DF=DF.drop_duplicates()

In [229]:
# Re-check dtypes and non-null counts after the churn fill and de-duplication.
DF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334134 entries, 0 to 334133
Data columns (total 50 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   customer_id                   334134 non-null  Int64  
 1   dealer_customer_id            334134 non-null  object 
 2   primary_login_id              334134 non-null  Int64  
 3   dealer_name                   334134 non-null  object 
 4   join_date                     334134 non-null  object 
 5   account_type_name             334134 non-null  object 
 6   customer_type_name            334134 non-null  object 
 7   Package                       334134 non-null  object 
 8   Tenure_months                 334134 non-null  Int64  
 9   BAN                           334134 non-null  Int64  
 10  cust_bus_cust_id              334134 non-null  object 
 11  contract_start_date           302317 non-null  object 
 12  contract_end_date             334134 non-nul

In [230]:
# BAN_list=DF['BAN'].value_counts().rename_axis('BAN').reset_index(name='unique_counts')

In [231]:
# BAN_list_2=BAN_list[BAN_list.unique_counts>1]

In [232]:
# BAN_list_2

Unnamed: 0,BAN,unique_counts


In [234]:
# DF[DF['BAN']==604371757]
# .to_csv('Data_testing.csv',index=False)

In [235]:
config= bigquery.job.LoadJobConfig()

# config._properties['timePartitioning'] = {'field': 'Month_Year'}
# Replace the destination table's contents on every run instead of appending.
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

# Destination given as 'dataset.table'.
# NOTE(review): no project is specified, so this presumably resolves against the
# client's default project - confirm.
Table_BQ = 'SHS.SHS_churn_model_train_data_sep_nov_2022'

# load_table_from_dataframe only starts the load and returns the job object.
bq_table_instance= bq_client.load_table_from_dataframe(DF, Table_BQ,job_config=config)
# Block until the load finishes so that a failed load raises here rather than
# passing unnoticed.
bq_table_instance.result()