In [5]:
#### import global modules
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery
import gc

# Set global vars
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_queries = pth_project / 'core' / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
sys.path.insert(0, str(pth_project))
d_project_config = safe_load(pth_creds.open())
# d_params = safe_load((pth_project / 'core' / 'parameters' / 'common.yaml').open())['data_extract']

# import local modules
from core.utils.gcp import connect_bq_services
# from core.etl.extract import extract_bq_data, extract_pr_codes, format_conv_df, filter_convs

# Connect to google services
bq_client = connect_bq_services(d_project_config['gcp-project-name'])
pd.options.display.max_rows = 100

In [6]:
def extract_bq_data(bq_client, sql=None, pth_query=None):
    if sql is not None:
        df = bq_client.query(sql).to_dataframe()
    elif pth_query is not None:
        sql = pth_query.read_text()
        df = bq_client.query(sql).to_dataframe()
    else:
        raise ValueError('`sql` or `pth_query` should be set')  
    return df

In [7]:
Sales_order_Query='''


with original
as 
(SELECT 
min(dly_ord_itm_actvy_ts) as first_add,
bus_prod_instnc_id
 FROM `cio-datahub-enterprise-pr-183a.ent_cust_ord_actvy.bq_dly_wln_ord_item_actvy` WHERE ord_act_typ_cd = 'Add' and prod_typ_cd in ('SMHM') and ord_act_stat_cd = 'Processed' and bus_prod_instnc_id is not null
 and date(dly_ord_itm_actvy_ts) >= '2022-01-01' and date(dly_ord_itm_actvy_ts) < '2024-02-01'

 group by bus_prod_instnc_id) /* Fetch original order date for the product instance */

,details as
(
SELECT 
dly_ord_itm_actvy_ts,
bus_billg_acct_num,
bus_prod_instnc_id,
prod_nm,
munic_nm,
prov_state_cd,
bi_chnl_tag_cd,
chnl_org_id,
prod_typ_cd,
src_sls_rep_cd,
src_typ_cd,
src_usr_chnl_txt,
txn_sub_typ_txt,
SLS_ACTVY_TXT
 FROM `cio-datahub-enterprise-pr-183a.ent_cust_ord_actvy.bq_dly_wln_ord_item_actvy` WHERE ord_act_typ_cd = 'Add' and prod_typ_cd in ('SMHM') and ord_act_stat_cd = 'Processed'
and date(dly_ord_itm_actvy_ts) >= '2022-01-01' and date(dly_ord_itm_actvy_ts) < '2024-02-01'
 
 
 )
 
 
 
  /* Gather channel details */

,channel_refs as (


with
latest_update as (
SELECT
chnl_org_id,
max(chnl_org_key) as latest_key FROM `cio-datahub-enterprise-pr-183a.ent_sls_chnl.bq_channel_org_dim`
group by chnl_org_id

)

select distinct
t1.chnl_org_id,
chnl_org_txt from `cio-datahub-enterprise-pr-183a.ent_sls_chnl.bq_channel_org_dim` t1 inner join latest_update on latest_key = chnl_org_key

)

select
date(details.dly_ord_itm_actvy_ts) as Order_date,
details.bus_billg_acct_num as BAN,
details.bus_prod_instnc_id as Prod_Instnc_ID,
details.prod_nm as Prod_nm,
details.munic_nm,
details.prov_state_cd,
details.bi_chnl_tag_cd as Sales_Channel_tag,
details.chnl_org_id,
channel_refs.chnl_org_txt,
details.prod_typ_cd as Prod_Type,
details.src_sls_rep_cd as Sales_Agent_ID,
details.src_typ_cd,
details.src_usr_chnl_txt,
details.txn_sub_typ_txt,
details.SLS_ACTVY_TXT
 from details inner join original on original.bus_prod_instnc_id = details.bus_prod_instnc_id and original.first_add = details.dly_ord_itm_actvy_ts  /* Inner join them to prevent duplicates */
 left join channel_refs on details.chnl_org_id = channel_refs.chnl_org_id /* left joining to get channel dealer names */
 where bi_chnl_tag_cd <> 'UNKNOWN' 
--  and date(details.dly_ord_itm_actvy_ts) >= '2023-01-01' and date(details.dly_ord_itm_actvy_ts) < '2024-02-01'
 order by details.dly_ord_itm_actvy_ts



'''

In [8]:
DF_sales_Order=extract_bq_data(bq_client, sql=Sales_order_Query)

In [9]:
DF_sales_Order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777602 entries, 0 to 777601
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Order_date         777602 non-null  dbdate
 1   BAN                777602 non-null  Int64 
 2   Prod_Instnc_ID     777602 non-null  object
 3   Prod_nm            777602 non-null  object
 4   munic_nm           777602 non-null  object
 5   prov_state_cd      777602 non-null  object
 6   Sales_Channel_tag  777602 non-null  object
 7   chnl_org_id        777602 non-null  Int64 
 8   chnl_org_txt       777602 non-null  object
 9   Prod_Type          777602 non-null  object
 10  Sales_Agent_ID     777602 non-null  object
 11  src_typ_cd         777602 non-null  object
 12  src_usr_chnl_txt   777602 non-null  object
 13  txn_sub_typ_txt    777602 non-null  object
 14  SLS_ACTVY_TXT      777602 non-null  object
dtypes: Int64(2), dbdate(1), object(12)
memory usage: 90.5+ MB


In [10]:
DF_sales_Order.head(10)

Unnamed: 0,Order_date,BAN,Prod_Instnc_ID,Prod_nm,munic_nm,prov_state_cd,Sales_Channel_tag,chnl_org_id,chnl_org_txt,Prod_Type,Sales_Agent_ID,src_typ_cd,src_usr_chnl_txt,txn_sub_typ_txt,SLS_ACTVY_TXT
0,2022-01-01,605014389,1122295769,LivingWell Companion Home with Fall Detection ...,UNKNOWN,UKN,WEB,-1000000000,UNKNOWN,SMHM,UKN,FIFA,N/AVAIL,N/AVAIL,Select
1,2022-01-01,605014241,1122283148,Control Plus Video,UNKNOWN,UKN,WEB,-1000000000,UNKNOWN,SMHM,UKN,FIFA,N/AVAIL,New Contract,Select
2,2022-01-01,237487792,1122298297,Smart Automation Plus (Automation Hub),UNKNOWN,UKN,CSS,-1000000000,UNKNOWN,SMHM,x215703,FIFA,Frontline,New Contract,CSS
3,2022-01-01,602589261,1122298007,Smart Automation Plus (Automation Hub),UNKNOWN,UKN,CARE-WLN,-1000000000,UNKNOWN,SMHM,t900065,FIFA,CLS,New Contract,N/AVAIL
4,2022-01-01,605002268,1122298317,Smart Camera,UNKNOWN,UKN,CARE-WLN,-1000000000,UNKNOWN,SMHM,t921852,FIFA,CLS,New Contract,N/AVAIL
5,2022-01-01,603825187,1122298034,Smart Automation Plus (Automation Hub),UNKNOWN,UKN,CARE-WLN,-1000000000,UNKNOWN,SMHM,t926907,FIFA,Frontline,New Contract,N/AVAIL
6,2022-01-01,604116377,1122273986,Smart Camera,UNKNOWN,UKN,CARE-WLN,-1000000000,UNKNOWN,SMHM,t962513,FIFA,CLS,New Contract,N/AVAIL
7,2022-01-01,603216902,1122273065,Smart Camera,UNKNOWN,UKN,CARE-WLN,-1000000000,UNKNOWN,SMHM,x188996,FIFA,Frontline,New Contract,N/AVAIL
8,2022-01-01,603990471,1122296469,Smart Camera,UNKNOWN,UKN,CARE-WLN,-1000000000,UNKNOWN,SMHM,x205516,FIFA,Frontline,New Contract,N/AVAIL
9,2022-01-01,603234403,1122289131,Smart Camera,UNKNOWN,UKN,CARE-WLN,-1000000000,UNKNOWN,SMHM,x235929,FIFA,CSR,New Contract,N/AVAIL


In [65]:
config= bigquery.job.LoadJobConfig()

# config._properties['timePartitioning'] = {'field': 'Month_Year'}
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

Table_BQ = 'divg-team-v03-pr-de558a.shs_invol_churn.Sales_Order_SMHM_Jan2021_Jan2024'

bq_table_instance= bq_client.load_table_from_dataframe(DF_sales_Order, Table_BQ,job_config=config)

In [7]:
Query_ADC_Usage_Data= '''


with ADC_base as (

SELECT distinct
    ban as BAN,   FROM
    `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`



)

, ADC_data_Feb_2023 as 

(

SELECT distinct
    ban as BAN,
    1 as Data_flag_Feb_2023,
    avg(login_consistency) as Login_consistency_Feb_2023 ,
    avg(Arming_Consistency) as Arming_consistency_Feb_2023,
   FROM
    `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`
  WHERE
daily_snapshot_start_date='2023-02-01'
  group by ban
    
)

, ADC_data_Mar_2023 as 

(

SELECT distinct
    ban as BAN,
    1 as Data_flag_Mar_2023,
    avg(login_consistency) as Login_consistency_Mar_2023 ,
    avg(Arming_Consistency) as Arming_consistency_Mar_2023,
   FROM
    `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`
  WHERE
daily_snapshot_start_date='2023-03-01'
  group by ban
    
)

select * from
ADC_base a 
LEFT JOIN ADC_data_Feb_2023 b
on a.BAN=b.BAN
LEFT JOIN ADC_data_Mar_2023 c
on a.BAN=c.BAN

'''

In [8]:
ADC_usage_DF=extract_bq_data(bq_client, sql=Query_ADC_Usage_Data)

In [9]:
ADC_usage_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 787274 entries, 0 to 787273
Data columns (total 9 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   BAN                          787273 non-null  Int64  
 1   BAN_1                        416472 non-null  Int64  
 2   Data_flag_Feb_2023           416472 non-null  Int64  
 3   Login_consistency_Feb_2023   416472 non-null  float64
 4   Arming_consistency_Feb_2023  416472 non-null  float64
 5   BAN_2                        293025 non-null  Int64  
 6   Data_flag_Mar_2023           293025 non-null  Int64  
 7   Login_consistency_Mar_2023   293025 non-null  float64
 8   Arming_consistency_Mar_2023  293025 non-null  float64
dtypes: Int64(5), float64(4)
memory usage: 57.8 MB


In [10]:
ADC_usage_DF.head(10)

Unnamed: 0,BAN,BAN_1,Data_flag_Feb_2023,Login_consistency_Feb_2023,Arming_consistency_Feb_2023,BAN_2,Data_flag_Mar_2023,Login_consistency_Mar_2023,Arming_consistency_Mar_2023
0,601549416,601549416,1,0.0,72.413793,601549416,1,40.0,100.0
1,603644748,603644748,1,0.0,0.0,603644748,1,43.333333,0.0
2,605717820,605717820,1,0.0,27.586207,605717820,1,60.0,46.428571
3,603427273,603427273,1,0.0,0.0,603427273,1,60.0,0.0
4,602978330,602978330,1,0.0,0.0,602978330,1,60.0,0.0
5,605488852,605488852,1,0.0,1.724138,605488852,1,1.666667,0.0
6,605586077,605586077,1,0.0,86.206897,605586077,1,60.0,85.714286
7,600671310,600671310,1,0.0,75.862069,600671310,1,43.333333,89.285714
8,603122248,603122248,1,0.0,0.0,603122248,1,43.333333,0.0
9,605593125,605593125,1,0.0,96.551724,605593125,1,60.0,100.0


In [2]:
from datetime import date, timedelta

# Initialize a BigQuery client
# client = bq_client

# Initialize a BigQuery client
# client = bigquery.Client()

def generate_month_start_dates(start_year, start_month, end_year, end_month):
    dates = []
    current_year, current_month = start_year, start_month
    while current_year < end_year or (current_year == end_year and current_month <= end_month):
        dates.append(date(current_year, current_month, 1))
        current_month += 1
        if current_month > 12:
            current_month = 1
            current_year += 1
    return dates

def generate_query_with_month_suffix(snapshot_date):
    month_name = snapshot_date.strftime("%b_%Y")  # e.g., "Feb_2023"
    query = f"""
    SELECT DISTINCT
        ban AS BAN,
        Segment AS Segment_{month_name},
        1 AS Data_flag_{month_name},
        AVG(login_consistency) AS Login_consistency_{month_name},
        AVG(Arming_Consistency) AS Arming_consistency_{month_name}, 
        MAX(best_practice_all) AS Best_practices_{month_name}
    FROM
        `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`
    WHERE
        daily_snapshot_start_date = '{snapshot_date}'
    GROUP BY
        ban, Segment
    """
    return query

dates_1 = generate_month_start_dates(2023, 2, 2023, 11)

# Initialize an empty DataFrame to act as the base for outer joins
final_df = None

# Loop through each month, run the query, and perform an outer join on the 'BAN' column
for snapshot_date in dates_1:
    print(snapshot_date)
    query = generate_query_with_month_suffix(snapshot_date)
    monthly_df = bq_client.query(query).to_dataframe()
    
    if final_df is None:
        final_df = monthly_df
    else:
        # Perform an outer join on the 'BAN' column
        final_df = pd.merge(final_df, monthly_df, on='BAN', how='outer', suffixes=('', '_dup'))

# After merging, you may find some columns with '_dup' suffixes if there are overlapping column names
# You can choose to handle or rename these as needed, depending on your requirements



2023-02-01
2023-03-01
2023-04-01
2023-05-01
2023-06-01
2023-07-01
2023-08-01
2023-09-01
2023-10-01
2023-11-01


In [3]:
final_df.head()

Unnamed: 0,BAN,Segment_Feb_2023,Data_flag_Feb_2023,Login_consistency_Feb_2023,Arming_consistency_Feb_2023,Best_practices_Feb_2023,Segment_Mar_2023,Data_flag_Mar_2023,Login_consistency_Mar_2023,Arming_consistency_Mar_2023,...,Segment_Oct_2023,Data_flag_Oct_2023,Login_consistency_Oct_2023,Arming_consistency_Oct_2023,Best_practices_Oct_2023,Segment_Nov_2023,Data_flag_Nov_2023,Login_consistency_Nov_2023,Arming_consistency_Nov_2023,Best_practices_Nov_2023
0,123456958,,,,,,,,,,...,Disengaged,1.0,0.0,0.0,0.0,,,,,
1,123457066,,,,,,,,,,...,,,,,,,,,,
2,123457339,,,,,,,,,,...,,,,,,,,,,
3,123457393,Old_Fashion,1.0,0.0,68.965517,3.0,,,,,...,Moderate_Users,1.0,93.333333,89.655172,3.0,Moderate_Users,1.0,93.333333,85.714286,3.0
4,123457881,Disengaged,1.0,0.0,0.0,2.0,Disengaged,1.0,0.0,0.0,...,Disengaged,1.0,0.0,0.0,2.0,Disengaged,1.0,0.0,0.0,2.0


In [4]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14558025 entries, 0 to 14558024
Data columns (total 51 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   BAN                          Int64  
 1   Segment_Feb_2023             object 
 2   Data_flag_Feb_2023           Int64  
 3   Login_consistency_Feb_2023   float64
 4   Arming_consistency_Feb_2023  float64
 5   Best_practices_Feb_2023      float64
 6   Segment_Mar_2023             object 
 7   Data_flag_Mar_2023           Int64  
 8   Login_consistency_Mar_2023   float64
 9   Arming_consistency_Mar_2023  float64
 10  Best_practices_Mar_2023      float64
 11  Segment_Apr_2023             object 
 12  Data_flag_Apr_2023           Int64  
 13  Login_consistency_Apr_2023   float64
 14  Arming_consistency_Apr_2023  float64
 15  Best_practices_Apr_2023      float64
 16  Segment_May_2023             object 
 17  Data_flag_May_2023           Int64  
 18  Login_consistency_May_2023   float64
 19

In [5]:
final_df.isna().sum()*100/final_df.shape[0]

BAN                            67.080699
Segment_Feb_2023                5.804888
Data_flag_Feb_2023              5.804888
Login_consistency_Feb_2023      5.804888
Arming_consistency_Feb_2023     5.804888
Best_practices_Feb_2023         5.804888
Segment_Mar_2023                4.714314
Data_flag_Mar_2023              4.714314
Login_consistency_Mar_2023      4.714314
Arming_consistency_Mar_2023     4.714314
Best_practices_Mar_2023         4.714314
Segment_Apr_2023                1.635565
Data_flag_Apr_2023              1.635565
Login_consistency_Apr_2023      1.635565
Arming_consistency_Apr_2023     1.635565
Best_practices_Apr_2023         1.635565
Segment_May_2023                1.377975
Data_flag_May_2023              1.377975
Login_consistency_May_2023      1.377975
Arming_consistency_May_2023     1.377975
Best_practices_May_2023         1.377975
Segment_Jun_2023                1.051420
Data_flag_Jun_2023              1.051420
Login_consistency_Jun_2023      1.051420
Arming_consisten

In [6]:
config= bigquery.job.LoadJobConfig()

# config._properties['timePartitioning'] = {'field': 'Month_Year'}
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

Table_BQ = 'divg-team-v03-pr-de558a.shs_invol_churn.ADC_Usage_data_Feb2023_Nov2023'

bq_table_instance= bq_client.load_table_from_dataframe(final_df, Table_BQ,job_config=config)

In [7]:
del final_df

In [71]:
from datetime import date, timedelta
import pandas as pd
from google.cloud import bigquery

# Initialize a BigQuery client
client = bq_client

def generate_month_start_dates(start_year, start_month, end_year, end_month):
    dates = []
    current_year, current_month = start_year, start_month
    while current_year < end_year or (current_year == end_year and current_month <= end_month):
        dates.append(date(current_year, current_month, 1))
        current_month += 1
        if current_month > 12:
            current_month = 1
            current_year += 1
    return dates

def generate_query_with_month_suffix(snapshot_date):
    month_name = snapshot_date.strftime("%b_%Y")  # e.g., "Feb_2023"
    query = f"""
    SELECT DISTINCT
        date(daily_snapshot_start_date) as Month_start_date,
        ban AS BAN,
        1 AS Data_flag,
        AVG(login_consistency) AS Login_consistency,
        AVG(Arming_Consistency) AS Arming_consistency, 
        MAX(best_practice_all) AS Best_practices,
        '{month_name}' AS Month_name,
        Segment 
        FROM
        `bi-srv-divg-speech-pr-79f6e9.adc_feature_store.bq_adc_feature_store_daily`
    WHERE
        daily_snapshot_start_date = '{snapshot_date}'
    GROUP BY
        daily_snapshot_start_date,ban,Segment
    """
    return query

dates_1 = generate_month_start_dates(2023, 2, 2024, 1)

# Initialize an empty DataFrame to store results
final_df_long = pd.DataFrame()

# Loop through each month, run the query, and append to the final DataFrame
for snapshot_date in dates_1:
    query = generate_query_with_month_suffix(snapshot_date)
    monthly_df_long = client.query(query).to_dataframe()
    # final_df_long the monthly DataFrame to the final DataFrame
    final_df_long = pd.concat([final_df_long, monthly_df_long], ignore_index=True)



In [72]:
final_df_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6884253 entries, 0 to 6884252
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Month_start_date    dbdate 
 1   BAN                 Int64  
 2   Data_flag           Int64  
 3   Login_consistency   float64
 4   Arming_consistency  float64
 5   Best_practices      float64
 6   Month_name          object 
 7   Segment             object 
dtypes: Int64(2), dbdate(1), float64(3), object(2)
memory usage: 433.3+ MB


In [73]:
final_df_long.head()

Unnamed: 0,Month_start_date,BAN,Data_flag,Login_consistency,Arming_consistency,Best_practices,Month_name,Segment
0,2023-02-01,217824963,1,0.0,12.068966,5.0,Feb_2023,Moderate_Users
1,2023-02-01,602274202,1,0.0,56.896552,2.0,Feb_2023,Old_Fashion
2,2023-02-01,605672208,1,0.0,25.862069,0.0,Feb_2023,Moderate_Users
3,2023-02-01,604692700,1,0.0,8.62069,6.0,Feb_2023,Moderate_Users
4,2023-02-01,601541423,1,0.0,12.068966,5.0,Feb_2023,Moderate_Users


In [75]:
final_df_long.isna().sum()*100/final_df_long.shape[0]

Month_start_date      0.000000
BAN                   0.000872
Data_flag             0.000000
Login_consistency     0.000000
Arming_consistency    0.000000
Best_practices        0.000000
Month_name            0.000000
Segment               0.000000
dtype: float64

In [76]:
config= bigquery.job.LoadJobConfig()

# config._properties['timePartitioning'] = {'field': 'Month_Year'}
config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

Table_BQ = 'divg-team-v03-pr-de558a.shs_invol_churn.ADC_Usage_data_Feb2023_Jan2024_long'

bq_table_instance= bq_client.load_table_from_dataframe(final_df_long, Table_BQ,job_config=config)

In [76]:
Base_Sales_data_query='''



with interim_data as (
select a.*,b.Sales_Channel_tag, b.chnl_org_id, b.chnl_org_txt,b.src_usr_chnl_txt,b.Order_date,b.Sales_Agent_ID,c.*
from `divg-team-v03-pr-de558a.shs_invol_churn.shs_invol_churn_base_data` a

left join `divg-team-v03-pr-de558a.shs_invol_churn.Sales_Order_SMHM_Jan2023_Jan2024` b
on a.Bus_Billing_Account_Num=b.BAN and a.Bus_Prod_Instnc_Id=cast(b.Prod_Instnc_ID as integer)


left join `divg-team-v03-pr-de558a.shs_invol_churn.ADC_Usage_data_Feb2023_Jan2024_long` c
on a.Bus_Billing_Account_Num=c.BAN 
where c.Month_start_date="2023-08-01"

)


select * from interim_data
where Base_Snapshot_Date="2023-09-01"
order by Base_Snapshot_Date,Bus_Billing_Account_Num

'''

In [77]:
Base_DF=extract_bq_data(bq_client, sql=Base_Sales_data_query)

In [78]:
Base_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67895 entries, 0 to 67894
Data columns (total 85 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   CUST_ID                                 67895 non-null  Int64  
 1   Bus_Billing_Account_Num                 67895 non-null  Int64  
 2   Bus_Prod_Instnc_Id                      67895 non-null  Int64  
 3   Base_Snapshot_Date                      67895 non-null  dbdate 
 4   Security_Origin                         67895 non-null  object 
 5   Cultural_Segment                        67895 non-null  object 
 6   Installation_Type                       67895 non-null  object 
 7   Product_Profile                         67895 non-null  object 
 8   Province_Grp                            67895 non-null  object 
 9   Tenure_Month_Groups                     67895 non-null  object 
 10  ACQUIRED_FROM                           3683 non-null   ob

In [79]:
Base_DF.head()

Unnamed: 0,CUST_ID,Bus_Billing_Account_Num,Bus_Prod_Instnc_Id,Base_Snapshot_Date,Security_Origin,Cultural_Segment,Installation_Type,Product_Profile,Province_Grp,Tenure_Month_Groups,...,Order_date,Sales_Agent_ID,Month_start_date,BAN,Data_flag,Login_consistency,Arming_consistency,Best_practices,Month_name,Segment
0,98812035,603748201,1132310330,2023-09-01,Organic,General,Traditional Install,Monitored,QC,7 to 12 Mo,...,NaT,,2023-08-01,603748201,1,46.666667,0.0,4.0,Aug_2023,Moderate_Users
1,99623285,603976520,1115798171,2023-09-01,Organic,General,Traditional Install,Unmonitored,QC,13 to 24 Mo,...,2023-02-14,x224607,2023-08-01,603976520,1,63.333333,0.0,4.0,Aug_2023,Home_automation_Savvy
2,99907228,604062443,1133514362,2023-09-01,Organic,General,Traditional Install,Monitored,QC,7 to 12 Mo,...,NaT,,2023-08-01,604062443,1,0.0,0.0,0.0,Aug_2023,Disengaged
3,100448132,604194702,1138217834,2023-09-01,Organic,General,Traditional Install,Monitored,QC,36+,...,NaT,,2023-08-01,604194702,1,86.666667,6.666667,4.0,Aug_2023,Home_automation_Savvy
4,100472558,604202926,1109711028,2023-09-01,Organic,General,Traditional Install,Monitored,QC,36+,...,NaT,,2023-08-01,604202926,1,3.333333,10.0,3.0,Aug_2023,Moderate_Users


In [80]:
Base_DF['Order_date'].value_counts()

Order_date
2023-02-14    1945
2023-07-05     130
2023-08-24     127
2023-08-23     125
2023-08-04     124
              ... 
2023-10-29       1
2023-11-07       1
2024-01-15       1
2024-01-25       1
2023-09-01       1
Name: count, Length: 342, dtype: int64

In [81]:
Base_DF.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67895 entries, 0 to 67894
Data columns (total 85 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   CUST_ID                                 67895 non-null  Int64  
 1   Bus_Billing_Account_Num                 67895 non-null  Int64  
 2   Bus_Prod_Instnc_Id                      67895 non-null  Int64  
 3   Base_Snapshot_Date                      67895 non-null  dbdate 
 4   Security_Origin                         67895 non-null  object 
 5   Cultural_Segment                        67895 non-null  object 
 6   Installation_Type                       67895 non-null  object 
 7   Product_Profile                         67895 non-null  object 
 8   Province_Grp                            67895 non-null  object 
 9   Tenure_Month_Groups                     67895 non-null  object 
 10  ACQUIRED_FROM                           3683 non-null   ob

In [82]:
Base_DF.isna().sum()*100/Base_DF.shape[0]

CUST_ID                                    0.000000
Bus_Billing_Account_Num                    0.000000
Bus_Prod_Instnc_Id                         0.000000
Base_Snapshot_Date                         0.000000
Security_Origin                            0.000000
Cultural_Segment                           0.000000
Installation_Type                          0.000000
Product_Profile                            0.000000
Province_Grp                               0.000000
Tenure_Month_Groups                        0.000000
ACQUIRED_FROM                             94.575447
DIY_Activated                              0.000000
price_plan                                 0.265115
Acquisition_Source                         0.000000
Activation_Ts__Month___Year_               0.000000
bi_chnl_tag_cd                            47.281832
Channel                                    0.000000
Channel__Best_                             0.000000
Channel_Category                           0.000000
CHNL_ORG_TXT

In [83]:
Base_DF['Segment'].fillna('No_Segment',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Base_DF['Segment'].fillna('No_Segment',inplace=True)


In [84]:
Base_DF['date_status'] = np.where(Base_DF['Order_date'].isna(), 'Older_Order', 'Order_in_last12_months')


In [85]:
Base_DF.to_csv('Invol_churn_Analysis_Sep2023.csv',index=False)

In [33]:
Base_DF_subset=Base_DF[Base_DF['date_status']=='Order_in_last12_months']

In [42]:
pd.DataFrame(Base_DF_subset.groupby(['Channel_Category','Security_Plan_Group']).agg(
    
    Customer_count= ('Bus_Billing_Account_Num','nunique'),
    Churn_rate=('churn',lambda x: round(x.mean()*100,2)),
    Invol_Churn=('invol',lambda x: x.sum()),
    Invol_Churn_rate=('invol',lambda x: round(x.mean()*100,2)),

    # AVG_SMHM_ARPU=('ARPU_SMHM_intial',lambda x: round(x.mean(),0)),
    # AVG_SMHM_ARPU_final=('ARPU_SMHM_final',lambda x: round(x.mean(),0)),
    # Login_consistency=('Login_consistency',lambda x: round(x.mean(),0)),
    # prod_mix_intial=('prod_mix_intial',lambda x: round(x.mean(),1)),
    # prod_mix_final=('prod_mix_final',lambda x: round(x.mean(),1)) 


    # Email_Opened_count=('OPENED','sum'),
    # CT_Opened_count=('CLICKTHROUGH','sum'),
    # softbounce_count=('SOFTBOUNCE','sum'),
    # Hardbounce_count=('HARDBOUNCE','sum'),
    # Unsub_count=('UNSUBSCRIBE','sum'),
    # campaign_date_min=('Campaign_date','min'),
    # campaign_date_max=('Campaign_date','max')

    
    # # Customer_Share= ('customer_id',lambda x:x.count()*100/Merge_DF_4.shape[0])
    # Churn_total=('Telus_Churn_Flag',lambda x: x.sum()),
    # Churn_rate=('Telus_Churn_Flag',lambda x: x.mean()*100)
).reset_index())

Unnamed: 0,Channel_Category,Security_Plan_Group,Customer_count,Churn_rate,Invol_Churn,Invol_Churn_rate
0,Client Care,Control,2920,2.35,52,1.74
1,Client Care,Other,2,0.0,0,0.0
2,Client Care,Secure,1380,1.69,19,1.34
3,Client Care,Smart Automation,1955,2.89,36,1.83
4,Client Care,Smart Camera,333,4.46,8,2.38
5,Corp Stores,Control,150,3.97,4,2.65
6,Corp Stores,Other,2,0.0,0,0.0
7,Corp Stores,Secure,252,4.69,3,1.17
8,Corp Stores,Smart Automation,491,5.28,13,2.64
9,Corp Stores,Smart Camera,771,5.92,21,2.7
