In [1]:
import time
import pandas as pd
from os import listdir
from sqlalchemy import create_engine
import sqlalchemy
from tqdm import tqdm
import datetime
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import gspread_dataframe as gd

## Variables

In [2]:
# Dates of the campaign
dates = ['2022-10-14', '2023-01-06']
# Cluster dictionary
cluster_names = pd.read_excel("/Users/gabrielreynoso/Documents/RFM_LayerA_Dic.xlsx")
# Create the channel dictionary
cluster_dict = cluster_names.set_index('id').to_dict()['name']
# BD Connection
f = open('/Users/gabrielreynoso/Documents/Queries/db_klarprod_connection.txt', 'r')
postgres_str = f.read()
f.close()
cnx = create_engine(postgres_str)
# Read and Load Credentials
credentials = ServiceAccountCredentials.from_json_keyfile_name('/Users/gabrielreynoso/Documents/GoogleCredentials/gabo_credentials.json')
gc = gspread.authorize(credentials)
# Export results
writer = pd.ExcelWriter('./Results/November_ChurnCampaign_Results.xlsx', engine='xlsxwriter')

## Queries

In [3]:
# RFM Query for data on a specific date
query_rfm = '''
        select rfm_layer.user_id,
               max_user.max_date as date,
               rfm_layer.cluster,
               rfm_layer.time,
               rfm_layer.monetary,
               rfm_layer.frequency,
               rfm_layer.recency
        from growth.rfm_history as rfm_layer,
             (select user_id,
                     max(date) as max_date
             from growth.rfm_history
             where date < '{}'::date
             group by user_id) max_user
        where rfm_layer.user_id = max_user.user_id
        and rfm_layer.date = max_user.max_date
        and rfm_layer.user_id in {}
'''
# Purchases for the users in the specific date
query_p = '''
        select
               t.user_id,
               count(transaction_id) as purchases,
                -1*sum(amount) as total_amount
        from analytics_bi.transactions as t
        where t.type in ('PURCHASE')
          and t.timestamp_mx_created_at between {} and  {}
          and t.state = 'SETTLED'
          and t.source_account_internal_id <> '0000000000000000'
          and t.source_account_internal_id <> '00000000-0000-0000-0000-000000000000'
          and t.provider_id <> 'KLAR'
          and t.user_id in {}
        group by t.user_id

'''

In [4]:
cohort_query = '''
select
    klar_user_id as user_id,
    segment_name,
    case when segment_name like '%MAR%' then 'More at risk'
         when segment_name like '%More at risk%' then 'More at risk' else 'Churned' end as cohort,
    case when segment_name like '%Trigger%' then 'Trigger'
         when segment_name like '%Treatment' then 'Treatment' else 'Control' end as type_segment
from is_customer_io.segments
where segment_name like '%RFM November%'
and type_segment != 'Trigger';
'''

In [5]:
cohort_info = pd.read_sql_query(sqlalchemy.text(cohort_query),cnx)

In [6]:
cohort_info.segment_name.value_counts()

Churn RFM November - Churn Treatment       104927
Churn RFM November - Churn Control          69614
Churn RFM November - MAR 0,75 Treatment     57234
Churn RFM November - MAR 0,75 Control       38572
Churn RFM November - MAR 75,- Treatment      8531
Churn RFM November - MAR 75,- Control        5715
Name: segment_name, dtype: int64

In [7]:
cohort_info.head(5)

Unnamed: 0,user_id,segment_name,cohort,type_segment
0,000145c0-671b-4563-bbd0-88cb0f44f420,Churn RFM November - Churn Treatment,Churned,Treatment
1,0001be16-8430-4314-bac4-f23119645859,"Churn RFM November - MAR 0,75 Control",More at risk,Control
2,0001e0b1-14de-48c6-b41f-d0ec240c6e95,Churn RFM November - Churn Control,Churned,Control
3,0001e46b-bade-479e-967f-0481ed7318cb,"Churn RFM November - MAR 0,75 Control",More at risk,Control
4,00026f33-68e7-479f-83aa-36f19a2846cc,Churn RFM November - Churn Treatment,Churned,Treatment


#### Cohorts

In [8]:
# Read a worksheet into a Dataframe
churned = cohort_info[cohort_info.cohort=='Churned']
more_risk_1 = cohort_info[cohort_info.segment_name.str.contains('0,75')]
more_risk_2 = cohort_info[cohort_info.segment_name.str.contains('75,-')]

In [9]:
churned.type_segment.value_counts()

Treatment    104927
Control       69614
Name: type_segment, dtype: int64

In [10]:
churned.columns

Index(['user_id', 'segment_name', 'cohort', 'type_segment'], dtype='object')

## Segments EDA

In [11]:
more_risk_2.type_segment.value_counts()

Treatment    8531
Control      5715
Name: type_segment, dtype: int64

## Processing

In [12]:
# For loop for extracting cohort information
cohorts = {}
cohorts_results = {}
cohorts_movement = {}
segment_names = ['Churned', 'At_Risk_S1', 'At_Risk_S2']
for idx, aux in enumerate([churned, more_risk_1 , more_risk_2]):
    # Get cohort info
    aux = aux[~aux.user_id.isna()][['user_id', 'type_segment']]
    # Dataframe and results dicts
    segments_dict = {}
    segments_results = {}
    segments_movement = {}
    # Print the cohort name
    start_time = time.time()
    overall_treatment = pd.DataFrame()
    i = 0
    for segment in tqdm(aux.type_segment.unique()):
        print(segment)
        # Filter segment
        aux_segment = aux[aux.type_segment == segment]
        # Start of the campaign
        aux_start_campaign = pd.read_sql_query(query_rfm.format(dates[0], tuple(aux_segment.user_id.to_list())), cnx)
        aux_start_campaign = aux_start_campaign.set_index('user_id')
        # Map cluster
        aux_start_campaign['cluster'] = aux_start_campaign['cluster'].map(cluster_dict)
        aux_start_campaign = aux_start_campaign.add_prefix('Pre_')
        # End of the campaign
        aux_end_campaign = pd.read_sql_query(query_rfm.format(dates[1], tuple(aux_segment.user_id.to_list())), cnx)
        aux_end_campaign = aux_end_campaign.set_index('user_id')
        # Map cluster
        aux_end_campaign['cluster'] = aux_end_campaign['cluster'].map(cluster_dict)
        aux_end_campaign = aux_end_campaign.add_prefix('Post_')
        # Merge Pre and Post campaign
        aux_result = pd.merge(aux_start_campaign, aux_end_campaign, left_index=True, right_index=True)
        aux_result = aux_result.reset_index()
        # Purchases
        aux_purchases = pd.read_sql_query(query_p.format(dates[0],dates[1],tuple(aux_segment.user_id.to_list())), cnx)
        # Add Purchases to result info
        aux_result = pd.merge(aux_result, aux_purchases, on='user_id', how='left')
        aux_result['movement'] = aux_result['Pre_cluster'] + '->' + aux_result['Post_cluster']
        # Add to dict of segments cohorts
        segments_dict[segment] = aux_result
        segments_results[segment] = aux_result.groupby('movement')[['purchases','total_amount','Pre_monetary','Pre_frequency', 'Pre_recency','Post_monetary', 'Post_frequency', 'Post_recency']].describe()
        segments_movement[segment] = pd.concat([aux_result.movement.value_counts(), aux_result.movement.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage'))
        # If not control then append to general treatment dataframe
        if segment != 'Control':
            overall_treatment = overall_treatment.append(aux_result)
        # Write the results into an excel
        segments_results[segment].to_excel(writer, startrow=i+3, startcol=0, sheet_name=segment_names[idx])
        segments_movement[segment].to_excel(writer, startrow=i+7+segments_results[segment].shape[0], startcol=0, sheet_name=segment_names[idx])
        worksheet = writer.sheets[segment_names[idx]]
        worksheet.write(i, 0, segment)
        i = i+9+segments_results[segment].shape[0] + segments_movement[segment].shape[0]
    print(str(time.time()-start_time) + ' seconds')
    segments_dict['Overall_Treatment'] = overall_treatment
    segments_results['Overall_Treatment'] = overall_treatment.groupby('movement')[['purchases','total_amount','Pre_monetary','Pre_frequency', 'Pre_recency','Post_monetary', 'Post_frequency', 'Post_recency']].describe()
    segments_movement['Overall_Treatment'] = pd.concat([overall_treatment.movement.value_counts(), overall_treatment.movement.value_counts(normalize=True).mul(100)], axis=1, keys=('counts', 'percentage'))
    # Print the overall results table into excel
    segments_results['Overall_Treatment'].to_excel(writer, startrow=i+3, startcol=0, sheet_name=segment_names[idx])
    segments_movement['Overall_Treatment'].to_excel(writer, startrow=i+7+segments_results['Overall_Treatment'].shape[0], startcol=0, sheet_name=segment_names[idx])
    worksheet = writer.sheets[segment_names[idx]]
    worksheet.write(i, 0, 'Overall_Treatment')
    # Add the segment dict to the cohorts dict
    cohorts[segment_names[idx]] = segments_dict
    cohorts_results[segment_names[idx]] = segments_results
writer.save()

  0%|          | 0/2 [00:00<?, ?it/s]

Treatment


  overall_treatment = overall_treatment.append(aux_result)
 50%|█████     | 1/2 [01:32<01:32, 92.73s/it]

Control


100%|██████████| 2/2 [02:19<00:00, 69.61s/it]


139.2615749835968 seconds


  0%|          | 0/2 [00:00<?, ?it/s]

Control


 50%|█████     | 1/2 [00:35<00:35, 35.19s/it]

Treatment


  overall_treatment = overall_treatment.append(aux_result)
100%|██████████| 2/2 [01:23<00:00, 41.98s/it]


83.98099684715271 seconds


  0%|          | 0/2 [00:00<?, ?it/s]

Control


 50%|█████     | 1/2 [00:34<00:34, 34.14s/it]

Treatment


  overall_treatment = overall_treatment.append(aux_result)
100%|██████████| 2/2 [01:09<00:00, 34.89s/it]

69.77745509147644 seconds



  writer.save()
