In [1]:
import pandas as pd
import numpy as np
import itertools as it
import os
import io
import logging

import boto3
import sys
import json
import snowflake.connector

# Reading Data 

In [2]:
input_bucket = "hbo-ingest-datascience-content-dev"

In [3]:
logger = logging.getLogger()
logger.info(f'Loading inputs')
data_list =[]

s3 = boto3.resource('s3')
bucket = s3.Bucket(input_bucket)

for obj in bucket.objects.filter(Prefix='cost_allocation/dev/df_churn_2309'): #churn_metric_0811
            key = obj.key
            logger.info('Loading csv file {}'.format(key))
            body = obj.get()['Body']
            print('Reading {0} features'.format(key))
            churn_metric = pd.read_csv(body, na_values = [r'\\\\N'])



Reading cost_allocation/dev/df_churn_2309.csv features


In [6]:
df_churn = churn_metric.copy()

In [7]:
df_churnp = df_churn.pivot(index=['month','sub_month'], columns='is_cancel', values='subscriber_count').reset_index()
df_churnp = df_churnp.rename(columns={True: 'is_cancel_true', False: 'is_cancel_false'})
df_churnp=df_churnp.reset_index()


df_churnp['total'] = df_churnp['is_cancel_false'] +  df_churnp['is_cancel_true']
df_churnp['churn_rate'] = df_churnp['is_cancel_true']/df_churnp['total']
df_churnp['tenure_grp'] = 0
df_churnp['tenure_grp'] = np.where(df_churnp['sub_month']<=3, '1-3', df_churnp['tenure_grp'])
df_churnp['tenure_grp'] = np.where((df_churnp['sub_month']>3) & (df_churnp['sub_month']<=6), '4-6', df_churnp['tenure_grp'])
df_churnp['tenure_grp'] = np.where((df_churnp['sub_month']>6) & (df_churnp['sub_month']<=12), '6-12', df_churnp['tenure_grp'])
df_churnp['tenure_grp'] = np.where((df_churnp['sub_month']>12), '>12', df_churnp['tenure_grp'])
df_churnp['composition'] = df_churnp['total'] / df_churnp.groupby('month')['total'].transform('sum')
display(df_churnp.head())

df_churnpm = df_churnp[(df_churnp.sub_month<=24)].groupby(by=['month']).sum().reset_index()
df_churnpm['churn_rate'] = df_churnpm['is_cancel_true']/df_churnpm['total']
display(df_churnpm.head())

is_cancel,index,month,sub_month,is_cancel_false,is_cancel_true,total,churn_rate,tenure_grp,composition
0,0,2022-01-01,1,1000221,330175,1330396,0.248178,1-3,0.127524
1,1,2022-01-01,2,495093,91248,586341,0.155623,1-3,0.056203
2,2,2022-01-01,3,552589,76743,629332,0.121944,1-3,0.060324
3,3,2022-01-01,4,267510,32818,300328,0.109274,4-6,0.028788
4,4,2022-01-01,5,455599,48277,503876,0.095811,4-6,0.048299


is_cancel,month,index,sub_month,is_cancel_false,is_cancel_true,total,churn_rate,composition
0,2022-01-01,276,300,9509743,922747,10432490,0.088449,1.0
1,2022-02-01,852,300,9361360,787995,10149355,0.07764,1.0
2,2022-03-01,1428,300,9575767,801067,10376834,0.077198,1.0
3,2022-04-01,2004,300,9413842,766149,10179991,0.07526,1.0
4,2022-05-01,2580,300,9412058,718541,10130599,0.070928,1.0


# My Cal

In [8]:
churn_genpop_tenure = pd.read_csv('churn_genpop_tenure.csv')


In [13]:
churn_genpop_tenure.rename(columns = {'date_month':'month'}, inplace = True)

In [10]:
df_churn.head()

Unnamed: 0.1,Unnamed: 0,month,sub_month,is_cancel,subscriber_count
0,1710,2022-01-01,1,False,1000221
1,1697,2022-01-01,1,True,330175
2,1690,2022-01-01,2,False,495093
3,1679,2022-01-01,2,True,91248
4,1661,2022-01-01,3,False,552589


In [14]:
diff = df_churn.merge(churn_genpop_tenure, on = ['month', 'sub_month', 'is_cancel'],
                     how = 'left')

In [23]:
churn_genpop_tenure['total_subscriber'] = churn_genpop_tenure['subscriber_count'].groupby(churn_genpop_tenure['month']).transform('sum')
churn_genpop_tenure['churn_rate'] = churn_genpop_tenure['subscriber_count']/churn_genpop_tenure['total_subscriber']
churn_genpop_tenure = churn_genpop_tenure[churn_genpop_tenure['is_cancel'] == True]
churn_genpop_tenure = churn_genpop_tenure.sort_values(by = ['month'])

In [26]:
churn_genpop_tenure.head()

Unnamed: 0.1,Unnamed: 0,month,sub_month,is_cancel,subscriber_count,total_subscriber,churn_rate
843,843,2022-01-01,2,True,91248,10432490,0.008747
78,78,2022-01-01,22,True,0,10432490,0.0
866,866,2022-01-01,1,True,330175,10432490,0.031649
214,214,2022-01-01,19,True,14206,10432490,0.001362
356,356,2022-01-01,15,True,6444,10432490,0.000618


In [27]:
df_churnp.head()

is_cancel,index,month,sub_month,is_cancel_false,is_cancel_true,total,churn_rate,tenure_grp,composition
0,0,2022-01-01,1,1000221,330175,1330396,0.248178,1-3,0.127524
1,1,2022-01-01,2,495093,91248,586341,0.155623,1-3,0.056203
2,2,2022-01-01,3,552589,76743,629332,0.121944,1-3,0.060324
3,3,2022-01-01,4,267510,32818,300328,0.109274,4-6,0.028788
4,4,2022-01-01,5,455599,48277,503876,0.095811,4-6,0.048299


In [36]:
churn_genpop_tenure_bucekts = pd.read_csv('churn_genpop_tenure_bucekts.csv')

In [37]:
churn_genpop_tenure_bucekts.sort_values(by = ['date_month', 'tenure']).head()

Unnamed: 0.1,Unnamed: 0,date_month,tenure,is_cancel,subscriber_count
26,26,2022-01-01,Month 1-3,False,2047903
36,36,2022-01-01,Month 1-3,True,498166
7,7,2022-01-01,Month 13+,True,75042
61,61,2022-01-01,Month 13+,False,2658339
65,65,2022-01-01,Month 4-6,False,1109309


In [39]:
churn_genpop_tenure_bucekts['total_subscriber'] = churn_genpop_tenure_bucekts.groupby(['date_month'])['subscriber_count'].transform('sum')
churn_genpop_tenure_bucekts['churn_rate'] = churn_genpop_tenure_bucekts['subscriber_count']/churn_genpop_tenure_bucekts['total_subscriber']
# churn_genpop_tenure_bucekts = churn_genpop_tenure_bucekts[churn_genpop_tenure_bucekts['is_cancel'] == True]

KeyError: ('date_month', 'tenure')

In [34]:
churn_genpop_tenure_bucekts.sort_values(by = ['date_month', 'tenure']).head()

Unnamed: 0.1,Unnamed: 0,date_month,tenure,is_cancel,subscriber_count,total_subscriber,churn_rate
26,26,2022-01-01,Month 1-3,False,2047903,10432675,0.196297
36,36,2022-01-01,Month 1-3,True,498166,10432675,0.047751
7,7,2022-01-01,Month 13+,True,75042,10432675,0.007193
61,61,2022-01-01,Month 13+,False,2658339,10432675,0.254809
65,65,2022-01-01,Month 4-6,False,1109309,10432675,0.10633
