In [1]:
import Bens_forecasting_utils as fc
import pandas as pd
import datetime
import numpy as np
import logging
import sys
from pandas.tseries.offsets import MonthEnd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Google Sheets spreadsheet ids and tab ranges read by the cells below.
parameters = {
    # cs kycc forecast converter:
    'sheet_id':               '1a_v0g2y5DvqLDbXXJC0m7i3PjF2AseC3lqoRwlGxg1o',
    'raw_marketing_forecast': 'raw_marketing_forecast!A1:S',
    'tnc_to_lang':            'Tnc_to_Lang!A1:F',
    'intra_month':            'kycc_by_lang_intra-month_seasonality!A1:D',
    'intra_week':             'kycc_by_lang_intra-week_seasonality!A1:E',
    'cohorts':                'cohorts!A1:B',
    'kycc_volume':            'kycc_volume!A1:F',
    # holiday offsets live in a separate spreadsheet:
    'holiday_sheet_id':       '199I1PNPhdbOW_ytQK_zLoby_J0Wk1XHA1spXNqEbMuE',
    'holiday_range':          'holidays!A1:F',
}

### Generating `df_cohorts`

#### Converting the monthly marketing KYCC forecast into daily KYCC cohorts

In [3]:
def generate_dates(month_year):
    """Return every calendar day of the month named by ``month_year``.

    Parameters
    ----------
    month_year : str
        Month label in ``'%b %y'`` form, e.g. ``'Jan 24'``.

    Returns
    -------
    list of pandas.Timestamp
        One timestamp per day, from the first to the last day of that month.

    Raises
    ------
    ValueError
        If ``month_year`` cannot be parsed with the ``'%b %y'`` format.
    """
    first_day = pd.to_datetime(month_year, format='%b %y', errors='coerce')

    # A coerced NaT would otherwise only fail inside pd.date_range, with a
    # message that does not say which label was bad.
    if pd.isna(first_day):
        raise ValueError(
            f"Cannot parse month label {month_year!r}; expected format '%b %y' (e.g. 'Jan 24')"
        )

    last_day = first_day + MonthEnd(1)

    return pd.date_range(start=first_day, end=last_day).to_list()

In [4]:
def cohort_df(sheet_id=parameters['sheet_id'],
              raw_marketing_forecast=parameters['raw_marketing_forecast'],
              tnc_to_lang=parameters['tnc_to_lang'],
              intra_month=parameters['intra_month'],
              intra_week=parameters['intra_week'],
              cohorts=parameters['cohorts']):
    """Turn the monthly marketing KYCC forecast into daily cohorts per business line.

    Steps:
      1. Read the monthly forecast (one row per KYCC region, one column per month),
         strip thousands separators and drop the 'Total' row.
      2. Map each region to a TnC country code and split its monthly total across
         languages using the percentage table read from ``tnc_to_lang``.
      3. Sum per month and language, then expand each month into one row per day.
      4. Spread the monthly value evenly over the days, apply the intra-month and
         intra-week seasonality factors, and rescale so the days of each
         month/language add back up to the monthly value.
      5. Join the ``cohorts`` table on ``language`` to obtain ``business_line_alias``.

    Parameters
    ----------
    sheet_id : str
        Spreadsheet id passed to ``fc.import_gsheet_to_df`` for every read.
    raw_marketing_forecast, tnc_to_lang, intra_month, intra_week, cohorts : str
        Tab ranges within ``sheet_id`` for the respective inputs.

    Returns
    -------
    pandas.DataFrame
        Columns ``cohort_start_date``, ``business_line_alias``, ``cohort_size``.
    """
    # --- monthly forecast per KYCC region ---
    df = fc.import_gsheet_to_df(sheet_id, raw_marketing_forecast) #importing data from gsheet
    df.iloc[:,1:] = df.iloc[:,1:].replace(',','',regex=True).astype('int32') #converting integers

    # .copy() so the column added below is written to an independent frame
    # rather than to a filtered slice of the imported data.
    df = df[df['KYCC']!='Total'].copy() #removing total

    c_code = {'Germany':'DEU','Austria':'AUT','Spain':'ESP','France':'FRA','Italy':'ITA',
              'Greater Europe':'GrE','Non-Euro':'NEuro'}

    df['TnC Country'] = df['KYCC'].map(c_code)

    # Regions missing from c_code get a null merge key and would otherwise
    # disappear from the forecast without any trace.
    unmapped = df.loc[df['TnC Country'].isna(), 'KYCC'].unique().tolist()
    if unmapped:
        logging.warning('cohort_df: no TnC country code for KYCC regions %s; '
                        'they will not match a row of the TnC table', unmapped)

    df = df.melt(id_vars=['KYCC','TnC Country'],var_name='Month',value_name='Total')

    # --- split each region's monthly total across languages ---
    tnc = fc.import_gsheet_to_df(sheet_id, tnc_to_lang)
    tnc.iloc[:,1:] = tnc.iloc[:,1:].replace('%','',regex=True).astype('float')

    merged_df = pd.merge(df, tnc, on='TnC Country')

    # every non-key column of tnc is a percentage share of the region total
    for col in tnc.columns:
        if col != 'TnC Country':
            merged_df[col] = merged_df['Total'] * (merged_df[col] / 100)

    df_melt = pd.melt(merged_df, id_vars=['Month'], value_vars=['de','en','es','fr','it'],
                      var_name='language', value_name='value')
    df_melt = df_melt.groupby(['Month','language'])['value'].sum().reset_index()

    # --- expand each month into one row per calendar day ---
    df_melt['date'] = df_melt['Month'].str.strip().apply(generate_dates)
    data = df_melt.explode('date').reset_index(drop=True)
    data = data.sort_values(by=['date','language'])

    data['dom'] = data['date'].dt.day
    data['dow'] = data['date'].dt.weekday + 1  # 1 = Monday ... 7 = Sunday
    # the seasonality tables are joined on a column called business_line_alias
    data = data.rename(columns={'language':'business_line_alias'})

    # --- seasonality factors ---
    intra_m = fc.import_gsheet_to_df(sheet_id, intra_month)
    intra_w = fc.import_gsheet_to_df(sheet_id, intra_week)

    intra_m['dom'] = intra_m['dom'].astype('int32')
    intra_w['dow'] = intra_w['dow'].astype('int32')

    data = pd.merge(data, intra_m, how='left', on=['dom','business_line_alias'])
    data = pd.merge(data, intra_w, how='left', on=['dow','business_line_alias'])
    data = data.rename(columns={'seasonality':'intra_week_seasonality'})
    data = data[['Month','date','business_line_alias','value','dom','dow',
                 'intra_month_seasonality','intra_week_seasonality']].copy()

    # even daily share of the monthly value
    data['daily'] = data['value'] / data['date'].dt.days_in_month

    factor_cols = ['daily','intra_month_seasonality','intra_week_seasonality']
    data[factor_cols] = data[factor_cols].astype('float')
    data['daily_adj'] = (data['daily'] * (1+data['intra_month_seasonality'])) * (1+data['intra_week_seasonality'])

    # rescale so that the adjusted days of each month/language sum to the monthly value
    data['daily_adj_total'] = data.groupby(['Month','business_line_alias'])['daily_adj'].transform('sum')
    data['final_daily'] = (1+(data['value'] - data['daily_adj_total']) / data['daily_adj_total']) * data['daily_adj']

    data = (
        data[['date','business_line_alias','final_daily']]
        .rename(columns={'business_line_alias':'language'})
    )

    # --- attach the business line alias for each language ---
    cohorts_1 = fc.import_gsheet_to_df(sheet_id, cohorts)
    df_cohorts = pd.merge(data, cohorts_1, how='left', on='language')
    df_cohorts = df_cohorts.rename(columns={'date':'cohort_start_date','final_daily':'cohort_size'})
    df_cohorts = df_cohorts[['cohort_start_date','business_line_alias','cohort_size']]

    return df_cohorts

In [5]:
# Build the daily cohort table using cohort_df's default sheet parameters.
df_cohorts = cohort_df()

### Generating `df_vol_distro`: KYCC contact rates by `business_line_alias`

In [6]:
def vol_distro(sheet_id=parameters['sheet_id'],
              kycc_volume=parameters['kycc_volume']):
    """Load the contact-rate table and key it by business line alias.

    Parameters
    ----------
    sheet_id : str
        Spreadsheet id passed to ``fc.import_gsheet_to_df``.
    kycc_volume : str
        Tab range holding ``days_since_kycc``, ``contact_language``, ``channel``
        and ``cs_contact_rate`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``date_offset`` (from ``days_since_kycc``), ``business_line_alias``
        (``'ops-cs-L1-<contact_language>-<channel>'``) and ``vol_distro``
        (from ``cs_contact_rate``).
    """
    kycc = fc.import_gsheet_to_df(sheet_id, kycc_volume)
    kycc['business_line_alias'] = 'ops-cs-L1-' + kycc['contact_language'] + '-' + kycc['channel']

    # Select and rename in one chain: this yields a new frame instead of
    # renaming in place on a column subset of ``kycc``.
    df_vol_distro = (
        kycc[['days_since_kycc','business_line_alias','cs_contact_rate']]
        .rename(columns={'days_since_kycc':'date_offset','cs_contact_rate':'vol_distro'})
    )

    return df_vol_distro

In [7]:
# Load the contact-rate distribution using vol_distro's default sheet parameters.
df_vol_distro = vol_distro()

---
###### Join each cohort with corresponding volume distribution to create forecast:

In [8]:
%%time
# merge cohorts with volume distribution & calculate forecast values
grouping_column           = 'business_line_alias'
df_fc = df_cohorts.merge(df_vol_distro, left_on=[grouping_column], right_on=[grouping_column], how='inner')
df_fc['cohort_start_date'] = pd.to_datetime(df_fc['cohort_start_date'], format='%d/%m/%Y')
df_fc['cohort_size'] = pd.to_numeric(df_fc['cohort_size'])
df_fc['vol_distro'] = pd.to_numeric(df_fc['vol_distro'])
df_fc['date_offset'] = pd.to_numeric(df_fc['date_offset'])
# Series.max() is vectorised and, unlike the builtin max(), does not raise on an empty frame
print(df_fc['cohort_start_date'].max())

# each cohort contributes volume on cohort_start_date + date_offset days
df_fc['forecast_date'] = df_fc['cohort_start_date'] + pd.to_timedelta(df_fc['date_offset'], unit='D')
df_fc['forecast_base'] = df_fc['cohort_size'] * df_fc['vol_distro']

print(df_fc['forecast_date'].max())

# Sum the contributions of all cohorts landing on the same day and business line.
# The aggregation is named by string: passing the builtin `sum` is deprecated in recent pandas.
df_fc_grouped = df_fc.groupby(['forecast_date', grouping_column]).agg(
    forecast = pd.NamedAgg(column='forecast_base', aggfunc='sum')
)
print(df_fc_grouped)

2025-12-31 00:00:00
2026-02-04 00:00:00
                                   forecast
forecast_date business_line_alias          
2024-07-28    ops-cs-L1-de-call    0.000000
              ops-cs-L1-de-chat    0.000000
              ops-cs-L1-de-email   0.000000
              ops-cs-L1-en-call    0.000000
              ops-cs-L1-en-chat    0.000000
...                                     ...
2026-02-04    ops-cs-L1-fr-chat    1.517387
              ops-cs-L1-fr-email   1.487039
              ops-cs-L1-it-call    0.114473
              ops-cs-L1-it-chat    0.715454
              ops-cs-L1-it-email   0.400655

[8355 rows x 1 columns]
CPU times: user 451 ms, sys: 16.9 ms, total: 468 ms
Wall time: 465 ms


---
###### Seasonally adjust the forecast with intra-week and intra-month seasonality:

In [9]:
# Spreadsheet id and tab range holding the intra-month seasonality factors.
parameters_month = {
    # month seasonality:
    'sheet_id': '1qPVSHGL6kxQ-JDStTVAET4fVWIv9aVBu_gHzziMVEuc',
    'tab':      'intra-month_seasonality!A1:D',
}

In [10]:
# Spreadsheet id and tab range holding the intra-week seasonality factors.
parameters_week = {
    # week seasonality:
    'sheet_id': '1qPVSHGL6kxQ-JDStTVAET4fVWIv9aVBu_gHzziMVEuc',
    'tab':      'intra-week_seasonality!A1:E',
}

In [11]:
# adjust seasonality function
def adj_seasonality(df,parameters_month=parameters_month,parameters_week=parameters_week):
    """Apply intra-month, intra-week and holiday adjustments to a forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Forecast with a ``forecast`` column; ``forecast_date`` (datetime) and
        ``business_line_alias`` must be available after ``reset_index()``.
    parameters_month, parameters_week : dict
        Each has ``'sheet_id'`` and ``'tab'`` entries locating a seasonality table.
        The month table needs ``business_line_alias``, ``dom`` and
        ``intra_month_seasonality``; the week table needs ``business_line_alias``,
        ``dow`` and ``seasonality``.

    Returns
    -------
    pandas.DataFrame
        Columns ``forecast_date`` (midnight timestamps), ``business_line_alias``
        and the adjusted ``forecast``.

    Notes
    -----
    Holiday offsets are read from the module-level ``parameters`` dict
    (``holiday_sheet_id`` / ``holiday_range``).
    """
    # reset index
    df = df.reset_index()

    # getting day of week (1 = Monday ... 7 = Sunday) and day of month
    df['dow'] = df['forecast_date'].dt.weekday+1
    df['dom']  = df['forecast_date'].dt.day

    # getting seasonality numbers
    month = fc.import_gsheet_to_df(parameters_month['sheet_id'], parameters_month['tab'])
    week = fc.import_gsheet_to_df(parameters_week['sheet_id'], parameters_week['tab'])

    # Keep only the join keys and the factor from each sheet, with explicit dtypes.
    # Extra sheet columns (e.g. avg_vol) then cannot collide with forecast columns,
    # and a sheet without avg_vol no longer raises KeyError.
    month = (
        month[['business_line_alias','dom','intra_month_seasonality']]
        .astype({'dom':'int32','intra_month_seasonality':'float'})
    )
    week = (
        week[['business_line_alias','dow','seasonality']]
        .astype({'dow':'int32','seasonality':'float'})
    )

    # merging with month and week factors
    df = df.merge(month,how='left',on=['business_line_alias','dom'])
    df = df.merge(week,how='left',on=['business_line_alias','dow'])

    # adjusting the forecast
    df['adj_forecast'] = df['forecast']*(1+df['intra_month_seasonality'])*(1+df['seasonality'])

    # choosing columns and renaming the adjusted value back to 'forecast'
    df = (
        df[['forecast_date','business_line_alias','adj_forecast']]
        .rename(columns={'adj_forecast':'forecast'})
    )

    # NOTE(review): a row with no matching seasonality factor has a NaN adjusted
    # forecast and is set to 0 here, not left unadjusted - confirm this is intended.
    df['forecast'] = df['forecast'].fillna(0)

    # drop any time-of-day component so dates compare equal to holiday dates
    df['forecast_date'] = pd.to_datetime(df['forecast_date']).dt.normalize()

    # holiday offsets: percentage strings such as '-30%' become fractions
    df_holidays = fc.import_gsheet_to_df(parameters['holiday_sheet_id'], parameters['holiday_range'])
    df_holidays['holiday_date'] = pd.to_datetime(df_holidays['holiday_date'], format='%d/%m/%Y')
    df_holidays['holiday_offset_%'] = df_holidays['holiday_offset_%'].str.rstrip('%').astype('float') / 100.0
    df_holidays = df_holidays[['holiday_date','business_line_alias','holiday_offset_%']]

    # NOTE(review): several holiday rows for the same date and business line would
    # duplicate the forecast row in this left merge - confirm the sheet is unique per pair.
    df = df.merge(df_holidays, how='left', left_on=['forecast_date', 'business_line_alias'], right_on=['holiday_date', 'business_line_alias'])
    df['holiday_offset_%'] = df['holiday_offset_%'].fillna(0.0)  # no holiday -> no offset
    df['holiday_vol_offset'] = df['forecast'] * df['holiday_offset_%']
    df['forecast'] = df['forecast'] + df['holiday_vol_offset']

    # return an independent frame so callers can modify it freely
    df = df[['forecast_date','business_line_alias','forecast']].copy()

    return df

In [12]:
# Apply the seasonality and holiday adjustments to the grouped forecast.
# NOTE: this overwrites df_fc_grouped with the adjusted result, so re-running this
# cell without first re-running the cell that builds df_fc_grouped passes the
# already-adjusted frame back into adj_seasonality.
df_fc_grouped = adj_seasonality(df = df_fc_grouped)

In [13]:
# date column must be turned into strings because datetime type is not JSON serialisable.
# The formatted column is put on a copy, so df_fc_grouped keeps its datetime dtype
# and this cell can be re-run without failing on the .dt accessor.
df_fc_export = df_fc_grouped.assign(
    forecast_date=df_fc_grouped['forecast_date'].dt.strftime('%Y-%m-%d %H:%M:%S')
)

# export parameters:
gsheet_export_params = dict(

    df                 = df_fc_export,
    gsheet_id          = parameters['sheet_id'],
    gsheet_tab_name    = 'kycc_cs_vol',
    include_df_headers = True,
    tab_colour         = (0.0, 0.0, 0.0) #RGB tab colour
)

fc.export_df_to_google_sheet(**gsheet_export_params)