In [14]:
import os, pickle
import pandas as pd
import numpy as np
import plotly.express as px
from pycaret.regression import *
from datetime import datetime
import matplotlib.pyplot as plt
import src.paychex_ml.data_loader as dl

In [3]:
## Change the project path
# Root folder of the project; the data and model paths used below are built from it.
# Set the PAYCHEX_PROJECT_PATH environment variable to run the notebook on another
# machine without editing this cell. When the variable is not set, the original
# developer path is used, so existing behaviour is unchanged.
proyect_path = os.environ.get(
    "PAYCHEX_PROJECT_PATH",
    "C:/Users/bruno.gonzalez/DataspellProjects/Paychex_revenue_forecast",
)

In [4]:
# ------------------------------------------------------------------------------------- #
# This is where we change parameters to the model
# ------------------------------------------------------------------------------------- #
# First and last period of the prediction window, written as 'YYYYMMDD' strings.
pred_start_dt = '20210601'
pred_end_dt = '20220101'
# Number of month-start periods in the window, both endpoints included.
# Derived from the two dates rather than typed by hand: the UTS forecast length and
# the month-start date axis built for the final plot must have the same length, so a
# hand-maintained number could drift out of sync. Evaluates to 8 for the dates above.
forecast_window = len(pd.date_range(start=pred_start_dt, end=pred_end_dt, freq='MS'))

In [24]:
# Run date ('YYYYMMDD') of the saved models to load; it is used as the name of the
# sub-folder under data/models. Edit by hand to point at a different run.
model_run_date = '20220512'

In [25]:
# ------------------------------------------------------------------------------------- #
# Forecast target selection
# ------------------------------------------------------------------------------------- #
# Every selectable target column, mapped to the `has_drivers` flag that goes with it.
# `has_drivers` decides whether the driver table is loaded and merged further down.
# Keeping the pairs in one table (instead of commenting/uncommenting two lines per
# target) guarantees the flag always matches the chosen target.
TARGET_HAS_DRIVERS = {
    # --- Total Payroll Section ---
    'Payroll blended products': True,
    'W-2 Revenue': False,
    'Delivery Revenue': False,
    'ASO Allocation': False,
    'Other Processing Revenue': False,
    'SurePayroll.': True,
    'Total international': False,

    # --- Total 401K Section ---
    '401K Fee Revenue': True,
    '401K Asset fee & BP Revenue': True,

    # --- Total ASO Revenue ---
    'HR Solutions (PEO)': False,
    'ASO Revenue - Oasis': False,

    # --- Total Online Services ---
    'HR Online': False,
    'Time & Attendance': False,

    # --- Other Management Solutions ---
    'Total Paychex Advance': True,
    'Full Service Unemployment Revenue': True,
    'ESR Revenue': True,
    'Cafeteria Plans Revenue': True,
    'Benetrac': True,
    'Emerging Products': True,

    # --- Total PEO ---
    'Total PEO': False,

    # --- Total Insurance Services ---
    'Workers Comp - Payment Services': True,
    'Health Benefits': True,

    # --- Listed on its own, outside the sections above ---
    'Interest on Funds Held for Clients': False,

    # --- Total Level 1 ---
    'Total Payroll Revenue.': True,
    'Total 401k': True,
    'Total ASO Revenue': False,
    'Total Online Services': True,
    'Other Management Solutions': True,
    'Total Insurance Services': True,

    # --- Total Level 2 ---
    'Total Revenue': False,
}

# Change ONLY this line to forecast a different item. The name must match a key of
# TARGET_HAS_DRIVERS exactly (note that some names end with a '.').
target_col = 'Total Payroll Revenue.'

# Raises KeyError if target_col is not a known target; add new targets to the table.
has_drivers = TARGET_HAS_DRIVERS[target_col]

In [26]:
# Column labels for the ML and UTS predictions of the selected target:
# the target name followed by a fixed suffix.
ml_col, uts_col = (
    target_col + suffix for suffix in (' - ML Predicted', ' - UTS Predicted')
)
# `level` is passed to the data loader below. `has_actuals` is not read anywhere in
# the visible part of this notebook.
has_actuals, level = True, 1

# Get Data

In [40]:
# Location of the cleaned table that contains the target series.
file_path = proyect_path + "/data/clean/table_predictable.csv"
# Load the data through the project loader, passing the prediction window and `level`.
# NOTE(review): presumably the loader restricts rows to the window - confirm in
# src.paychex_ml.data_loader.
df = dl.get_clean_data(
    pred_start_dt,
    pred_end_dt,
    file_path,
    level=level,
)

In [41]:
# Reduce the frame to the date key and the selected target series.
df = df.loc[:, ['Calendar Date', target_col]]
drive_path = proyect_path + "/data/clean/table_drivers.csv"
if has_drivers:
    # Driver data for this target, fetched through the project loader.
    driv_df = dl.get_clean_driver_data(pred_start_dt, pred_end_dt, target_col, drive_path)
    # NOTE(review): an earlier version dropped a 'Scenario' column from driv_df at this
    # point (driv_df.drop(columns=['Scenario'], inplace=True)); that step is disabled.
    # Inner join on the date key: only dates present in both frames are kept.
    df = df.merge(driv_df, on='Calendar Date', how='inner')

In [42]:
external_path = proyect_path + "/data/external/external_data_fred.csv"
# Read the external table with its 'date' column kept as text, then rename that column
# so it can be used as the join key shared with `df`.
ext_df = pd.read_csv(external_path, dtype={'date': str})
ext_df = ext_df.rename(columns={'date': 'Calendar Date'})
# Alternative loader, currently disabled:
#ext_df = dl.get_external_data(train_start_dt, pred_end_dt)
# Inner join on the date key: only dates present in both frames are kept.
df = df.merge(ext_df, on='Calendar Date', how='inner')

In [43]:
# Convert the date key to pandas datetimes now that all merges on it are done.
calendar_dates = pd.to_datetime(df['Calendar Date'])
df['Calendar Date'] = calendar_dates

In [32]:
# Folder holding the artefacts of the selected model run. It is defined here because
# this is the first cell that needs it: the original notebook only assigned
# `model_path` in a later cell, so a top-to-bottom run on a fresh kernel failed with
# NameError. The later cell assigns the same value again, which is harmless.
model_path = proyect_path + "/data/models/" + model_run_date

# Load the list of feature columns stored next to the model for this target.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load
# files produced by a trusted training run.
features_file = model_path + '/{}_features.pkl'.format(target_col)
with open(features_file, "rb") as fp:
    feature_cols = pickle.load(fp)

In [34]:
# Columns to keep: the date key and the target first, then the model's feature columns.
id_and_target = ['Calendar Date', target_col]
keeps = id_and_target + feature_cols

In [45]:
# Restrict the frame to the date key, the target and the feature columns, in that order.
df = df.loc[:, keeps]

# Models

In [29]:
# Folder with the saved models of the selected run: <project>/data/models/<run date>.
models_root = proyect_path + "/data/models/"
model_path = models_root + model_run_date

## ML Models

In [30]:
# Load the saved ML model for the selected target. The name is passed without a file
# extension, as in the original call.
ml_model_name = '/{}_model'.format(target_col)
model = load_model(model_path + ml_model_name)

Transformation Pipeline and Model Successfully Loaded


In [46]:
# Score the prepared frame with the loaded ML model. In the output captured below, the
# returned frame holds the input columns plus a 'Label' column.
predictions = predict_model(
    estimator=model,
    data=df,
)

In [47]:
# Display the scored frame (one row per date in the prediction window).
predictions

Unnamed: 0,Calendar Date,Total Payroll Revenue.,Total Flex Blended Products Revenue/Phoenix RSC/Sales - RW,PR PRODUCT/1506 SP BOP PARTNER/Losses - RW,Product NA/6000 CORP OFFICE/# of Weeks - RW,Total Preview Blended Products Revenue/0456 TWIN CITIES MMS/Net Client Gain - RW,Total Flex Blended Products Revenue/0922 DSC Central/Net Client Gain - RW,MHHNGSP,Total Advantage Blended Products Revenue/ADVANTAGE/PACE/Net Client Gain - RW,HOUST,PR PRODUCT/1502 SP ETAP/Net Client Gain - RW,REPORTS 100/1501 SP Direct GB/SurePayroll Revenue Share,Label
0,2021-06-01,157525300.0,975.0,-47.0,5.0,-11.0,0.0,3.26,-208.0,1657.0,473.0,-7128.0,211486900.0
1,2021-07-01,160305500.0,536.0,-25.0,4.0,-3.0,0.0,3.84,-190.0,1562.0,847.0,-5841.0,174902900.0
2,2021-08-01,158182400.0,455.0,-26.0,4.0,-1.0,0.0,4.07,-171.0,1573.0,465.0,-6237.0,157542800.0
3,2021-09-01,163679400.0,590.0,-17.0,5.0,-6.0,0.0,5.16,-237.0,1550.0,757.0,2970.0,164363500.0
4,2021-10-01,158214700.0,523.0,-22.0,4.0,-7.0,0.0,5.51,-172.0,1552.0,551.0,-4653.0,160403200.0
5,2021-11-01,165418400.0,533.0,-24.0,4.0,-10.0,0.0,5.05,-234.0,1703.0,323.0,-3267.0,161019900.0
6,2021-12-01,190293600.0,698.0,-10.0,4.0,-3.0,0.0,3.76,-216.0,1754.0,508.0,-6534.0,186818300.0
7,2022-01-01,258924100.0,1711.0,-64.0,5.0,-4.0,0.0,4.38,-399.0,1679.0,2330.0,-10954.0,217877600.0


## UTS Model

In [49]:
# Load the saved univariate time-series (UTS) model for the selected target from the
# same run folder as the ML model.
uts_model_name = '/{}_uts_model'.format(target_col)
model_uts = load_model(model_path + uts_model_name)

Transformation Pipeline and Model Successfully Loaded


In [50]:
# Display the loaded UTS model to inspect its configuration (an ARIMA estimator in the
# output captured below).
model_uts

ARIMA(order=(0, 1, 1), scoring_args={}, seasonal_order=(0, 1, 2, 12),

In [51]:
# Forecast `forecast_window` periods ahead with the univariate model.
# NOTE(review): presumably the forecast starts right after the model's training data,
# which is expected to line up with pred_start_dt - confirm against the training run.
predictions_utf = model_uts.predict(
    n_periods=forecast_window,
)

In [55]:
# Attach the univariate forecast as a new column, matched to the rows by position.
# np.asarray strips any index the forecast may carry: assigning a pandas Series to a
# column aligns on index labels, so a forecast indexed differently from `predictions`
# would silently become NaN. With a plain array the values are kept, and a length
# mismatch still raises ValueError.
predictions['UTS'] = np.asarray(predictions_utf)

In [56]:
# Month-start dates covering the prediction window, used as the x axis of the chart.
plot_dates = pd.date_range(start=str(pred_start_dt), end=str(pred_end_dt), freq='MS')
predictions['Date'] = plot_dates

# Give the ML prediction column ('Label') a descriptive name for the legend.
ml_label = target_col + ' - ML Predicted'
predictions.rename(columns={'Label': ml_label}, inplace=True)

# Line chart of the actual target, the ML prediction and the UTS forecast over time.
series_to_plot = [target_col, ml_label, 'UTS']
fig = px.line(predictions, x='Date', y=series_to_plot, template='plotly_white')
fig.show()