In [1]:
###########################Code
import pandas as pd
import numpy as np
import plotly.express as px
import warnings
from pycaret.regression import *
#import import_ipynb
#import data_loader as dl

warnings.filterwarnings("ignore")

In [2]:

def run_auto_ml(train_df, test_df, dependent_col, feature_cols, normal_transform):
    # Model Definitions
    s = setup(data = train_df,
              test_data = test_df,
              target = dependent_col,
              fold_strategy = 'timeseries',
              numeric_features = feature_cols,
              fold = 3,
              transform_target = normal_transform,
              feature_selection = True,
              feature_selection_threshold = 0.8,
              remove_multicollinearity = True,
              multicollinearity_threshold = 0.9,
              session_id = 123)

    # Creates model comparison matrix
    best = compare_models(sort = 'MAE')
    results = pull()
    model_results = results.Model.tolist()
    print(type(best))
    return best

def get_important_features(get_config, best):
    # Get list of features and their values
    features = pd.DataFrame({'Feature': get_config('X_train').columns,
                             'Variable Importance' : abs(best.coef_)}).sort_values(by='Variable Importance', ascending=False)
    features['Variable Importance'] = features['Variable Importance'].astype(float)

    # Here we want to get features with an absolute value variable importance score of 1 or greater
    features = features[features['Variable Importance'] >= 1]
    features = features.reset_index(drop=True)
    return features

## Data Loader

In [3]:
cols = ['File'  ,'Product0'  ,'Account0'  ,'AccountDescription'  ,'Item'  ,'FY 2015 Total'  ,'Actual - 20140601'  ,
        'Actual - 20140701'  ,'Actual - 20140801'  ,'Actual - 20140901'  ,'Actual - 20141001'  ,'Actual - 20141101'  ,
        'Actual - 20141201'  ,'Actual - 20150101'  ,'Actual - 20150201'  ,'Actual - 20150301'  ,'Actual - 20150401'  ,
        'Actual - 20150501'  ,'FY 2016 Total'  ,'Actual - 20150601'  ,'Actual - 20150701'  ,'Actual - 20150801'  ,
        'Actual - 20150901'  ,'Actual - 20151001'  ,'Actual - 20151101'  ,'Actual - 20151201'  ,'Actual - 20160101'  ,
        'Actual - 20160201'  ,'Actual - 20160301'  ,'Actual - 20160401'  ,'Actual - 20160501'  ,'FY 2017 Total'  ,
        'Actual - 20160601'  ,'Actual - 20160701'  ,'Actual - 20160801'  ,'Actual - 20160901'  ,'Actual - 20161001'  ,
        'Actual - 20161101'  ,'Actual - 20161201'  ,'Actual - 20170101'  ,'Actual - 20170201'  ,'Actual - 20170301'  ,
        'Actual - 20170401'  ,'Actual - 20170501'  ,'FY 2018 Total'  ,'Actual - 20170601'  ,'Actual - 20170701'  ,
        'Actual - 20170801'  ,'Actual - 20170901'  ,'Actual - 20171001'  ,'Actual - 20171101'  ,'Actual - 20171201'  ,
        'Actual - 20180101'  ,'Actual - 20180201'  ,'Actual - 20180301'  ,'Actual - 20180401'  ,'Actual - 20180501'  ,
        'FY 2019 Total'  ,'Actual - 20180601'  ,'Actual - 20180701'  ,'Actual - 20180801'  ,'Actual - 20180901'  ,
        'Actual - 20181001'  ,'Actual - 20181101'  ,'Actual - 20181201'  ,'Actual - 20190101'  ,'Actual - 20190201'  ,
        'Actual - 20190301'  ,'Actual - 20190401'  ,'Actual - 20190501'  ,'FY 2020 Total'  ,'Actual - 20190601'  ,'Actual - 20190701'  ,'Actual - 20190801'  ,'Actual - 20190901'  ,'Actual - 20191001'  ,'Actual - 20191101'  ,
        'Actual - 20191201'  ,'Actual - 20200101'  ,'Actual - 20200201'  ,'Actual - 20200301'  ,'Actual - 20200401'  ,
        'Actual - 20200501'  ,'FY 2021 Total'  ,'Actual - 20200601'  ,'Actual - 20200701'  ,'Actual - 20200801'  ,
        'Actual - 20200901'  ,'Actual - 20201001'  ,'Actual - 20201101'  ,'Actual - 20201201'  ,'Actual - 20210101'  ,
        'Actual - 20210201'  ,'Actual - 20210301'  ,'Actual - 20210401'  ,'Actual - 20210501'  ,'Actual - 20210601'  ,
        'Actual - 20210701'  ,'Actual - 20210801'  ,'Actual - 20210901'  ,'Actual - 20211001'  ,'Actual - 20211101'  ,
        'Actual - 20211201'  ,'Actual - 20220101'  ,'Forecast - 20220201'  ,'Forecast - 20220301'  ,
        'Forecast - 20220401'  ,'Forecast - 20220501']

In [4]:
data_file = 'Paychex_data.xlsx'
df = pd.read_excel(data_file, names=cols, skiprows=5)


In [5]:
# drop the totals columns
drops = ['FY 2015 Total' ,'FY 2016 Total' ,'FY 2017 Total' ,'FY 2018 Total' ,'FY 2019 Total' ,'FY 2020 Total' ,'FY 2021 Total']
df.drop(columns=drops, inplace=True)

In [6]:
# drop unnecessary rows
df= df.dropna(subset=['Item'])

In [7]:
level_0_list = ['401K Asset fee & BP Revenue'
    ,'401K Fee Revenue'
    ,'ASO Allocation'
    ,'ASO Revenue - Oasis'
    ,'Benetrac'
    ,'Cafeteria Plans Revenue'
    ,'Delivery Revenue'
    ,'Emerging Products'
    ,'ESR Revenue'
    ,'Full Service Unemployment Revenue'
    ,'Health Benefits'
    ,'HR Online'
    ,'HR Solutions (PEO)'
    ,'Interest on Funds Held for Clients'
    ,'Other Processing Revenue'
    ,'Payroll blended products'
    ,'SurePayroll.'
    ,'Time & Attendance'
    ,'Total international'
    ,'Total Paychex Advance'
    ,'W-2 Revenue'
    ,'Workers Comp - Payment Se']


In [8]:
# keep level 0 rows only
df0 = df[df['Item'].isin(level_0_list)]

In [9]:
flps = ['Actual - 20140601'   ,'Actual - 20140701'   ,'Actual - 20140801'   ,'Actual - 20140901'   ,'Actual - 20141001'
    ,'Actual - 20141101'   ,'Actual - 20141201'   ,'Actual - 20150101'   ,'Actual - 20150201'   ,'Actual - 20150301'
    ,'Actual - 20150401'   ,'Actual - 20150501'   ,'Actual - 20150601'   ,'Actual - 20150701'   ,'Actual - 20150801'
    ,'Actual - 20150901'   ,'Actual - 20151001'   ,'Actual - 20151101'   ,'Actual - 20151201'   ,'Actual - 20160101'
    ,'Actual - 20160201'   ,'Actual - 20160301'   ,'Actual - 20160401'   ,'Actual - 20160501'   ,'Actual - 20160601'
    ,'Actual - 20160701'   ,'Actual - 20160801'   ,'Actual - 20160901'   ,'Actual - 20161001'   ,'Actual - 20161101'
    ,'Actual - 20161201'   ,'Actual - 20170101'   ,'Actual - 20170201'   ,'Actual - 20170301'   ,'Actual - 20170401'
    ,'Actual - 20170501'   ,'Actual - 20170601'   ,'Actual - 20170701'   ,'Actual - 20170801'   ,'Actual - 20170901'
    ,'Actual - 20171001'   ,'Actual - 20171101'   ,'Actual - 20171201'   ,'Actual - 20180101'   ,'Actual - 20180201'
    ,'Actual - 20180301'   ,'Actual - 20180401'   ,'Actual - 20180501'   ,'Actual - 20180601'   ,'Actual - 20180701'
    ,'Actual - 20180801'   ,'Actual - 20180901'   ,'Actual - 20181001'   ,'Actual - 20181101'   ,'Actual - 20181201'
    ,'Actual - 20190101'   ,'Actual - 20190201'   ,'Actual - 20190301'   ,'Actual - 20190401'   ,'Actual - 20190501'
    ,'Actual - 20190601'   ,'Actual - 20190701'   ,'Actual - 20190801'   ,'Actual - 20190901'   ,'Actual - 20191001'
    ,'Actual - 20191101'   ,'Actual - 20191201'   ,'Actual - 20200101'   ,'Actual - 20200201'   ,'Actual - 20200301'
    ,'Actual - 20200401'   ,'Actual - 20200501'   ,'Actual - 20200601'   ,'Actual - 20200701'   ,'Actual - 20200801'
    ,'Actual - 20200901'   ,'Actual - 20201001'   ,'Actual - 20201101'   ,'Actual - 20201201'   ,'Actual - 20210101'
    ,'Actual - 20210201'   ,'Actual - 20210301'   ,'Actual - 20210401'   ,'Actual - 20210501'   ,'Actual - 20210601'
    ,'Actual - 20210701'   ,'Actual - 20210801'   ,'Actual - 20210901'   ,'Actual - 20211001'   ,'Actual - 20211101'
    ,'Actual - 20211201'   ,'Actual - 20220101'   ,'Forecast - 20220201'   ,'Forecast - 20220301'
    ,'Forecast - 20220401'   ,'Forecast - 20220501']
df0 = pd.melt(df0, id_vars=['File', 'Product0', 'Account0', 'AccountDescription', 'Item'],
              value_vars=flps, var_name='Scenario_Date', value_name='Amount')

In [10]:
df0[['Scenario','CalendarDate']] = df0.Scenario_Date.str.split(" - ",expand=True)

In [11]:
df0.drop(columns=['Scenario_Date'], inplace=True)

In [12]:
df0

Unnamed: 0,File,Product0,Account0,AccountDescription,Item,Amount,Scenario,CalendarDate
0,401kRevenueDetail.txt,401(K) PLANS,5600 401K ADMINISTRATION,FEE REVENUE - NA - RW,401K Fee Revenue,927785.0,Actual,20140601
1,401kRevenueDetail.txt,401K NSI EXPORT,5600 401K ADMINISTRATION,FEE REVENUE - NA - RW,401K Fee Revenue,-460.0,Actual,20140601
2,401kRevenueDetail.txt,RETIREMENT PLANS,5600 401K ADMINISTRATION,FEE REVENUE - NA - RW,401K Fee Revenue,,Actual,20140601
3,401kRevenueDetail.txt,401(K) PLANS,5745 EPLAN,FEE REVENUE - NA - RW,401K Fee Revenue,15943.0,Actual,20140601
4,401kRevenueDetail.txt,401(K) PLANS,5600 401K ADMINISTRATION,SETUP REVENUE - RW,401K Fee Revenue,919141.0,Actual,20140601
...,...,...,...,...,...,...,...,...
162043,SurePayollRevenue.txt,PREMIER HRS,1501 SP Direct GB,SALES DISCOUNTS - CONTRACT PLAN - RW,SurePayroll.,,Forecast,20220501
162044,SurePayollRevenue.txt,PREMIER HRS,1506 SP BOP PARTNER,FEE REVENUE - CONTRACT PLAN - RW,SurePayroll.,,Forecast,20220501
162045,SurePayollRevenue.txt,PREMIER HRS,1506 SP BOP PARTNER,MAINTENANCE REVENUE - NA - RW,SurePayroll.,,Forecast,20220501
162046,SurePayollRevenue.txt,PREMIER HRS,1506 SP BOP PARTNER,SALES DISCOUNTS - NA - RW,SurePayroll.,,Forecast,20220501


In [13]:
df0['Item'] = df0['Item'].replace(" ","", regex=True)

In [14]:
df0_acct = df0[['Item','Account0','Scenario','CalendarDate','Amount']]
df0_prod = df0[['Item','Product0','Scenario','CalendarDate','Amount']]

df0_group = df0.groupby(['Item','Scenario','CalendarDate'])['Amount'].sum().reset_index()
#print(df0_group)
item_list = df0_group['Item'].unique().tolist()

df_piv = df0_group.pivot(index=['CalendarDate', 'Scenario'], columns='Item', values='Amount')
df_piv = df_piv.reset_index()

In [15]:
df_piv

Item,CalendarDate,Scenario,401KAssetfee&BPRevenue,401KFeeRevenue,ASOAllocation,ASORevenue-Oasis,Benetrac,CafeteriaPlansRevenue,DeliveryRevenue,ESRRevenue,EmergingProducts,FullServiceUnemploymentRevenue,HROnline,HRSolutions(PEO),HealthBenefits,InterestonFundsHeldforClients,OtherProcessingRevenue,Payrollblendedproducts,SurePayroll.,Time&Attendance,TotalPaychexAdvance,Totalinternational,W-2Revenue,WorkersComp-PaymentSe
0,20140601,Actual,4261452.0,10621963.0,7255027.55,0.0,1922941.0,1714950.0,7393351.97,47602.0,796063.0,1869396.0,3930007.0,21143605.0,5225672.0,3308154.0,4765775.19,99633512.0,3246967.02,4651817.0,0.0,0.0,7031234.09,4858397.0
1,20140701,Actual,4513388.0,10558382.0,9070820.8,0.0,1916579.0,1698495.0,13513741.18,67521.0,986955.0,2341533.0,4535022.0,25838048.0,5108935.0,3466375.0,6502729.54,121619696.0,3362473.02,5165885.0,0.0,0.0,7325377.82,5244318.0
2,20140801,Actual,4486436.0,10297495.0,7489159.5,0.0,1944830.0,1829118.0,7286966.62,79771.0,924689.0,1986835.0,3869725.0,21762760.0,5380159.0,3434419.0,4215268.17,99472483.0,3286111.93,4856677.0,0.0,0.0,7443620.46,5588578.0
3,20140901,Actual,4355891.0,12719409.0,7011292.24,0.0,1933042.0,1710602.0,7081861.96,105330.0,732444.0,1888067.0,3847567.0,20288451.0,6281392.0,3419918.0,4726806.24,94198230.0,3345328.53,5098280.0,0.0,0.0,7414842.11,4968759.0
4,20141001,Actual,4455231.0,13797724.0,9372927.59,0.0,1956820.0,1702224.0,13741077.44,143956.0,1051920.0,2491011.0,4967922.0,26755143.0,6107475.0,3472038.0,6514037.07,124353873.0,3402641.78,5188982.0,0.0,0.0,7818212.35,5752070.0
5,20141101,Actual,4466548.0,11049609.0,7492286.01,0.0,1987727.0,1784045.0,7259539.93,178933.0,965427.0,2048831.0,4044570.0,21881382.0,5589002.0,3469203.0,4528513.14,98868618.0,3221738.58,5209888.0,0.0,0.0,7542659.46,4890557.0
6,20141201,Actual,4309643.0,15706255.0,7355083.53,0.0,2022314.0,1741201.0,7559482.15,235468.0,852949.0,2006345.0,4284873.0,21273963.0,5315219.0,3550962.0,5170797.88,99281713.0,3627472.6,5266992.0,0.0,0.0,8117392.17,5773054.0
7,20150101,Actual,4855051.0,12860281.0,9431085.71,0.0,1986486.0,1714890.0,19855110.85,607646.0,908730.0,2606423.0,5095101.0,27128652.0,5339084.0,3683535.0,6834385.36,124388966.0,3589702.61,5662406.0,0.0,0.0,9377834.84,5384244.0
8,20150201,Actual,4218929.0,12618365.0,7525036.15,0.0,1898082.0,1821176.0,7116009.19,454163.0,946365.0,2103998.0,3978493.0,21605352.0,5420555.0,3509475.0,4722991.56,95862151.0,3491857.24,5273882.0,0.0,0.0,7226134.53,4903952.0
9,20150301,Actual,4608872.0,11388694.0,7170010.02,0.0,1921510.0,1715383.0,6970218.27,905763.0,898680.0,2025107.0,4012338.0,20945564.0,6383528.0,3645265.0,4772800.16,92298659.0,3460680.8,5265180.0,0.0,0.0,6909620.47,5445220.0


In [16]:
#dependent_col = '401K Asset fee & BP Revenue'
#dependent_col = '401K Fee Revenue'
#dependent_col = 'ASO Allocation'
#dependent_col = 'ASO Revenue - Oasis'
#dependent_col = 'Benetrac'
#dependent_col = 'Cafeteria Plans Revenue'
#dependent_col = 'Delivery Revenue'
#dependent_col = 'Emerging Products'
#dependent_col = 'ESR Revenue'
#dependent_col = 'Full Service Unemployment Revenue'
#dependent_col = 'Health Benefits'
#dependent_col = 'HR Online'
#dependent_col = 'HR Solutions (PEO)'
#dependent_col = 'Interest on Funds Held for Clients'
#dependent_col = 'Other Processing Revenue'
#dependent_col = 'Payroll blended products'
#dependent_col = 'SurePayroll.'
#dependent_col = 'Time & Attendance'
#dependent_col = 'Total international'
#dependent_col = 'Total Paychex Advance'
#dependent_col = 'W-2 Revenue'
#dependent_col = 'Workers Comp - Payment Se'
dependent_col = 'TotalRevenue'
feature_cols = [c.replace(" ","") for c in level_0_list]

if dependent_col in feature_cols:
    feature_cols.remove(dependent_col)

train_start_dt = '20140601'
train_end_dt = '20200501'
test_start_dt = '20200601'
test_end_dt = '20210501'
pred_start_dt = '20210601'
pred_end_dt = '20220101'
forecast_window = 8
has_actuals = True

In [17]:
#print(df_piv)

df_ts = df_piv[df_piv['Scenario'] == 'Actual']
df_ts['Series'] = np.arange(1,len(df_ts)+1)
df_ts['Year'] = df_ts['CalendarDate'].astype(str).str[:4]
df_ts['Month'] = df_ts['CalendarDate'].astype(str).str[-4:].str[:2]
df_ts['TotalRevenue'] = df_ts[item_list].sum(axis=1)
df_ts.rename_axis(None, axis=1, inplace=True)
train_df = df_ts[(df_ts['CalendarDate'].astype(int) >= int(train_start_dt)) &
              (df_ts['CalendarDate'].astype(int) <= int(train_end_dt))]
test_df = df_ts[(df_ts['CalendarDate'].astype(int) >= int(test_start_dt)) &
                (df_ts['CalendarDate'].astype(int) <= int(test_end_dt))]
comb_df = df_ts[(df_ts['CalendarDate'].astype(int) >= int(train_start_dt)) &
                (df_ts['CalendarDate'].astype(int) <= int(test_end_dt))]

In [18]:

# create training dataframe
train_df['CalendarDate'] = pd.to_datetime(train_df['CalendarDate'])
#train_df.set_index('Calendar Date', inplace=True)

# create test dataframe

test_df['CalendarDate'] = pd.to_datetime(test_df['CalendarDate'])
#test_df.set_index('Calendar Date', inplace=True)

# create combo dataframe
comb_df['CalendarDate'] = pd.to_datetime(comb_df['CalendarDate'])
#comb_df.set_index('Calendar Date', inplace=True)


In [19]:
train_df

Unnamed: 0,CalendarDate,Scenario,401KAssetfee&BPRevenue,401KFeeRevenue,ASOAllocation,ASORevenue-Oasis,Benetrac,CafeteriaPlansRevenue,DeliveryRevenue,ESRRevenue,EmergingProducts,FullServiceUnemploymentRevenue,HROnline,HRSolutions(PEO),HealthBenefits,InterestonFundsHeldforClients,OtherProcessingRevenue,Payrollblendedproducts,SurePayroll.,Time&Attendance,TotalPaychexAdvance,Totalinternational,W-2Revenue,WorkersComp-PaymentSe,Series,Year,Month,TotalRevenue
0,2014-06-01,Actual,4261452.0,10621963.0,7255027.55,0.0,1922941.0,1714950.0,7393351.97,47602.0,796063.0,1869396.0,3930007.0,21143605.0,5225672.0,3308154.0,4765775.19,99633512.0,3246967.02,4651817.0,0.0,0.0,7031234.09,4858397.0,1,2014,6,193677900.0
1,2014-07-01,Actual,4513388.0,10558382.0,9070820.8,0.0,1916579.0,1698495.0,13513741.18,67521.0,986955.0,2341533.0,4535022.0,25838048.0,5108935.0,3466375.0,6502729.54,121619696.0,3362473.02,5165885.0,0.0,0.0,7325377.82,5244318.0,2,2014,7,232836300.0
2,2014-08-01,Actual,4486436.0,10297495.0,7489159.5,0.0,1944830.0,1829118.0,7286966.62,79771.0,924689.0,1986835.0,3869725.0,21762760.0,5380159.0,3434419.0,4215268.17,99472483.0,3286111.93,4856677.0,0.0,0.0,7443620.46,5588578.0,3,2014,8,195635100.0
3,2014-09-01,Actual,4355891.0,12719409.0,7011292.24,0.0,1933042.0,1710602.0,7081861.96,105330.0,732444.0,1888067.0,3847567.0,20288451.0,6281392.0,3419918.0,4726806.24,94198230.0,3345328.53,5098280.0,0.0,0.0,7414842.11,4968759.0,4,2014,9,191127500.0
4,2014-10-01,Actual,4455231.0,13797724.0,9372927.59,0.0,1956820.0,1702224.0,13741077.44,143956.0,1051920.0,2491011.0,4967922.0,26755143.0,6107475.0,3472038.0,6514037.07,124353873.0,3402641.78,5188982.0,0.0,0.0,7818212.35,5752070.0,5,2014,10,243045300.0
5,2014-11-01,Actual,4466548.0,11049609.0,7492286.01,0.0,1987727.0,1784045.0,7259539.93,178933.0,965427.0,2048831.0,4044570.0,21881382.0,5589002.0,3469203.0,4528513.14,98868618.0,3221738.58,5209888.0,0.0,0.0,7542659.46,4890557.0,6,2014,11,196479100.0
6,2014-12-01,Actual,4309643.0,15706255.0,7355083.53,0.0,2022314.0,1741201.0,7559482.15,235468.0,852949.0,2006345.0,4284873.0,21273963.0,5315219.0,3550962.0,5170797.88,99281713.0,3627472.6,5266992.0,0.0,0.0,8117392.17,5773054.0,7,2014,12,203451200.0
7,2015-01-01,Actual,4855051.0,12860281.0,9431085.71,0.0,1986486.0,1714890.0,19855110.85,607646.0,908730.0,2606423.0,5095101.0,27128652.0,5339084.0,3683535.0,6834385.36,124388966.0,3589702.61,5662406.0,0.0,0.0,9377834.84,5384244.0,8,2015,1,251309600.0
8,2015-02-01,Actual,4218929.0,12618365.0,7525036.15,0.0,1898082.0,1821176.0,7116009.19,454163.0,946365.0,2103998.0,3978493.0,21605352.0,5420555.0,3509475.0,4722991.56,95862151.0,3491857.24,5273882.0,0.0,0.0,7226134.53,4903952.0,9,2015,2,194697000.0
9,2015-03-01,Actual,4608872.0,11388694.0,7170010.02,0.0,1921510.0,1715383.0,6970218.27,905763.0,898680.0,2025107.0,4012338.0,20945564.0,6383528.0,3645265.0,4772800.16,92298659.0,3460680.8,5265180.0,0.0,0.0,6909620.47,5445220.0,10,2015,3,190743100.0


## Model training

In [20]:
train_df.shape

(72, 28)

In [21]:

# run auto ml and get the most important features
best = run_auto_ml(train_df, test_df, dependent_col, feature_cols, False)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,2036732.6078,8174510846920.085,2381153.1836,0.9611,0.0099,0.0084,0.0067
ridge,Ridge Regression,2339514.1042,11716921633450.666,2626803.5,0.944,0.011,0.0097,1.1133
en,Elastic Net,2359564.4375,11451640053760.0,2733153.1667,0.9464,0.0112,0.0097,0.01
lr,Linear Regression,2585985.1458,12361292098218.666,2885595.6667,0.9439,0.012,0.0107,1.99
llar,Lasso Least Angle Regression,5176175.8376,33934603727869.35,5554884.6365,0.8945,0.0226,0.0207,0.01
huber,Huber Regressor,5666600.8599,53026366472885.66,7263395.2897,0.6972,0.0278,0.0222,0.0333
dt,Decision Tree Regressor,11089106.3704,477572822412913.75,17476511.7629,0.2466,0.0626,0.0399,0.0033
par,Passive Aggressive Regressor,11945788.1574,408555262743995.25,18127704.2142,0.1237,0.0674,0.0449,0.0067
ada,AdaBoost Regressor,12188693.6106,509681318450306.8,18670348.6951,0.2398,0.0682,0.0441,0.0233
et,Extra Trees Regressor,12355192.4415,510245903219505.75,18696631.3178,0.2266,0.0677,0.0446,0.0533


<class 'sklearn.linear_model._bayes.BayesianRidge'>


In [22]:
features = get_important_features(get_config, best)

In [24]:
# Feature Importance Plot - This is the out of box pycaret plot
#plot_model(best, plot = 'feature')

# plot the most important features
fig = px.bar(features.sort_values('Variable Importance', ascending=True),
             x='Variable Importance',
             y='Feature',
             orientation='h',
             title='Feature Importance Plot')
fig.show()

In [25]:
best

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)

In [26]:
type(best)

sklearn.linear_model._bayes.BayesianRidge

In [27]:
best.n_features_in_

47

In [28]:
comb_df.shape

(84, 28)

In [29]:
# ------------------------------------------------------------------------------------- #
# generate and plot predicted values on the original dataset
# ------------------------------------------------------------------------------------- #

predictions = predict_model(best, data=comb_df)
predictions['Date'] = pd.date_range(start=str(train_start_dt), end = str(test_end_dt), freq = 'MS')
predictions

Unnamed: 0,CalendarDate,Scenario,401KAssetfee&BPRevenue,401KFeeRevenue,ASOAllocation,ASORevenue-Oasis,Benetrac,CafeteriaPlansRevenue,DeliveryRevenue,ESRRevenue,...,TotalPaychexAdvance,Totalinternational,W-2Revenue,WorkersComp-PaymentSe,Series,Year,Month,TotalRevenue,Label,Date
0,2014-06-01,Actual,4261452.0,10621963.0,7255027.55,0.00,1922941.0,1714950.0,7393351.97,47602.0,...,0.0,0.0,7031234.09,4858397.0,1,2014,06,1.936779e+08,1.936202e+08,2014-06-01
1,2014-07-01,Actual,4513388.0,10558382.0,9070820.80,0.00,1916579.0,1698495.0,13513741.18,67521.0,...,0.0,0.0,7325377.82,5244318.0,2,2014,07,2.328363e+08,2.324266e+08,2014-07-01
2,2014-08-01,Actual,4486436.0,10297495.0,7489159.50,0.00,1944830.0,1829118.0,7286966.62,79771.0,...,0.0,0.0,7443620.46,5588578.0,3,2014,08,1.956351e+08,1.959262e+08,2014-08-01
3,2014-09-01,Actual,4355891.0,12719409.0,7011292.24,0.00,1933042.0,1710602.0,7081861.96,105330.0,...,0.0,0.0,7414842.11,4968759.0,4,2014,09,1.911275e+08,1.912629e+08,2014-09-01
4,2014-10-01,Actual,4455231.0,13797724.0,9372927.59,0.00,1956820.0,1702224.0,13741077.44,143956.0,...,0.0,0.0,7818212.35,5752070.0,5,2014,10,2.430453e+08,2.435622e+08,2014-10-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,2021-01-01,Actual,8601656.0,14541668.0,13265382.71,273995.85,1390060.0,1771611.0,22548298.89,3955284.0,...,5542882.0,0.0,80988881.77,5399816.0,80,2021,01,3.436167e+08,3.433331e+08,2021-01-01
80,2021-02-01,Actual,8798633.0,14785257.0,13666646.86,291755.39,1354163.0,1685096.0,6430193.76,11179898.0,...,5572132.0,0.0,28599.77,5392816.0,81,2021,02,2.543673e+08,2.542883e+08,2021-02-01
81,2021-03-01,Actual,9014864.0,15017238.0,16053700.85,253715.04,1404115.0,1667442.0,7657980.44,5180501.0,...,6534835.0,0.0,-43477.46,7384007.0,82,2021,03,2.802692e+08,2.818997e+08,2021-03-01
82,2021-04-01,Actual,8986182.0,14843783.0,14728764.11,260657.63,1349889.0,1683761.0,11701117.00,4329239.0,...,6052148.0,0.0,44625.45,6571571.0,83,2021,04,2.722801e+08,2.731794e+08,2021-04-01


In [31]:
fig = px.line(predictions, x='Date', y=[dependent_col, "Label"], template = 'plotly_white')
fig.show()

In [30]:
# ------------------------------------------------------------------------------------- #
# This section now applies the trained/tested model to make future predictions
# ------------------------------------------------------------------------------------- #

pred_df = df_ts[(df_ts['Calendar Date'].astype(int) >= int(pred_start_dt)) &
                (df_ts['Calendar Date'].astype(int) <= int(pred_end_dt))]

## Save as PMML

In [43]:
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml, PMMLPipeline

### Best Model

In [29]:
pmml_pipe = make_pmml_pipeline(best)

In [30]:
sklearn2pmml(pmml_pipe, "model.pmml")

### With pipeline

In [32]:
pipeline, name = save_model(best, 'pycaret_pipeline')

Transformation Pipeline and Model Successfully Saved


In [33]:
pipeline[:-1]

Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=['401KAssetfee&BPRevenue',
                                                          '401KFeeRevenue',
                                                          'ASOAllocation',
                                                          'ASORevenue-Oasis',
                                                          'Benetrac',
                                                          'CafeteriaPlansRevenue',
                                                          'DeliveryRevenue',
                                                          'EmergingProducts',
                                                          'ESRRevenue',
                                       

In [34]:
name

'pycaret_pipeline.pkl'

In [35]:
type(pipeline)

sklearn.pipeline.Pipeline

In [36]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn_pandas import DataFrameMapper
from sklearn.feature_selection.base import SelectorMixin

In [37]:
class SelectorProxy(BaseEstimator):

    def __init__(self, selector):
        self.selector = selector
        try:
            self._copy_attrs()
        except :
            pass

    def _copy_attrs(self):
        try:
            setattr(self, "support_mask_", self.selector._get_support_mask())
        except ValueError:
            pass

    def fit(self, X, y = None, **fit_params):
        self.selector.fit(X, y, **fit_params)
        self._copy_attrs()
        return self

    def transform(self, X):
        return self.selector.transform(X)

In [38]:
def _filter(obj):
    if isinstance(obj, DataFrameMapper):
        obj.features = _filter_steps(obj.features)
        if hasattr(obj, "built_features"):
            if obj.built_features is not None:
                obj.built_features = _filter_steps(obj.built_features)
    elif isinstance(obj, ColumnTransformer):
        obj.transformers = _filter_steps(obj.transformers)
        obj.remainder = _filter(obj.remainder)
        if hasattr(obj, "transformers_"):
            obj.transformers_ = _filter_steps(obj.transformers_)
    elif isinstance(obj, FeatureUnion):
        obj.transformer_list = _filter_steps(obj.transformer_list)
    elif isinstance(obj, Pipeline):
        obj.steps = _filter_steps(obj.steps)
    elif isinstance(obj, SelectorMixin):
        return SelectorProxy(obj)
    elif isinstance(obj, list):
        return [_filter(e) for e in obj]
    return obj

In [39]:
def _filter_steps(steps):
    res = []
    for step in steps:
        print("Step :", _filter(step[1]))
        i = (tuple(step[:1]) + (_filter(step[1]), ) + tuple(step[2:]))
        res.append(i)
    return res

In [40]:
def _get_steps(obj):
    if isinstance(obj, Pipeline):
        return obj.steps
    elif isinstance(obj, BaseEstimator):
        return [("estimator", obj)]
    else:
        raise ValueError()

In [41]:
def make_pmml_pipeline_mod(obj, active_fields = None, target_fields = None):
    """Translates a regular Scikit-Learn estimator or pipeline to a PMML pipeline.

    Parameters:
    ----------
    obj: BaseEstimator
        The object.

    active_fields: list of strings, optional
        Feature names. If missing, "x1", "x2", .., "xn" are assumed.

    target_fields: list of strings, optional
        Label name(s). If missing, "y" is assumed.

    """
    steps = _filter_steps(_get_steps(obj))
    pipeline = PMMLPipeline(steps)
    if active_fields is not None:
        pipeline.active_fields = np.asarray(active_fields)
    if target_fields is not None:
        pipeline.target_fields = np.asarray(target_fields)
    return pipeline

In [45]:
pmml_pipe = make_pmml_pipeline_mod(pipeline, active_fields=feature_cols, target_fields=dependent_col)

Step : DataTypes_Auto_infer(categorical_features=[], display_types=True,
                     features_todrop=[], id_columns=[], ml_usecase='regression',
                     numerical_features=['401KAssetfee&BPRevenue',
                                         '401KFeeRevenue', 'ASOAllocation',
                                         'ASORevenue-Oasis', 'Benetrac',
                                         'CafeteriaPlansRevenue',
                                         'DeliveryRevenue', 'EmergingProducts',
                                         'ESRRevenue',
                                         'FullServiceUnemploymentRevenue',
                                         'HealthBenefits', 'HROnline',
                                         'HRSolutions(PEO)',
                                         'InterestonFundsHeldforClients',
                                         'OtherProcessingRevenue',
                                         'Payrollblendedproducts',
              

In [46]:
type(pmml_pipe)

sklearn2pmml.pipeline.PMMLPipeline

In [47]:
pmml_pipe

PMMLPipeline(steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=True,
                     features_todrop=[], id_columns=[], ml_usecase='regression',
                     numerical_features=['401KAssetfee&BPRevenue',
                                         '401KFeeRevenue', 'ASOAllocation',
                                         'ASORevenue-Oasis', 'Benetrac',
                                         'CafeteriaPlansRevenue',
                                         'DeliveryRevenue', 'EmergingProducts',
                                         'ESRRevenue',
                                         'FullServiceUnemploymentRevenue',
                                         'HealthBenefits', 'HROnline',
                                         'HRSolutions(PEO)',
                                         'InterestonFundsHeldforClients',
                                         'OtherProcessingRevenue',
                                         'Payrollblendedp

In [48]:
sklearn2pmml(pmml_pipe, "model_pipeline_1.pmml",  with_repr=True)

Standard output is empty
Standard error:
Exception in thread "main" net.razorvine.pickle.PickleException: failed to __setstate__()
	at net.razorvine.pickle.Unpickler.load_build(Unpickler.java:395)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:220)
	at org.jpmml.python.CustomUnpickler.dispatch(CustomUnpickler.java:31)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:109)
	at numpy.core.NDArrayUtil.readObject(NDArrayUtil.java:389)
	at numpy.core.TypeDescriptor.read(TypeDescriptor.java:165)
	at numpy.core.NDArrayUtil.parseArray(NDArrayUtil.java:225)
	at numpy.core.NDArrayUtil.parseData(NDArrayUtil.java:200)
	at joblib.NumpyArrayWrapper.toArray(NumpyArrayWrapper.java:43)
	at org.jpmml.python.PickleUtil$1.dispatch(PickleUtil.java:75)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:109)
	at org.jpmml.python.PickleUtil.unpickle(PickleUtil.java:85)
	at com.sklearn2pmml.Main.run(Main.java:71)
	at com.sklearn2pmml.Main.main(Main.java:62)
Caused by: java.lang.reflect.Inv

RuntimeError: The SkLearn2PMML application has failed. The Java executable should have printed more information about the failure into its standard output and/or standard error streams

In [202]:
import pickle
with open('pycaret_pipeline.pkl', 'rb') as f:
    mod = pickle.load(f)

In [203]:
mod

array(['Calendar Date', 'Scenario', '401K Asset fee & BP Revenue',
       '401K Fee Revenue', 'ASO Allocation', 'ASO Revenue - Oasis',
       'Benetrac', 'Cafeteria Plans Revenue', 'Delivery Revenue',
       'ESR Revenue', 'Emerging Products',
       'Full Service Unemployment Revenue', 'HR Online',
       'HR Solutions (PEO)', 'Health Benefits',
       'Interest on Funds Held for Clients', 'Other Processing Revenue',
       'Payroll blended products', 'SurePayroll.', 'Time & Attendance',
       'Total Paychex Advance', 'Total international', 'W-2 Revenue',
       'Workers Comp - Payment Se', 'Series', 'Year', 'Month'],
      dtype=object)

In [None]:
# pmml_pipeline = PMMLPipeline([
#     ("prep_pipe", prep_pipe),
#     ("final_model", final_model)
# ])

import pickle

with open("regression.pkl", "wb") as pf:
    pickle.dump(pmml_pipeline, pf)

### JPMML-SkLearn

In [207]:
import sklearn, joblib, sklearn_pandas, sklearn2pmml

print(sklearn.__version__)
print(joblib.__version__)
print(sklearn_pandas.__version__)
print(sklearn2pmml.__version__)

0.23.2
1.1.0
2.2.0
0.79.0


In [208]:
pipeline, name = save_model(best, 'pycaret_pipeline')

Transformation Pipeline and Model Successfully Saved


In [209]:
joblib.dump(pipeline, "pipeline.pkl.z", compress = 9)

['pipeline.pkl.z']

In [53]:


# Do the standard PyCaret stuff

from sklearn2pmml.pipeline import PMMLPipeline

pmml_pipeline = PMMLPipeline([
    ("prep_pipe", pipeline[:-1]),
    ("final_model", best)
])

from sklearn2pmml import sklearn2pmml

sklearn2pmml(pmml_pipeline, "pipeline_model.pmml")

Standard output is empty
Standard error:
Exception in thread "main" net.razorvine.pickle.PickleException: failed to __setstate__()
	at net.razorvine.pickle.Unpickler.load_build(Unpickler.java:395)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:220)
	at org.jpmml.python.CustomUnpickler.dispatch(CustomUnpickler.java:31)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:109)
	at numpy.core.NDArrayUtil.readObject(NDArrayUtil.java:389)
	at numpy.core.TypeDescriptor.read(TypeDescriptor.java:165)
	at numpy.core.NDArrayUtil.parseArray(NDArrayUtil.java:225)
	at numpy.core.NDArrayUtil.parseData(NDArrayUtil.java:200)
	at joblib.NumpyArrayWrapper.toArray(NumpyArrayWrapper.java:43)
	at org.jpmml.python.PickleUtil$1.dispatch(PickleUtil.java:75)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:109)
	at org.jpmml.python.PickleUtil.unpickle(PickleUtil.java:85)
	at com.sklearn2pmml.Main.run(Main.java:71)
	at com.sklearn2pmml.Main.main(Main.java:62)
Caused by: java.lang.reflect.Inv

RuntimeError: The SkLearn2PMML application has failed. The Java executable should have printed more information about the failure into its standard output and/or standard error streams

### Niyoka

In [89]:
 from nyoka import skl_to_pmml

In [32]:
pipeline, name = save_model(best, 'pycaret_pipeline')

Transformation Pipeline and Model Successfully Saved


In [50]:
dependent_col

'TotalRevenue'

In [51]:
skl_to_pmml(pipeline=pipeline,col_names=feature_cols,target_name=dependent_col,pmml_f_name="pipeline_model.pmml")

TypeError: This PreProcessing Task is not Supported

In [52]:
pipeline

Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=['401KAssetfee&BPRevenue',
                                                          '401KFeeRevenue',
                                                          'ASOAllocation',
                                                          'ASORevenue-Oasis',
                                                          'Benetrac',
                                                          'CafeteriaPlansRevenue',
                                                          'DeliveryRevenue',
                                                          'EmergingProducts',
                                                          'ESRRevenue',
                                       

In [46]:
_get_steps(pipeline)

[('dtypes',
  DataTypes_Auto_infer(categorical_features=[], display_types=True,
                       features_todrop=[], id_columns=[], ml_usecase='regression',
                       numerical_features=['401K Asset fee & BP Revenue',
                                           '401K Fee Revenue', 'ASO Allocation',
                                           'ASO Revenue - Oasis', 'Benetrac',
                                           'Cafeteria Plans Revenue',
                                           'Delivery Revenue',
                                           'Emerging Products', 'ESR Revenue',
                                           'Full Service Unemployment Revenue',
                                           'Health Benefits', 'HR Online',
                                           'HR Solutions (PEO)',
                                           'Interest on Funds Held for Clients',
                                           'Other Processing Revenue',
                    

In [49]:
type(pipeline)

sklearn.pipeline.Pipeline

In [50]:
pipeline.predict(test_df)

array([2.41584291e+08, 2.52666768e+08, 2.36259392e+08, 2.61855540e+08,
       2.60759849e+08, 2.51681907e+08, 2.92837477e+08, 3.43333100e+08,
       2.54288340e+08, 2.81899719e+08, 2.73179446e+08, 2.55473918e+08])

In [69]:
pipeline[1:]

Pipeline(memory=None,
         steps=[('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_strategy='mean',
                                target_variable=None)),
                ('new_levels1',
                 New_Catagorical_Levels_in_TestData(replacement_strategy='least '
                                                                         'frequent',
                                                    target='TotalRevenue')),
                ('ordinal', 'passthrough'), ('cardinality', 'pas...
                                       correlation_with_target_threshold=0.0,
                                       target_variable='TotalRevenue',
                                       threshold=0.9)),
                ('dfs', 'passthrough'), ('pca', 'passthrough'),
                ['trained_model',


## Deploy

In [82]:
import os
from io import BytesIO
from azure.storage.blob import BlobServiceClient

In [75]:
from src.paychex_ml.utils import load_credentials

In [76]:
credentials = load_credentials("blob_storage",
                               file="C:/Users/bruno.gonzalez/DataspellProjects/Paychex_revenue_forecast/credentials.yml")

In [71]:
os.environ['AZURE_STORAGE_CONNECTION_STRING'] = credentials['conn_string']

In [73]:
deploy_model(pipeline, model_name = 'model_pipeline_1', platform = 'azure', authentication = { 'container'  : 'models' })

Model Successfully Deployed on Azure Storage Blob


In [84]:
# Start client
container_name = "models"
blob_service_client = BlobServiceClient.from_connection_string(credentials['conn_string'])
container_client = blob_service_client.get_container_client(container_name)

In [85]:
# Download file
file = 'model_pipeline_1.pkl'
stream_downloader = container_client.download_blob(file)
stream = BytesIO()
stream_downloader.readinto(stream)

46914

In [87]:
from joblib import dump, load
pipeline_azure = load(stream)

In [88]:
pipeline_azure

Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=['401KAssetfee&BPRevenue',
                                                          '401KFeeRevenue',
                                                          'ASOAllocation',
                                                          'ASORevenue-Oasis',
                                                          'Benetrac',
                                                          'CafeteriaPlansRevenue',
                                                          'DeliveryRevenue',
                                                          'EmergingProducts',
                                                          'ESRRevenue',
                                       

In [92]:
pipeline_azure.predict(test_df)

array([2.41584291e+08, 2.52666768e+08, 2.36259392e+08, 2.61855540e+08,
       2.60759849e+08, 2.51681907e+08, 2.92837477e+08, 3.43333100e+08,
       2.54288340e+08, 2.81899719e+08, 2.73179446e+08, 2.55473918e+08])

In [93]:
pmml_pipe = make_pmml_pipeline_mod(pipeline_azure)

Step : DataTypes_Auto_infer(categorical_features=[], display_types=True,
                     features_todrop=[], id_columns=[], ml_usecase='regression',
                     numerical_features=['401KAssetfee&BPRevenue',
                                         '401KFeeRevenue', 'ASOAllocation',
                                         'ASORevenue-Oasis', 'Benetrac',
                                         'CafeteriaPlansRevenue',
                                         'DeliveryRevenue', 'EmergingProducts',
                                         'ESRRevenue',
                                         'FullServiceUnemploymentRevenue',
                                         'HealthBenefits', 'HROnline',
                                         'HRSolutions(PEO)',
                                         'InterestonFundsHeldforClients',
                                         'OtherProcessingRevenue',
                                         'Payrollblendedproducts',
              

In [94]:
sklearn2pmml(pmml_pipe, "pipeline_model.pmml")

Standard output is empty
Standard error:
Exception in thread "main" net.razorvine.pickle.PickleException: failed to __setstate__()
	at net.razorvine.pickle.Unpickler.load_build(Unpickler.java:395)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:220)
	at org.jpmml.python.CustomUnpickler.dispatch(CustomUnpickler.java:31)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:109)
	at numpy.core.NDArrayUtil.readObject(NDArrayUtil.java:389)
	at numpy.core.TypeDescriptor.read(TypeDescriptor.java:165)
	at numpy.core.NDArrayUtil.parseArray(NDArrayUtil.java:225)
	at numpy.core.NDArrayUtil.parseData(NDArrayUtil.java:200)
	at joblib.NumpyArrayWrapper.toArray(NumpyArrayWrapper.java:43)
	at org.jpmml.python.PickleUtil$1.dispatch(PickleUtil.java:75)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:109)
	at org.jpmml.python.PickleUtil.unpickle(PickleUtil.java:85)
	at com.sklearn2pmml.Main.run(Main.java:71)
	at com.sklearn2pmml.Main.main(Main.java:62)
Caused by: java.lang.reflect.Inv

RuntimeError: The SkLearn2PMML application has failed. The Java executable should have printed more information about the failure into its standard output and/or standard error streams