In [1]:
import datarobot as dr
import pandas as pd
import numpy as np
import datetime as dt

In [16]:
import datarobot_ts_helpers as dr_helpers

In [2]:
import os
import re 
from importlib import reload
from ts_metrics import *
from ts_pre_processing import *
from ts_calendar import *
from ts_projects import *
from ts_modeling import *
from ts_clustering import *
from ts_data_quality import *
from ts_predictions import *

In [5]:
pd.set_option('display.max_colwidth', -1)

  """Entry point for launching an IPython kernel.


In [6]:
pd.set_option('display.max_rows', None)

In [7]:
# Connect to DRClient
dr.Client(config_path = './drconfig.yaml')

<datarobot.rest.RESTClientObject at 0x1a1f87eb50>

# Load Data

In [36]:
# import dataset
training = pd.read_csv('./cognex_data_region_latest.csv')
training.shape

(70004, 15)

# Create date feature

In [37]:
def create_series(df, cols_to_concat, convert=True):
    df = df[cols_to_concat].copy()
    non_strings = [c for c in df[cols_to_concat] if df[c].dtype != 'object']

    if len(non_strings) > 0:
        if convert:
            df[non_strings] = df[non_strings].applymap(str)
        else:
            raise TypeError("columns must all be type str")

    df['series_id'] = df[cols_to_concat].apply(lambda x: '/'.join(x), axis=1)
    return df['series_id']

In [38]:
# Create a date column by concatenating year and month and adding 1st day of month

In [39]:
cols_to_concat = ['year', 'fiscal_period']
new_series = create_series(training, cols_to_concat, convert = True)
training['date'] = new_series
training.head(3)


Unnamed: 0,year,fiscal_period,Area,region,industry1,account classification,product,nb of appointments,nb of technical request,nb of opportunities created,nb of opportunities qualified,nb of opportunities forecasted,weighed funnel values,opportunities average nb of months,booking value,date
0,2017,12,Americas,Americas West,Other,Non-Strategic,INS FACTRY,0,0,0,0,0,0,0.0,410,2017/12
1,2017,12,Americas,Americas West,Other,Non-Strategic,INS SENSOR,0,0,0,0,0,0,0.0,0,2017/12
2,2017,12,Americas,Americas West,Other,Non-Strategic,DATAMAN,0,0,0,0,0,0,0.0,1121,2017/12


In [40]:
training.Area.unique()

array(['Americas', 'Europe', 'GTR China', 'Japan', 'KIA'], dtype=object)

In [41]:
training['date'] = training['date'].astype(str)+'/01'
training.head(3)

Unnamed: 0,year,fiscal_period,Area,region,industry1,account classification,product,nb of appointments,nb of technical request,nb of opportunities created,nb of opportunities qualified,nb of opportunities forecasted,weighed funnel values,opportunities average nb of months,booking value,date
0,2017,12,Americas,Americas West,Other,Non-Strategic,INS FACTRY,0,0,0,0,0,0,0.0,410,2017/12/01
1,2017,12,Americas,Americas West,Other,Non-Strategic,INS SENSOR,0,0,0,0,0,0,0.0,0,2017/12/01
2,2017,12,Americas,Americas West,Other,Non-Strategic,DATAMAN,0,0,0,0,0,0,0.0,1121,2017/12/01


In [42]:
# comvert date to datetime format
training['date']= pd.to_datetime(training['date'])

# Create new series id

In [43]:
# create series id concatenating Area and Product
cols_to_concat = ['Area', 'product']
new_series = dr_helpers.create_series_id(training, cols_to_concat, convert = True)
training['Area_product'] = new_series
training.head(3)

Unnamed: 0,year,fiscal_period,Area,region,industry1,account classification,product,nb of appointments,nb of technical request,nb of opportunities created,nb of opportunities qualified,nb of opportunities forecasted,weighed funnel values,opportunities average nb of months,booking value,date,Area_product
0,2017,12,Americas,Americas West,Other,Non-Strategic,INS FACTRY,0,0,0,0,0,0,0.0,410,2017-12-01,Americas_INS FACTRY
1,2017,12,Americas,Americas West,Other,Non-Strategic,INS SENSOR,0,0,0,0,0,0,0.0,0,2017-12-01,Americas_INS SENSOR
2,2017,12,Americas,Americas West,Other,Non-Strategic,DATAMAN,0,0,0,0,0,0,0.0,1121,2017-12-01,Americas_DATAMAN


In [44]:
# sort values 
training.sort_values(['Area_product', 'date'], ascending=[True, True], inplace=True)
training.head()

Unnamed: 0,year,fiscal_period,Area,region,industry1,account classification,product,nb of appointments,nb of technical request,nb of opportunities created,nb of opportunities qualified,nb of opportunities forecasted,weighed funnel values,opportunities average nb of months,booking value,date,Area_product
20,2017,12,Americas,Other,Automotive,Non-Strategic,3D,0,0,2,2,0,0,0.0,0,2017-12-01,Americas_3D
27,2017,12,Americas,Americas Central,Automotive,Non-Strategic,3D,0,0,3,3,0,0,0.0,33,2017-12-01,Americas_3D
41,2017,12,Americas,Americas LATAM,Automotive,Non-Strategic,3D,0,0,3,3,0,0,0.0,442,2017-12-01,Americas_3D
49,2017,12,Americas,Americas South,Automotive,Non-Strategic,3D,0,0,1,1,0,0,0.0,0,2017-12-01,Americas_3D
68,2017,12,Americas,Americas East,Automotive,Strategic,3D,0,0,0,0,0,0,0.0,98870,2017-12-01,Americas_3D


In [45]:
training.columns

Index(['year', 'fiscal_period', 'Area', 'region', 'industry1',
       'account classification', 'product', 'nb of appointments',
       'nb of technical request', 'nb of opportunities created',
       'nb of opportunities qualified', 'nb of opportunities forecasted',
       'weighed funnel values', 'opportunities average nb of months',
       'booking value', 'date', 'Area_product'],
      dtype='object')

# Set up project

In [46]:
#Set all values here.
#One important point, if you decide to change to monthly or weekly ts_settings should be 
#Changed afterwards, as weekly and monthly dates are based on the original date_col

today = pd.to_datetime("today")
target = 'booking value'
date_col = 'date'
kia = ['nb of opportunities forecasted', 'weighed funnel values', 'opportunities average nb of months']
series_id = 'Area_product'
pj_name = series_id + "_Project"+"_"+str(today)[0:10]
fd_start = 1
fd_end = 1
fdw_start = -3
fdw_end = 0
num_backtests = 2
#calendar =dr.CalendarFile.create("canada_calendar.csv")
# Create TS Settings
ts_settings = {'use_time_series':training,'project_name':pj_name, 'known_in_advance':kia,
               'series_id':series_id,  'target':target, 'date_col':date_col,'num_backtests':num_backtests,
               'fd_start':fd_start, 'fd_end':fd_end, 'fdw_start':fdw_start, 'fdw_end':fdw_end
               }


#Add a week date, and a month date 
training['week_number'] = training[ts_settings['date_col']].map(lambda x: x.strftime("%W"))
training['month_date'] = training[ts_settings['date_col']].map(lambda x: str(x)[0:7]+"-01")
training = training.sort_values(by=ts_settings['date_col'], ascending=False)

In [47]:
training.head()

Unnamed: 0,year,fiscal_period,Area,region,industry1,account classification,product,nb of appointments,nb of technical request,nb of opportunities created,nb of opportunities qualified,nb of opportunities forecasted,weighed funnel values,opportunities average nb of months,booking value,date,Area_product,week_number,month_date
69995,2020,7,KIA,KIA-KOREA,Semiconductor,Strategic Existing,WAFER ID,0,0,2,2,0,0,0.0,0,2020-07-01,KIA_WAFER ID,26,2020-07-01
70000,2020,7,KIA,KIA-INDIA,Semiconductor,White Space,INS FACTRY,0,1,2,0,0,0,0.0,0,2020-07-01,KIA_INS FACTRY,26,2020-07-01
68149,2020,7,Americas,Americas Central,Packaging,White Space,INS FACTRY,0,1,3,2,0,20000,0.0,23397,2020-07-01,Americas_INS FACTRY,26,2020-07-01
68154,2020,7,Americas,Americas East,Packaging,White Space,INS FACTRY,0,0,3,0,3,28030,1.0,40859,2020-07-01,Americas_INS FACTRY,26,2020-07-01
68159,2020,7,Americas,Americas LATAM,Packaging,White Space,INS FACTRY,0,1,6,2,0,0,0.0,-5055,2020-07-01,Americas_INS FACTRY,26,2020-07-01


# Supporting Functions

In [48]:
#Function to concatenate strings or integers into one string
def concate_unique_strings (val):
    tmp = ""
    val = set(val)
    val = list(val)
    for i in val:
        s = str(i).strip()
        tmp = tmp + ', '+ s
    tmp = tmp[2:]
    return (tmp)
#Create a data frame with series_id and series_date where series dates are continuous and 
#don't have gaps. This dataframe has not only continuous dates for each series_id, but also
#has month_date, and week_date in case one wants to aggregate by week or day
def  create_continuous_dates_df(data_df,series_id,series_date):
    '''data_df: dataframe
       series_id: the multi-series id in this case it can be Area, region, product or Area_product
       Create a dateframe with series_id and series_date'''
    product_dates = training.groupby(ts_settings["series_id"])[ts_settings["date_col"]].agg(['min', 'max'])
    product_dates = product_dates.reset_index()
    product_dates.rename(columns={'min':'Date_Min','max':'Date_Max'}, inplace=True)
    n_rows =product_dates.shape[0]
    invoice_df = pd.DataFrame()
    for i in range(n_rows):
        dates2add = pd.date_range(product_dates.iloc[i,:].Date_Min, product_dates.iloc[i,:].Date_Max).tolist()
        tmp_df = pd.DataFrame({series_id : product_dates.iloc[i,:][ts_settings["series_id"]],series_date : dates2add})
        invoice_df = invoice_df.append(tmp_df,ignore_index=True)
    invoice_df['years'] = invoice_df[ts_settings['date_col']].map( lambda x: str(x)[0:4])
    invoice_df['week_number'] = invoice_df[ts_settings['date_col']].map(lambda x: x.strftime("%W"))
    invoice_df = invoice_df.sort_values(by=ts_settings['date_col'], ascending=False)
    invoice_df['week_date'] = invoice_df.groupby(['years','week_number'])[ts_settings['date_col']].transform('min')
    invoice_df['month_date'] = invoice_df[ts_settings['date_col']].map(lambda x: str(x)[0:7]+"-01")
    
    return(invoice_df)



In [49]:
def group_by_area_product(data_df,series_id,data_col):
    '''data_df:  original dataframe (training)
       series_id: multi time-series group
       date_col : the date used by time-series
       This is just an example, on how one can combine categorical features.  I decided to 
       combine some, and just count the number of unique values for others'''
    
    products_df =  data_df.groupby([series_id,data_col]).agg(
        Area_Txt = ("Area",lambda x: concate_unique_strings(x)),
        region_Txt = ("region",lambda x: concate_unique_strings(x)),
        industry1_Txt = ("industry1",lambda x: concate_unique_strings(x)),
        account_class_Txt = ("account classification",lambda x: concate_unique_strings(x)),
        product_Txt = ("product",lambda x: concate_unique_strings(x))
    )
  
    tmp_agg = training.groupby([series_id,data_col]).agg({
       "Area" : ["nunique"], 
       "region" : ["nunique"],
       "industry1" : ["nunique"], 
       "account classification" : ["nunique"],
       "product" : ["nunique"],
       "booking value" :["sum"],
       "nb of appointments" :["sum"],
       "nb of technical request": ["sum"],
       "nb of opportunities created": ["sum"],
       "nb of opportunities forecasted": ["sum"],
       "nb of opportunities qualified": ["sum"],
       "weighed funnel values": ["sum"],
       "opportunities average nb of months": ["sum"]
    })
    tmp_agg.columns = ["_".join(x) for x in tmp_agg.columns.ravel()]
    products_final = pd.merge(left=products_df, right=tmp_agg, 
                          left_on=[series_id,data_col], 
                          right_on=[series_id,data_col],
                          how = "inner")
    products_final = products_final.reset_index()
    return(products_final)

In [50]:
group_map = {"Area_product": group_by_area_product}

In [51]:

#Prepare the dataframe for DataRobot
def make_dataset(date_df,data_df):
    '''data_df: in here will be the original dataset training
    This function just combine all the other functions written above to create 
    the final dataframe that will be used for training'''
    date_df.loc[:,[ts_settings['series_id'],ts_settings['date_col']]].drop_duplicates( keep = 'first', inplace = True)
    tmp_df = pd.merge(left=date_df, right=data_df, 
                          left_on=[ts_settings['series_id'],ts_settings['date_col']], 
                          right_on=[ts_settings['series_id'],ts_settings['date_col']],
                          how = "left")
    action = group_map[ts_settings["series_id"]]
    final_df = action(tmp_df,ts_settings['series_id'],ts_settings['date_col'])
    final_df.rename(columns={ts_settings['target']+"_sum":ts_settings['target']}, inplace=True)
    features_2_set_2_zero = list(final_df.select_dtypes(include=[np.number]).columns.values)
    final_df.loc[:,features_2_set_2_zero] = final_df.loc[:,features_2_set_2_zero].fillna(0)
    tmp_df = date_df.loc[:,[ts_settings['series_id'],ts_settings['date_col']]].copy()
    tmp_df.drop_duplicates(keep = 'first', inplace = True)
    final_df = pd.merge(left=tmp_df, right=final_df, 
                          left_on=[ts_settings['series_id'],ts_settings['date_col']], 
                          right_on=[ts_settings['series_id'],ts_settings['date_col']],
                          how = "left")
    features_2_set_2_zero = list(final_df.select_dtypes(include=[np.number]).columns.values)
    final_df.loc[:,features_2_set_2_zero] = final_df.loc[:,features_2_set_2_zero].fillna(0) 
    final_df[ts_settings['date_col']]= pd.to_datetime(final_df[ts_settings['date_col']]) 
    return(final_df)



In [52]:
#Aggregate by BRP_Model_Group
def prepare_data():
    ts_settings['date_col'] = "month_date"
    date_df =  create_continuous_dates_df(ts_settings['use_time_series'],
                                          ts_settings['series_id'],ts_settings['date_col'])
    final_df = make_dataset(date_df,training) 
    return (final_df)

In [53]:
final_df = prepare_data()
ts_settings['use_time_series'] = final_df

In [54]:
final_df.sort_values(by=[ts_settings['series_id'],ts_settings['date_col']])

Unnamed: 0,Area_product,month_date,Area_Txt,region_Txt,industry1_Txt,account_class_Txt,product_Txt,Area_nunique,region_nunique,industry1_nunique,account classification_nunique,product_nunique,booking value,nb of appointments_sum,nb of technical request_sum,nb of opportunities created_sum,nb of opportunities forecasted_sum,nb of opportunities qualified_sum,weighed funnel values_sum,opportunities average nb of months_sum
2047,Americas_3D,2017-12-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,4.0,4.0,1.0,801699.0,0.0,0.0,42.0,0.0,42.0,0.0,0.0
1999,Americas_3D,2018-01-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,4.0,5.0,1.0,184511.0,0.0,25.0,0.0,0.0,55.0,0.0,0.0
1924,Americas_3D,2018-02-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,5.0,5.0,1.0,49768.0,0.0,79.0,0.0,0.0,70.0,0.0,0.0
1831,Americas_3D,2018-03-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,5.0,5.0,1.0,733159.0,0.0,150.0,0.0,0.0,72.0,0.0,0.0
1779,Americas_3D,2018-04-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,5.0,4.0,1.0,655603.0,0.0,130.0,0.0,0.0,59.0,0.0,0.0
1734,Americas_3D,2018-05-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,5.0,4.0,1.0,251937.0,0.0,132.0,0.0,0.0,71.0,0.0,0.0
1689,Americas_3D,2018-06-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Most Wanted, Non-Strategic, Strategic Existing",3D,1.0,6.0,5.0,4.0,1.0,537849.0,0.0,157.0,0.0,0.0,77.0,0.0,0.0
1564,Americas_3D,2018-07-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,5.0,4.0,1.0,294637.0,0.0,82.0,0.0,0.0,48.0,0.0,0.0
1523,Americas_3D,2018-08-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,5.0,4.0,1.0,551017.0,0.0,79.0,0.0,0.0,43.0,0.0,0.0
1495,Americas_3D,2018-09-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,4.0,4.0,1.0,333616.0,0.0,101.0,0.0,0.0,71.0,0.0,0.0


In [55]:
final_df.reset_index(drop=True)

Unnamed: 0,Area_product,month_date,Area_Txt,region_Txt,industry1_Txt,account_class_Txt,product_Txt,Area_nunique,region_nunique,industry1_nunique,account classification_nunique,product_nunique,booking value,nb of appointments_sum,nb of technical request_sum,nb of opportunities created_sum,nb of opportunities forecasted_sum,nb of opportunities qualified_sum,weighed funnel values_sum,opportunities average nb of months_sum
0,KIA_WAFER ID,2020-07-01,KIA,"KIA-ASEAN, KIA-Samsung, KIA-KOREA","Electronics, Semiconductor, Other","Strategic, Non-Strategic, Strategic Existing, White Space",WAFER ID,1.0,3.0,3.0,4.0,1.0,280592.0,11.0,0.0,8.0,11.0,7.0,205330.0,9.4
1,GTR China_DATAMAN,2020-07-01,GTR China,"GTR China East 1, GTR China South 1, Other, GTR China North, GTR China South 2, GTR China East 2","Electronics, Life Sciences, Semiconductor, Packaging, Automotive, Other, Logistics","Strategic, Other, Most Wanted, Non-Strategic, White Space, Strategic Existing",DATAMAN,1.0,6.0,7.0,6.0,1.0,2014886.0,431.0,44.0,993.0,585.0,331.0,2200523.0,131.95
2,GTR China_3D SENSOR,2020-07-01,GTR China,"GTR China East 1, GTR China South 1","Electronics, Automotive, Other","Strategic, Non-Strategic, Strategic Existing",3D SENSOR,1.0,2.0,3.0,3.0,1.0,6192.0,0.0,0.0,3.0,0.0,3.0,2120.0,0.0
3,GTR China_ADV ENGINE,2020-07-01,GTR China,"GTR China East 1, GTR China South 2","Electronics, Other",Strategic Existing,ADV ENGINE,1.0,2.0,2.0,1.0,1.0,4454.0,0.0,2.0,0.0,3.0,0.0,322.0,2.0
4,Americas_3D,2020-07-01,Americas,"Americas East, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Other, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,5.0,5.0,6.0,1.0,53975.0,52.0,6.0,46.0,7.0,34.0,76704.0,11.0
5,GTR China_CHECKER,2020-07-01,GTR China,GTR China North,Packaging,Strategic,CHECKER,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,1017.0,1.5
6,Americas_WAFER ID,2020-07-01,Americas,"Americas East, Americas West, Americas Central, Americas LATAM, Americas South","Semiconductor, Other, Packaging","Strategic, Other, Non-Strategic, White Space, Strategic Existing",WAFER ID,1.0,5.0,3.0,5.0,1.0,191404.0,2.0,0.0,11.0,5.0,11.0,71544.0,2.5
7,Americas_DATAMAN,2020-07-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other, Logistics","Strategic, Other, Most Wanted, Non-Strategic, White Space, Strategic Existing",DATAMAN,1.0,6.0,6.0,6.0,1.0,2997366.0,466.0,73.0,1277.0,469.0,568.0,2065054.0,116.44
8,Europe_ADV ENGINE,2020-07-01,Europe,Europe West,Other,Strategic Existing,ADV ENGINE,1.0,1.0,1.0,1.0,1.0,13125.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,KIA_DEEP LEARNING,2020-07-01,KIA,"KIA-ASEAN, KIA-KOREA, Other, KIA-INDIA, KIA-Samsung","Electronics, Semiconductor, Packaging, Automotive, Other, Logistics","Strategic, Other, Most Wanted, Non-Strategic, White Space, Strategic Existing",DEEP LEARNING,1.0,5.0,6.0,6.0,1.0,714269.0,355.0,327.0,95.0,69.0,74.0,758256.0,61.49


In [56]:
final_df.sort_values(by=[ts_settings['series_id'],ts_settings['date_col']], inplace=True)

In [57]:
final_df.head()

Unnamed: 0,Area_product,month_date,Area_Txt,region_Txt,industry1_Txt,account_class_Txt,product_Txt,Area_nunique,region_nunique,industry1_nunique,account classification_nunique,product_nunique,booking value,nb of appointments_sum,nb of technical request_sum,nb of opportunities created_sum,nb of opportunities forecasted_sum,nb of opportunities qualified_sum,weighed funnel values_sum,opportunities average nb of months_sum
2047,Americas_3D,2017-12-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,4.0,4.0,1.0,801699.0,0.0,0.0,42.0,0.0,42.0,0.0,0.0
1999,Americas_3D,2018-01-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,4.0,5.0,1.0,184511.0,0.0,25.0,0.0,0.0,55.0,0.0,0.0
1924,Americas_3D,2018-02-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,5.0,5.0,1.0,49768.0,0.0,79.0,0.0,0.0,70.0,0.0,0.0
1831,Americas_3D,2018-03-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,5.0,5.0,1.0,733159.0,0.0,150.0,0.0,0.0,72.0,0.0,0.0
1779,Americas_3D,2018-04-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,5.0,4.0,1.0,655603.0,0.0,130.0,0.0,0.0,59.0,0.0,0.0


In [58]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2070 entries, 2047 to 0
Data columns (total 20 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   Area_product                            2070 non-null   object        
 1   month_date                              2070 non-null   datetime64[ns]
 2   Area_Txt                                1969 non-null   object        
 3   region_Txt                              1969 non-null   object        
 4   industry1_Txt                           1969 non-null   object        
 5   account_class_Txt                       1969 non-null   object        
 6   product_Txt                             1969 non-null   object        
 7   Area_nunique                            2070 non-null   float64       
 8   region_nunique                          2070 non-null   float64       
 9   industry1_nunique                       2070 non-nul

In [74]:
final_df['date1']= pd.to_datetime(final_df['month_date'].dt.strftime("%Y-%m"))

In [75]:
#Set all values here.
#One important point, if you decide to change to monthly or weekly ts_settings should be 
#Changed afterwards, as weekly and monthly dates are based on the original date_col

today = pd.to_datetime("today")
target = 'booking value'
date_col = 'date1'
kia = ['nb of opportunities forecasted', 'weighed funnel values', 'opportunities average nb of months']
series_id = 'Area_product'
pj_name = series_id + "_Project"+"_"+str(today)[0:10]
fd_start = 1
fd_end = 1
fdw_start = -3
fdw_end = 0
num_backtests = 2
#calendar =dr.CalendarFile.create("canada_calendar.csv")
# Create TS Settings
ts_settings = {'use_time_series':training,'project_name':pj_name, 'known_in_advance':kia,
               'series_id':series_id,  'target':target, 'date_col':date_col,'num_backtests':num_backtests,
               'fd_start':fd_start, 'fd_end':fd_end, 'fdw_start':fdw_start, 'fdw_end':fdw_end
               }



In [76]:
# Rerun our data quality check
ts_describe = dr_helpers.DataQualityCheck(final_df, ts_settings)

Running all data quality checks...

There are 2070 rows and 21 columns
There are 70 series
The data spans from  2017-12-01 00:00:00 to 2020-07-01 00:00:00
Data fails hierarchical check! No hierarchical blueprints will run.
Minimum target value is <> 0. Zero-inflated blueprints will not run.
 16.00% of the series appeared after the start of the training data
 9.00% of the series were removed before the end of the training data
No duplicate timestamps detected within any series
0.00% of series have at least one missing time step.
0.00% of the rows are missing a target value
11.43% series have zeros in more than 99.00% or more of the rows


ValueError: 2678400.0 seconds is not a supported timestep

In [81]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2070 entries, 2047 to 0
Data columns (total 21 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   Area_product                            2070 non-null   object        
 1   month_date                              2070 non-null   datetime64[ns]
 2   Area_Txt                                1969 non-null   object        
 3   region_Txt                              1969 non-null   object        
 4   industry1_Txt                           1969 non-null   object        
 5   account_class_Txt                       1969 non-null   object        
 6   product_Txt                             1969 non-null   object        
 7   Area_nunique                            2070 non-null   float64       
 8   region_nunique                          2070 non-null   float64       
 9   industry1_nunique                       2070 non-nul

In [82]:
final_df.head()

Unnamed: 0,Area_product,month_date,Area_Txt,region_Txt,industry1_Txt,account_class_Txt,product_Txt,Area_nunique,region_nunique,industry1_nunique,...,product_nunique,booking value,nb of appointments_sum,nb of technical request_sum,nb of opportunities created_sum,nb of opportunities forecasted_sum,nb of opportunities qualified_sum,weighed funnel values_sum,opportunities average nb of months_sum,date1
2047,Americas_3D,2017-12-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,4.0,...,1.0,801699.0,0.0,0.0,42.0,0.0,42.0,0.0,0.0,2017-12-01
2066,Americas_3D SENSOR,2017-12-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D SENSOR,1.0,6.0,4.0,...,1.0,107650.0,0.0,0.0,35.0,0.0,34.0,0.0,0.0,2017-12-01
2069,Americas_ADV ENGINE,2017-12-01,Americas,Americas East,Other,Strategic Existing,ADV ENGINE,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,2017-12-01
2021,Americas_CHECKER,2017-12-01,Americas,"Americas East, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Non-Strategic, Strategic Existing, White Space",CHECKER,1.0,5.0,4.0,...,1.0,128613.0,0.0,0.0,7.0,0.0,6.0,0.0,0.0,2017-12-01
2013,Americas_DATAMAN,2017-12-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other, Logistics","Strategic, Other, Most Wanted, Non-Strategic, White Space, Strategic Existing",DATAMAN,1.0,6.0,6.0,...,1.0,4831862.0,0.0,0.0,418.0,0.0,400.0,0.0,0.0,2017-12-01


In [79]:
ts_settings["series_id"]

'Area_product'

In [80]:
ts_settings["date_col"]

'date1'

In [77]:
deltas = final_df.groupby(ts_settings["series_id"])[ts_settings["date_col"]].diff().reset_index(drop=True)

In [78]:
deltas.head()

0   NaT
1   NaT
2   NaT
3   NaT
4   NaT
Name: date1, dtype: timedelta64[ns]

In [None]:
median_timestep = deltas.apply(lambda x: x.total_seconds()).median()

In [61]:
final_df.head()

Unnamed: 0,Area_product,month_date,Area_Txt,region_Txt,industry1_Txt,account_class_Txt,product_Txt,Area_nunique,region_nunique,industry1_nunique,...,product_nunique,booking value,nb of appointments_sum,nb of technical request_sum,nb of opportunities created_sum,nb of opportunities forecasted_sum,nb of opportunities qualified_sum,weighed funnel values_sum,opportunities average nb of months_sum,date1
2047,Americas_3D,2017-12-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,4.0,...,1.0,801699.0,0.0,0.0,42.0,0.0,42.0,0.0,0.0,2017-12
1999,Americas_3D,2018-01-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Automotive, Other, Packaging","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,4.0,...,1.0,184511.0,0.0,25.0,0.0,0.0,55.0,0.0,0.0,2018-01
1924,Americas_3D,2018-02-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,5.0,...,1.0,49768.0,0.0,79.0,0.0,0.0,70.0,0.0,0.0,2018-02
1831,Americas_3D,2018-03-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Most Wanted, Non-Strategic, White Space, Strategic Existing",3D,1.0,6.0,5.0,...,1.0,733159.0,0.0,150.0,0.0,0.0,72.0,0.0,0.0,2018-03
1779,Americas_3D,2018-04-01,Americas,"Americas East, Other, Americas West, Americas Central, Americas LATAM, Americas South","Electronics, Semiconductor, Packaging, Automotive, Other","Strategic, Non-Strategic, Strategic Existing, White Space",3D,1.0,6.0,5.0,...,1.0,655603.0,0.0,130.0,0.0,0.0,59.0,0.0,0.0,2018-04


In [103]:
#Decide which one to save, and then just change the name
final_df.to_csv(ts_settings['project_name']+'monthkia'+".csv")

In [33]:
final_df.columns

Index(['Area_product', 'month_date', 'Area_Txt', 'region_Txt', 'industry1_Txt',
       'account_class_Txt', 'product_Txt', 'Area_nunique', 'region_nunique',
       'industry1_nunique', 'account classification_nunique',
       'product_nunique', 'booking value', 'nb of appointments_sum',
       'nb of technical request_sum', 'nb of opportunities created_sum',
       'nb of opportunities forecasted_sum',
       'nb of opportunities qualified_sum', 'weighed funnel values_sum',
       'opportunities average nb of months_sum'],
      dtype='object')

In [104]:
def start_modeling():
    '''provide a project name, and the dataframe for training
        1. Create project
        2. Set specifications for time-series modeling
        3. Start project'''
    project = dr.Project.create(project_name=ts_settings['project_name'], sourcedata=ts_settings['use_time_series'])
    mySpec = dr.DatetimePartitioningSpecification(
                datetime_partition_column=ts_settings['date_col'],
                use_time_series=True,
                multiseries_id_columns=[ts_settings['series_id']],
                number_of_backtests = ts_settings['num_backtests'],
                default_to_known_in_advance=False,
                feature_derivation_window_start=ts_settings['fdw_start'],
                feature_derivation_window_end= ts_settings['fdw_end'],
                forecast_window_start=ts_settings['fd_start'],
                forecast_window_end=ts_settings['fd_end'],
        

    )
    project.set_target(
            target = ts_settings['target'],
            partitioning_method = mySpec,
            max_wait = 1000000,
            mode=dr.AUTOPILOT_MODE.QUICK  ##### Here is where we can change to dr.AUTOPILOT_MODE.FULL_AUTO
    )
    project.wait_for_autopilot()  ##Wait for autopilot to finish
    return(project)

In [105]:
#You can add a calendar as follows
project = start_modeling()

In progress: 2, queued: 11 (waited: 0s)
In progress: 2, queued: 11 (waited: 1s)
In progress: 2, queued: 11 (waited: 2s)
In progress: 2, queued: 11 (waited: 3s)
In progress: 2, queued: 11 (waited: 4s)
In progress: 2, queued: 11 (waited: 7s)
In progress: 2, queued: 11 (waited: 11s)
In progress: 2, queued: 11 (waited: 18s)
In progress: 2, queued: 11 (waited: 31s)
In progress: 2, queued: 11 (waited: 52s)
In progress: 2, queued: 9 (waited: 73s)
In progress: 2, queued: 9 (waited: 93s)
In progress: 2, queued: 9 (waited: 114s)
In progress: 2, queued: 8 (waited: 135s)
In progress: 2, queued: 7 (waited: 155s)
In progress: 2, queued: 7 (waited: 176s)
In progress: 2, queued: 6 (waited: 197s)
In progress: 2, queued: 5 (waited: 217s)
In progress: 6, queued: 0 (waited: 238s)
In progress: 6, queued: 0 (waited: 259s)
In progress: 5, queued: 0 (waited: 279s)
In progress: 1, queued: 0 (waited: 300s)
In progress: 4, queued: 0 (waited: 320s)
In progress: 4, queued: 0 (waited: 341s)
In progress: 1, queued: 

### Data quality check

In [57]:
# Rerun our data quality check
ts_describe = data_quality_check(final_df, ts_settings)

Running all data quality checks...

There are 2070 rows and 20 columns
There are 70 series
The data spans from  2017-12-01 00:00:00 to 2020-07-01 00:00:00
Data fails hierarchical check! No hierarchical blueprints will run.
Minimum target value is > 0. Zero-inflated blueprints will not run.
 16.00% of the series appeared after the start of the training data
 9.00% of the series were removed before the end of the training data
No duplicate timestamps detected within any series
No leading or trailing zeros detected within series
0.00% of series have at least one missing time step.
0.00% of the rows are missing a target value
11.43% series have zeros in more than 99.00% or more of the rows


ValueError: 2678400.0 seconds is not a supported timestep

In [58]:
training = fill_missing_dates(final_df, ts_settings)

ValueError: 2678400.0 seconds is not a supported timestep

### Plot series

In [59]:
# Plot our average target over time
plot_series_average(final_df, ts_settings)

In [60]:
# Bottom 35 series
plot_individual_series(final_df, ts_settings, n=35, top=False)

In [61]:
# Top 35 series
plot_individual_series(final_df, ts_settings, n=35, top=True)

In [62]:
# Create and plot a Holiday calendar
plot_ts_calendar(final_df, ts_settings, calendar=None)

#Note: you don't need to have an already created calendar

ValueError: 2678400.0 seconds is not a supported timestep

### Clustering

In [67]:
# Series clustering
method='correlation'
nlags=None
alpha=None
scale=True
scale_method=None
split_method=None
n_clusters=None
max_clusters= 2
plot=True

df_w_clusters =  add_cluster_labels(final_df, 
                                    ts_settings, 
                                    method=method,
                                    scale=scale,
                                    scale_method=scale_method,
                                    alpha=alpha,
                                    nlags=nlags, 
                                    split_method=split_method,
                                    n_clusters=n_clusters, 
                                    max_clusters=max_clusters, 
                                    plot=True)

plot_clusters(df_w_clusters, ts_settings)

Testing 2 to 2 clusters
For n_clusters = 2, silhouette score is 0.26787188484505303
optimal n_clusters = 2, max silhouette score is 0.26787188484505303


In [64]:
# We can force 4 clusters
method='correlation'
nlags=None
alpha=None
scale=True
scale_method=None
n_clusters=4
max_clusters=None
plot=True

df_w_clusters =  add_cluster_labels(final_df, 
                                    ts_settings, 
                                    method=method,
                                    scale=scale,
                                    scale_method=scale_method,
                                    alpha=alpha,
                                    nlags=nlags, 
                                    n_clusters=n_clusters, 
                                    max_clusters=max_clusters, 
                                    plot=True)

plot_clusters(df_w_clusters, ts_settings)

### Add cross series features

In [69]:
df_w_clusters.head(3)

Unnamed: 0,Area_product,month_date,Area_Txt,region_Txt,industry1_Txt,account_class_Txt,product_Txt,Area_nunique,region_nunique,industry1_nunique,...,product_nunique,booking value,nb of appointments_sum,nb of technical request_sum,nb of opportunities created_sum,nb of opportunities forecasted_sum,nb of opportunities qualified_sum,weighed funnel values_sum,opportunities average nb of months_sum,Cluster
0,Americas_3D,2017-12-01,Americas,"Americas LATAM, Other, Americas East, Americas West, Americas Central, Americas South","Other, Electronics, Automotive, Packaging","Strategic, White Space, Strategic Existing, Non-Strategic",3D,1.0,6.0,4.0,...,1.0,801699.0,0.0,0.0,42.0,0.0,42.0,0.0,0.0,0
32,Americas_3D SENSOR,2017-12-01,Americas,"Americas LATAM, Other, Americas East, Americas West, Americas Central, Americas South","Other, Electronics, Automotive, Packaging","Strategic Existing, Non-Strategic, Strategic, Most Wanted, White Space",3D SENSOR,1.0,6.0,4.0,...,1.0,107650.0,0.0,0.0,35.0,0.0,34.0,0.0,0.0,0
64,Americas_ADV ENGINE,2017-12-01,Americas,Americas East,Other,Strategic Existing,ADV ENGINE,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0


In [71]:
df_w_clusters.columns

Index(['Area_product', 'month_date', 'Area_Txt', 'region_Txt', 'industry1_Txt',
       'account_class_Txt', 'product_Txt', 'Area_nunique', 'region_nunique',
       'industry1_nunique', 'account classification_nunique',
       'product_nunique', 'booking value', 'nb of appointments_sum',
       'nb of technical request_sum', 'nb of opportunities created_sum',
       'nb of opportunities forecasted_sum',
       'nb of opportunities qualified_sum', 'weighed funnel values_sum',
       'opportunities average nb of months_sum', 'Cluster'],
      dtype='object')

In [72]:
#Set all values here.
#One important point, if you decide to change to monthly or weekly ts_settings should be 
#Changed afterwards, as weekly and monthly dates are based on the original date_col

today = pd.to_datetime("today")
target = 'booking value'
date_col = 'month_date'
kia = ['nb of opportunities forecasted_sum', 'weighed funnel values_sum', 'opportunities average nb of months_sum']
series_id = 'Area_product'
pj_name = series_id + "_Project"+"_"+str(today)[0:10]
fd_start = 1
fd_end = 1
fdw_start = -3
fdw_end = 0
num_backtests = 2
#calendar =dr.CalendarFile.create("canada_calendar.csv")
# Create TS Settings
ts_settings = {'use_time_series':training,'project_name':pj_name, 'known_in_advance':kia,
               'series_id':series_id,  'target':target, 'date_col':date_col,'num_backtests':num_backtests,
               'fd_start':fd_start, 'fd_end':fd_end, 'fdw_start':fdw_start, 'fdw_end':fdw_end
               }

In [84]:
# Set defaults

target = 'booking value'
date_col = 'month_date'
series_id = 'Area_product'
kia = ['nb of opportunities forecasted_sum', 'weighed funnel values_sum', 'opportunities average nb of months_sum']
num_backtests = 3
validation_duration = 1
holdout_duration = 1
disable_holdout = True
metric = 'RMSE'
use_time_series=True
fd_start = 1
fd_end = 1
fdw_start = -3
fdw_end = 0
max_date = pd.to_datetime(training[date_col]).max()

# Create TS Settings
ts_settings = {'max_date':max_date, 'known_in_advance':kia, 'num_backtests':num_backtests, 
               'validation_duration':validation_duration, 'holdout_duration':holdout_duration,
               'disable_holdout':disable_holdout,'use_time_series':use_time_series,
               'series_id':series_id, 'metric':metric, 'target':target, 'date_col':date_col,
               'fd_start':fd_start, 'fd_end':fd_end, 'fdw_start':fdw_start, 'fdw_end':fdw_end}

In [73]:
df_w_clusters =  create_cross_series_features(df_w_clusters, 
                                              group=[date_col,'Cluster'], 
                                              cols=[target,'nb of opportunities created_sum','nb of opportunities qualified_sum'], 
                                              funcs=['mean','std'])
df_w_clusters.head(2)

Unnamed: 0,Area_product,month_date,Area_Txt,region_Txt,industry1_Txt,account_class_Txt,product_Txt,Area_nunique,region_nunique,industry1_nunique,...,nb of opportunities qualified_sum,weighed funnel values_sum,opportunities average nb of months_sum,Cluster,booking value_mean,booking value_std,nb of opportunities created_sum_mean,nb of opportunities created_sum_std,nb of opportunities qualified_sum_mean,nb of opportunities qualified_sum_std
0,Americas_3D,2017-12-01,Americas,"Americas LATAM, Other, Americas East, Americas West, Americas Central, Americas South","Other, Electronics, Automotive, Packaging","Strategic, White Space, Strategic Existing, Non-Strategic",3D,1.0,6.0,4.0,...,42.0,0.0,0.0,0,1097396.0,2112247.0,100.428571,156.379642,98.190476,154.146151
1,Americas_3D SENSOR,2017-12-01,Americas,"Americas LATAM, Other, Americas East, Americas West, Americas Central, Americas South","Other, Electronics, Automotive, Packaging","Strategic Existing, Non-Strategic, Strategic, Most Wanted, White Space",3D SENSOR,1.0,6.0,4.0,...,34.0,0.0,0.0,0,1097396.0,2112247.0,100.428571,156.379642,98.190476,154.146151


In [88]:
# Build separate projects by forecast distance, feature derivation window, and/or cluster
fdws=[(-5,0),(-3,0),(-2,0)]
fds = [(1,1)]

In [89]:
projects = create_dr_projects(df_w_clusters, 
                             ts_settings, 
                             prefix='TS',
                             split_col='Cluster', 
                             fdws=fdws, 
                             fds=fds)

Kicking off 6 projects

Building Next Project 
...



ValueError: 2678400.0 seconds is not a supported timestep