In [4]:
# Prerequisites: pip install upgini catboost
from os.path import exists
import pandas as pd

In [5]:
# Load the sales data, preferring a local copy of the archive over a download
local_archive = "train.csv.zip"
if exists(local_archive):
    df_path = local_archive
else:
    df_path = "https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip"

# Keep a reproducible random subset of 19,000 rows (fixed seed)
df = pd.read_csv(df_path).sample(n=19000, random_state=0)

In [7]:
# Data processing
# Build the cleaned frame in one chain rather than mutating `df` in place:
#   1. cast the store and item columns to strings
#   2. parse the date column into pandas datetimes
#   3. order rows by date and replace the index with a fresh 0..n-1 range
df = (
    df.astype({"store": str, "item": str})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


In [8]:
# Chronological hold-out split on the "date" column:
#   train -> rows dated before 2017-01-01 (2013-2016)
#   test  -> rows dated 2017-01-01 or later (2017)
test_start = "2017-01-01"
train = df.loc[df["date"] < test_start]
test = df.loc[df["date"] >= test_start]

In [9]:
# Separate each partition into model inputs and the label.
# Inputs: every column except "sales"; label: the "sales" column itself.
label = "sales"
train_features, train_target = train.drop(label, axis=1), train[label]
test_features, test_target = test.drop(label, axis=1), test[label]

Enrich Features (Upgini)

In [13]:
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

# Configure the enricher: "date" is the only search key, and
# cross-validation is set to CVType.time_series.
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, cv=CVType.time_series)

# Fit on the training split; the test split is supplied as the evaluation set.
enricher.fit(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
)

Try to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset
for search through all the available data sources.
See docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history


Detected task type: ModelTaskType.REGRESSION. Reason: date search key is present, treating as regression
You can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly




Column name,Status,Errors
date,All valid,-
target,All valid,-




Running search request, search_id=cbc5e515-2bf1-49fb-8cda-6611631a69fc
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
\


Feature name,SHAP value,Coverage %,Value preview,Provider,Source,Updates
f_economic_date_cbpol_umap_6_aa0352de,9.8908,100.0,"1.0587, 6.9596, 1.0408",Upgini,World economic indicators,Daily
f_autofe_groupbythenmean_d5f4f809f4,8.419,100.0,"0.0981, 0.0598, 0.0905","Upgini,Training dataset","AutoFE: feature from Markets data, grouped by feature from training dataset",Daily
f_autofe_groupbythenmedian_64b28257d7,7.9453,100.0,"1.0005, 1.0012, 1.0011","Upgini,Training dataset","AutoFE: feature from Markets data, grouped by feature from training dataset",Daily
f_autofe_groupbythenrank_d595da62f7,3.4884,100.0,"0.3525, 0.5, 0.2133","Upgini,Training dataset","AutoFE: feature from Calendar data, grouped by feature from training dataset",Daily
f_financial_date_crude_oil_7d_to_1y_c3e0ad17,2.1476,100.0,"1.0001, 1.0769, 1.0154",Upgini,Markets data,Daily
f_autofe_roll_3d_min_1a2ad6d8bb,2.1034,100.0,"0.3253, -0.2795, -0.3817","Training dataset,Upgini","AutoFE: features from Training dataset,Calendar data",Daily
f_events_date_year_cos1_9014a856,1.6919,100.0,"0.3253, -0.263, -0.3496",Upgini,Calendar data,Daily
f_autofe_roll_3d_median_8e12f1e1e9,1.6219,100.0,"0.309, -0.263, -0.3496","Training dataset,Upgini","AutoFE: features from Training dataset,Calendar data",Daily
store,1.1012,100.0,"7, 10, 5",,,
f_autofe_groupbythenrank_110e5b8df1,0.8017,100.0,"0.0628, 0.2215, 0.7964","Upgini,Training dataset","AutoFE: feature from Calendar data, grouped by feature from training dataset",Daily


Provider,Source,All features SHAP,Number of relevant features
"Upgini,Training dataset","AutoFE: feature from Markets data, grouped by feature from training dataset",16.3643,2
Upgini,World economic indicators,9.8908,1
"Upgini,Training dataset","AutoFE: feature from Calendar data, grouped by feature from training dataset",4.944,3
"Training dataset,Upgini","AutoFE: features from Training dataset,Calendar data",3.7253,2
Upgini,Calendar data,3.1746,4
Upgini,Markets data,2.1476,1
Training dataset,AutoFE: features from Training dataset,0.553,2


Sources,Feature name,Feature 1,Feature 2,Function
"Markets data, grouped by feature from training dataset",f_autofe_groupbythenmean_d5f4f809f4,f_financial_date_vix_gap_b64bd2b9,store_824d80,GroupByThenMean
"Markets data, grouped by feature from training dataset",f_autofe_groupbythenmedian_64b28257d7,f_financial_date_usd_eur_1d_to_7d_d8b89b5d,item_4a33ea,GroupByThenMedian
"Calendar data, grouped by feature from training dataset",f_autofe_groupbythenrank_d595da62f7,f_events_date_week_cos3_7525fe31,store_824d80,GroupByThenRank
"Training dataset,Calendar data",f_autofe_roll_3d_min_1a2ad6d8bb,f_events_date_year_cos1_9014a856,,roll_3d_min
"Training dataset,Calendar data",f_autofe_roll_3d_median_8e12f1e1e9,f_events_date_year_cos1_9014a856,,roll_3d_median
"Calendar data, grouped by feature from training dataset",f_autofe_groupbythenrank_110e5b8df1,f_events_date_week_cos1_f6a8c1fc,store_824d80,GroupByThenRank
"Calendar data, grouped by feature from training dataset",f_autofe_groupbythenrank_3729e8053b,f_events_date_week_cos1_f6a8c1fc,item_4a33ea,GroupByThenRank
Training dataset,f_autofe_roll_3d_max_946bf6086b,target,,roll_3d_max
Training dataset,f_autofe_lag_7d_4a1d9c9dff,target,,lag_7d


We detected 48 outliers in your sample.
Examples of outliers with maximum value of target:
33    205
17    196
12    187
Name: target, dtype: int64
Outliers will be excluded during the metrics calculation.
Calculating accuracy uplift after enrichment...
y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,
which makes metrics between the train and eval_set incomparable.
|

Dataset type,Rows,Mean target,Baseline MAPE,Enriched MAPE,"Uplift, abs","Uplift, %"
Train,9418,53.3352,0.325 ± 0.109,0.236 ± 0.087,0.089,27.4%
Eval 1,3764,58.5994,0.278 ± 0.008,0.209 ± 0.024,0.068,24.7%
