In [1]:
import pandas as pd

# Load the raw sales records from CSV; the preview in the next cell shows
# the columns date, store, item and sales.
df = pd.read_csv('sales_data.csv')

In [2]:
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [3]:
df.shape

(913000, 4)

In [4]:
df.dtypes

date     object
store     int64
item      int64
sales     int64
dtype: object

In [5]:
# The date column is read as plain strings (dtype object); convert it to
# datetime64 so it can be compared against dates and used as a search key.
df['date'] = pd.to_datetime(df['date'])

In [6]:
df.dtypes

date     datetime64[ns]
store             int64
item              int64
sales             int64
dtype: object

In [7]:
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [8]:
# Down-sample the 913,000-row frame to 19,000 rows (presumably to keep the
# enrichment search and model training quick). The fixed random_state makes
# the sample reproducible. The sample size must match the recorded shape
# (19000, 4) and the 15213 + 3787 train/test row counts shown further down.
df = df.sample(n=19_000, random_state=0)

In [9]:
df.shape

(19000, 4)

In [10]:
df.head()

Unnamed: 0,date,store,item,sales
335813,2017-07-14,4,19,56
630838,2015-05-19,6,35,45
365685,2014-05-01,1,21,48
322781,2016-11-06,7,18,85
151590,2013-02-02,4,9,46


In [11]:
# Put the sampled rows back into chronological order.
df = df.sort_values(by='date')

In [12]:
df.head()

Unnamed: 0,date,store,item,sales
83996,2013-01-01,7,5,5
151558,2013-01-01,4,9,19
584320,2013-01-01,1,33,37
734052,2013-01-01,3,41,14
427284,2013-01-01,5,24,26


In [13]:
# Replace the shuffled original row labels with a fresh 0..n-1 index;
# drop=True discards the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)

In [14]:
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


## Split data into training and testing

In [15]:
# Time-based split: rows dated before the cutoff are used for training,
# rows on or after it are held out for testing.
split_date = '2017-01-01'
train = df[df['date'] < split_date]
test = df[df['date'] >= split_date]

In [16]:
train.shape

(15213, 4)

In [17]:
test.shape

(3787, 4)

In [18]:
# Separate the predictors from the `sales` target for both partitions.
train_features, train_target = train.drop(columns='sales'), train['sales']
test_features, test_target = test.drop(columns='sales'), test['sales']

In [19]:
train_features.head()

Unnamed: 0,date,store,item
0,2013-01-01,7,5
1,2013-01-01,4,9
2,2013-01-01,1,33
3,2013-01-01,3,41
4,2013-01-01,5,24


In [20]:
train_target.head()

0     5
1    19
2    37
3    14
4    26
Name: sales, dtype: int64

## Create model

In [21]:
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

# One regressor instance with a fixed seed; it is reused below for the
# baseline fit, the Upgini metric calculation and the enriched fit.
model = CatBoostRegressor(
    random_state=42,
    verbose=False,
    allow_writing_files=False,
)

In [22]:
# Baseline: train on the original date/store/item columns only.
model.fit(train_features, train_target)

<catboost.core.CatBoostRegressor at 0x26673707400>

In [23]:
# Score the baseline model on the held-out test rows using SMAPE.
predictions = model.predict(test_features)
eval_metric(test_target.to_numpy(), predictions, "SMAPE")

[37.38282082443815]

### Enriching our data

In [24]:
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

# Search for external features keyed on the `date` column, using
# time-series cross-validation.
enricher = FeaturesEnricher(search_keys={'date': SearchKey.DATE}, cv=CVType.time_series)



In [25]:
# Run the feature search on the training data; the test partition is passed
# as an evaluation set. The output below lists the relevant features found.
enricher.fit(train_features, train_target, eval_set=[(test_features, test_target)])

Detected task type: ModelTaskType.REGRESSION


  self.logger.info(f"First 10 rows of the eval_y_{idx} with shape {len(eval_y)}:\n{eval_y[:10]}")


Column name,Status,Description
target,All valid,All values in this column are good to go
date,All valid,All values in this column are good to go


Running search request with search_id=a6affd3d-f96c-4cb8-8a0e-d0179f68db0f
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

[92m[1m
12 relevant feature(s) found with the search keys: ['date'][0m


  display(self.features_info.head(60).style.hide_index())


provider,source,feature name,shap value,coverage %,type,feature type
,,item,0.402879,100.0,numerical,
Upgini,Public/Comm. shared,f_weather_date_weather_pca_0_d7e0a1fc,0.146411,100.0,numerical,Free
,,store,0.096979,100.0,numerical,
Upgini,Public/Comm. shared,f_events_date_week_sin1_847b5db1,0.069145,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_year_cos1_9014a856,0.04655,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_week_cos1_f6a8c1fc,0.019893,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_24_2e14c9a6,0.016572,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_33_89bb7578,0.015019,100.0,numerical,Free
Upgini,Public/Comm. shared,f_financial_date_silver_14e835ea,0.013486,100.0,numerical,Free
Upgini,Public/Comm. shared,f_financial_date_dow_jones_fe02128f,0.013376,100.0,numerical,Free


In [26]:
enricher.feature_names_

['f_weather_date_weather_pca_0_d7e0a1fc',
 'f_events_date_week_sin1_847b5db1',
 'f_events_date_year_cos1_9014a856',
 'f_events_date_week_cos1_f6a8c1fc',
 'f_weather_date_weather_umap_24_2e14c9a6',
 'f_weather_date_weather_umap_33_89bb7578',
 'f_financial_date_silver_14e835ea',
 'f_financial_date_dow_jones_fe02128f',
 'f_events_date_week_cos3_7525fe31',
 'f_weather_date_weather_umap_20_90e5e123',
 'f_weather_date_weather_umap_49_6c549e11',
 'f_economic_date_cpi_pca_5_789d07f3']

In [27]:
enricher.feature_importances_

[0.14641081329859218,
 0.06914527492193365,
 0.04654956815648062,
 0.019892790385438543,
 0.01657185287062552,
 0.015019345396624601,
 0.013485904788648176,
 0.013376439227740278,
 0.006764713347446169,
 0.004168342669110124,
 0.003941445812618263,
 0.0034692744296017305]

In [28]:
# Print every enrichment feature next to its importance, one pair per line.
for feature_name, importance in zip(enricher.feature_names_, enricher.feature_importances_):
    print(feature_name, importance, sep='\t\t')

f_weather_date_weather_pca_0_d7e0a1fc		0.14641081329859218
f_events_date_week_sin1_847b5db1		0.06914527492193365
f_events_date_year_cos1_9014a856		0.04654956815648062
f_events_date_week_cos1_f6a8c1fc		0.019892790385438543
f_weather_date_weather_umap_24_2e14c9a6		0.01657185287062552
f_weather_date_weather_umap_33_89bb7578		0.015019345396624601
f_financial_date_silver_14e835ea		0.013485904788648176
f_financial_date_dow_jones_fe02128f		0.013376439227740278
f_events_date_week_cos3_7525fe31		0.006764713347446169
f_weather_date_weather_umap_20_90e5e123		0.004168342669110124
f_weather_date_weather_umap_49_6c549e11		0.003941445812618263
f_economic_date_cpi_pca_5_789d07f3		0.0034692744296017305


In [32]:
# Trim the feature report by dropping its descriptive metadata columns.
unused_columns = ['provider', 'source', 'coverage %', 'type', 'feature type']
result_df = enricher.get_features_info().drop(columns=unused_columns)
result_df

Unnamed: 0,feature name,shap value
0,item,0.402879
1,f_weather_date_weather_pca_0_d7e0a1fc,0.146411
2,store,0.096979
3,f_events_date_week_sin1_847b5db1,0.069145
4,f_events_date_year_cos1_9014a856,0.04655
5,f_events_date_week_cos1_f6a8c1fc,0.019893
6,f_weather_date_weather_umap_24_2e14c9a6,0.016572
7,f_weather_date_weather_umap_33_89bb7578,0.015019
8,f_financial_date_silver_14e835ea,0.013486
9,f_financial_date_dow_jones_fe02128f,0.013376


In [33]:
# Compare baseline vs. enriched MAPE for the same CatBoost model on both the
# training data and the evaluation set.
enricher.calculate_metrics(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
    estimator=model,
    scoring='mean_absolute_percentage_error',
)

Calculating metrics...
-

  self.logger.info(f"First 10 rows of the eval_y_{idx} with shape {len(eval_y)}:\n{eval_y[:10]}")


Done


Unnamed: 0,match_rate,baseline mean_absolute_percentage_error,enriched mean_absolute_percentage_error,uplift
,,,,
train,100.0,0.250501,0.175563,0.074937
eval 1,100.0,0.264574,0.189014,0.075559


In [34]:
# Preview the enriched training frame; with keep_input=True the original
# date/store/item columns appear alongside the new features.
# NOTE(review): this call issues its own search request (see the output), and
# the next cell transforms train_features again — consider reusing one result.
enricher.transform(train_features, keep_input=True).head()



Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=f7bc992e-e421-4c34-9d2c-5af59bb2027a
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


Unnamed: 0,date,store,item,f_weather_date_weather_pca_0_d7e0a1fc,f_events_date_week_sin1_847b5db1,f_events_date_year_cos1_9014a856,f_events_date_week_cos1_f6a8c1fc,f_weather_date_weather_umap_24_2e14c9a6,f_weather_date_weather_umap_33_89bb7578,f_financial_date_silver_14e835ea,f_financial_date_dow_jones_fe02128f,f_events_date_week_cos3_7525fe31,f_weather_date_weather_umap_20_90e5e123,f_weather_date_weather_umap_49_6c549e11,f_economic_date_cpi_pca_5_789d07f3
0,2013-01-01,7,5,29.676683,0.781831,0.98522,0.62349,5.828106,4.644803,30.173,13104.139648,-0.900969,4.339428,4.654517,-8.943169
1,2013-01-01,4,9,29.676683,0.781831,0.98522,0.62349,5.828106,4.644803,30.173,13104.139648,-0.900969,4.339428,4.654517,-8.943169
2,2013-01-01,1,33,29.676683,0.781831,0.98522,0.62349,5.828106,4.644803,30.173,13104.139648,-0.900969,4.339428,4.654517,-8.943169
3,2013-01-01,3,41,29.676683,0.781831,0.98522,0.62349,5.828106,4.644803,30.173,13104.139648,-0.900969,4.339428,4.654517,-8.943169
4,2013-01-01,5,24,29.676683,0.781831,0.98522,0.62349,5.828106,4.644803,30.173,13104.139648,-0.900969,4.339428,4.654517,-8.943169


In [35]:
# Enrich both partitions the same way (train first, then test), keeping the
# original input columns next to the added features.
enriched_train_features, enriched_test_features = (
    enricher.transform(features, keep_input=True)
    for features in (train_features, test_features)
)



Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=f3fed247-3350-4b53-817a-07bab31bb1bc
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=cfc4b976-bf85-41d8-a9db-4e14875d5a1e
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


### Predicting with our enriched data

In [36]:
# Refit the same regressor on the enriched training features, predict on the
# enriched test features, and score with SMAPE for comparison with the baseline.
predictions = model.fit(enriched_train_features, train_target).predict(enriched_test_features)
eval_metric(test_target.to_numpy(), predictions, 'SMAPE')

[14.174379469429864]

SMAPE before enrichment (original features only):
37.38282082443815
SMAPE after enrichment (with the added Upgini features):
14.174379469429864