In [1]:
import pandas as pd

# Load the raw sales records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='sales_data.csv')

In [2]:
# Peek at the first rows to confirm the CSV loaded with the expected columns.
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [3]:
# (rows, columns) of the full dataset, before any sampling.
df.shape

(913000, 4)

In [4]:
# Inspect the column dtypes exactly as read from the CSV.
df.dtypes

date     object
store     int64
item      int64
sales     int64
dtype: object

In [5]:
# Parse the `date` column into pandas datetimes.
# Bracket indexing is used on purpose: attribute-style assignment (`df.date = ...`)
# only works when the column already exists and otherwise sets a plain Python
# attribute on the DataFrame instead of creating/updating a column.
df['date'] = pd.to_datetime(df['date'])

In [6]:
# Re-check dtypes to confirm the `date` conversion took effect.
df.dtypes

date     datetime64[ns]
store             int64
item              int64
sales             int64
dtype: object

In [7]:
# Preview the frame again after the `date` conversion.
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [8]:
# Down-sample to a fixed number of rows; random_state makes the draw repeatable.
# NOTE: this rebinds `df`, so the full frame must be reloaded before re-running.
SAMPLE_SIZE = 89_000
df = df.sample(n=SAMPLE_SIZE, random_state=0)

In [9]:
# Verify the row count after sampling.
df.shape

(89000, 4)

In [10]:
# Preview the sampled rows; they are in random order and keep their original index labels.
df.head()

Unnamed: 0,date,store,item,sales
335813,2017-07-14,4,19,56
630838,2015-05-19,6,35,45
365685,2014-05-01,1,21,48
322781,2016-11-06,7,18,85
151590,2013-02-02,4,9,46


In [11]:
# Put the sampled rows in chronological order.
df = df.sort_values(by='date')

In [12]:
# Preview after sorting by date; index labels are still the pre-sort ones.
df.head()

Unnamed: 0,date,store,item,sales
83996,2013-01-01,7,5,5
381634,2013-01-01,10,21,33
377982,2013-01-01,8,21,21
368852,2013-01-01,3,21,25
668316,2013-01-01,7,37,11


In [13]:
# Replace the old index labels with a fresh 0..n-1 range, discarding the old labels.
df = df.reset_index(drop=True)

In [14]:
# Preview after the index reset.
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,10,21,33
2,2013-01-01,8,21,21
3,2013-01-01,3,21,25
4,2013-01-01,7,37,11


## Split data into training and testing

In [15]:
# Chronological hold-out: rows dated before the cutoff are used for training,
# rows on or after it for testing.
cutoff = '2017-01-01'
dates = df['date']
train = df[dates < cutoff]
test = df[dates >= cutoff]

In [16]:
# Size of the training partition (dates before 2017-01-01).
train.shape

(71051, 4)

In [17]:
# Size of the test partition (dates on or after 2017-01-01).
test.shape

(17949, 4)

In [18]:
# Separate the `sales` label from the predictor columns in each partition.
train_target = train['sales']
train_features = train.drop(columns='sales')

test_target = test['sales']
test_features = test.drop(columns='sales')

In [19]:
# Preview the training predictors (the `sales` column has been dropped).
train_features.head()

Unnamed: 0,date,store,item
0,2013-01-01,7,5
1,2013-01-01,10,21
2,2013-01-01,8,21
3,2013-01-01,3,21
4,2013-01-01,7,37


In [20]:
# Preview the training target (`sales`).
train_target.head()

0     5
1    33
2    21
3    25
4    11
Name: sales, dtype: int64

## Create model

In [21]:
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

# Regressor used for both the baseline and the enriched runs:
# fixed seed, no training log output, no files written by CatBoost.
model = CatBoostRegressor(
    random_state=42,
    verbose=False,
    allow_writing_files=False,
)

In [22]:
# Baseline: fit on the original (un-enriched) predictors.
model.fit(X=train_features, y=train_target)

<catboost.core.CatBoostRegressor at 0x1ac0b305c60>

In [23]:
# Score the baseline model on the hold-out period with SMAPE.
predictions = model.predict(test_features)
eval_metric(label=test_target.values, approx=predictions, metric="SMAPE")

[37.8901442213532]

### Enriching our data

In [24]:
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

# Configure the Upgini search: `date` is the only search key, and
# time-series cross-validation is requested.
date_search_keys = {'date': SearchKey.DATE}
enricher = FeaturesEnricher(search_keys=date_search_keys, cv=CVType.time_series)

In [25]:
# Run the feature search on the training data, passing the hold-out period
# as the evaluation set.
enricher.fit(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
)

  self.logger.info(f"First 10 rows of the eval_y_{idx} with shape {len(eval_y)}:\n{eval_y[:10]}")


Detected task type: ModelTaskType.REGRESSION


Column name,Status,Description
target,All valid,All values in this column are good to go
date,All valid,All values in this column are good to go


Running search request with search_id=2e0eed9c-ef57-4716-a570-ba226fcce387
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

[92m[1m
7 relevant feature(s) found with the search keys: ['date'][0m


  display(self.features_info.head(60).style.hide_index())


provider,source,feature name,shap value,coverage %,type,feature type
,,item,0.422035,100.0,numerical,
Upgini,Public/Comm. shared,f_weather_date_weather_pca_0_d7e0a1fc,0.284613,100.0,numerical,Free
,,store,0.079622,100.0,numerical,
Upgini,Public/Comm. shared,f_events_date_week_sin1_847b5db1,0.048605,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_year_cos1_9014a856,0.043097,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_28_b90870a1,0.038404,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_31_fa6d9a99,0.029407,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_week_cos1_f6a8c1fc,0.009782,100.0,numerical,Free
Upgini,Public/Comm. shared,f_financial_date_dow_jones_fe02128f,0.005034,100.0,numerical,Free


In [26]:
# Names of the external features the enricher selected.
enricher.feature_names_

['f_weather_date_weather_pca_0_d7e0a1fc',
 'f_events_date_week_sin1_847b5db1',
 'f_events_date_year_cos1_9014a856',
 'f_weather_date_weather_umap_28_b90870a1',
 'f_weather_date_weather_umap_31_fa6d9a99',
 'f_events_date_week_cos1_f6a8c1fc',
 'f_financial_date_dow_jones_fe02128f']

In [27]:
# Importance values of the selected features; they appear to be SHAP values
# listed in the same order as `feature_names_` (compare with the search report).
enricher.feature_importances_

[0.28461332583112026,
 0.048605017281119045,
 0.04309735954971387,
 0.03840362359007534,
 0.02940653475683712,
 0.009781822387042656,
 0.00503417722492723]

In [28]:
# Print each selected feature next to its importance, separated by two tabs.
names_and_importances = zip(enricher.feature_names_, enricher.feature_importances_)
for feature_name, importance in names_and_importances:
    print(f'{feature_name}\t\t{importance}')

f_weather_date_weather_pca_0_d7e0a1fc		0.28461332583112026
f_events_date_week_sin1_847b5db1		0.048605017281119045
f_events_date_year_cos1_9014a856		0.04309735954971387
f_weather_date_weather_umap_28_b90870a1		0.03840362359007534
f_weather_date_weather_umap_31_fa6d9a99		0.02940653475683712
f_events_date_week_cos1_f6a8c1fc		0.009781822387042656
f_financial_date_dow_jones_fe02128f		0.00503417722492723


In [29]:
# Trim the search report to its remaining columns by dropping the
# provider/coverage/type metadata.
metadata_columns = ['provider', 'source', 'coverage %', 'type', 'feature type']
features_info = enricher.get_features_info()
result_df = features_info.drop(columns=metadata_columns)
result_df

Unnamed: 0,feature name,shap value
0,item,0.422035
1,f_weather_date_weather_pca_0_d7e0a1fc,0.284613
2,store,0.079622
3,f_events_date_week_sin1_847b5db1,0.048605
4,f_events_date_year_cos1_9014a856,0.043097
5,f_weather_date_weather_umap_28_b90870a1,0.038404
6,f_weather_date_weather_umap_31_fa6d9a99,0.029407
7,f_events_date_week_cos1_f6a8c1fc,0.009782
8,f_financial_date_dow_jones_fe02128f,0.005034


In [30]:
# Compare baseline vs enriched feature sets with `model` as the estimator,
# scored by mean absolute percentage error on the train and hold-out data.
enricher.calculate_metrics(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
    scoring='mean_absolute_percentage_error',
    estimator=model,
)

Calculating metrics...
-

  self.logger.info(f"First 10 rows of the eval_y_{idx} with shape {len(eval_y)}:\n{eval_y[:10]}")


Done


Unnamed: 0,match_rate,baseline mean_absolute_percentage_error,enriched mean_absolute_percentage_error,uplift
,,,,
train,100.0,0.24306,0.15444,0.08862
eval 1,100.0,0.257624,0.186904,0.07072


In [31]:
# Preview the enriched training predictors; keep_input=True keeps the original
# columns alongside the newly added ones.
enriched_preview = enricher.transform(train_features, keep_input=True)
enriched_preview.head()



Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=24be50f6-e20e-4052-b8b8-a4809a55b3d8
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


Unnamed: 0,date,store,item,f_weather_date_weather_pca_0_d7e0a1fc,f_events_date_week_sin1_847b5db1,f_events_date_year_cos1_9014a856,f_weather_date_weather_umap_28_b90870a1,f_weather_date_weather_umap_31_fa6d9a99,f_events_date_week_cos1_f6a8c1fc,f_financial_date_dow_jones_fe02128f
0,2013-01-01,7,5,29.676683,0.781831,0.98522,4.934963,4.712653,0.62349,13104.139648
1,2013-01-01,10,21,29.676683,0.781831,0.98522,4.934963,4.712653,0.62349,13104.139648
2,2013-01-01,8,21,29.676683,0.781831,0.98522,4.934963,4.712653,0.62349,13104.139648
3,2013-01-01,3,21,29.676683,0.781831,0.98522,4.934963,4.712653,0.62349,13104.139648
4,2013-01-01,7,37,29.676683,0.781831,0.98522,4.934963,4.712653,0.62349,13104.139648


In [32]:
# Enrich both partitions with the selected external features, keeping the
# original input columns. Each call issues its own search request.
transform_options = {'keep_input': True}
enriched_train_features = enricher.transform(train_features, **transform_options)
enriched_test_features = enricher.transform(test_features, **transform_options)



Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=e480cd04-b1e4-4cb7-8f44-bf8e35334cfd
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=2ea8f1eb-15ea-4324-b989-af4a95628309
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


### Predicting with our enriched data

In [33]:
# Refit the same `model` object on the enriched predictors (this replaces the
# baseline fit) and score it on the hold-out period with SMAPE.
model.fit(X=enriched_train_features, y=train_target)
predictions = model.predict(enriched_test_features)
eval_metric(label=test_target.values, approx=predictions, metric='SMAPE')

[13.298375445833686]

Before enrichment (SMAPE)
37.8901442213532
After enrichment (SMAPE)
13.298375445833686