In [None]:
## Install libraries
# Install the two third-party packages used by this notebook: upgini and catboost.
# pip flags: -U upgrades to the newest available release, -q reduces pip's output.
# NOTE(review): versions are not pinned, so results may differ between runs — consider pinning.
%pip install -Uq upgini catboost

from os.path import exists
import pandas as pd

In [None]:
## Get data
# Use the archive from the working directory when it is present; otherwise
# read the same file straight from the Upgini GitHub repository.
local_archive = "train.csv.zip"
remote_archive = "https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip"
df_path = local_archive if exists(local_archive) else remote_archive
df = pd.read_csv(df_path)


## Data Cleaning and Transformations
# Cast the store/item identifier columns to strings.
for id_column in ("store", "item"):
    df[id_column] = df[id_column].astype(str)

# Parse the date column into pandas datetimes so rows can be ordered and
# compared by date.
df["date"] = pd.to_datetime(df["date"])

# Order rows by date and rebuild a clean 0..n-1 index.
df = df.sort_values("date").reset_index(drop=True)

# Preview the first rows of the cleaned frame.
df.head()


In [None]:
## Split data into training and testing sets

# Number of rows to keep and the date that separates train from test.
SAMPLE_SIZE = 1000
SPLIT_DATE = "2017-01-01"

# Take a random sample of the data exactly once, *before* splitting, so that
# train and test are both carved out of the same sample. The fixed seed makes
# the draw reproducible.
# NOTE: this rebinds `df` to the sample; the full frame is no longer reachable
# under this name after the cell has run.
df = df.sample(n=SAMPLE_SIZE, random_state=0)

# Training dataset - sampled rows dated before SPLIT_DATE
train = df[df["date"] < SPLIT_DATE]

# Testing dataset - sampled rows dated on or after SPLIT_DATE
test = df[df["date"] >= SPLIT_DATE]

# Verify that both partitions contain enough rows
print(f"Train rows: {len(train)}, Test rows: {len(test)}")

# Split datasets into features and target ("sales" is the target column)
train_features = train.drop(columns=["sales"])
train_target = train["sales"]

test_features = test.drop(columns=["sales"])
test_target = test["sales"]


In [None]:
## Enrich features to account for other relevant variables

from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

# Declare the "date" column as the date search key for the enrichment search.
enrichment_search_keys = {"date": SearchKey.DATE}

# Cross-validation type passed to the enricher is CVType.time_series.
enricher = FeaturesEnricher(search_keys=enrichment_search_keys, cv=CVType.time_series)

# The 2017 hold-out is supplied as the evaluation set alongside the training data.
holdout_eval_set = [(test_features, test_target)]
enricher.fit(train_features, train_target, eval_set=holdout_eval_set)

In [None]:
## Define model and compare metrics with / without enrichment
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

# Quiet CatBoost regressor with a fixed seed that writes no files to disk.
model = CatBoostRegressor(
    verbose=False,
    allow_writing_files=False,
    random_state=0,
)

# Ask the fitted enricher for metrics computed with this estimator, scored by
# mean absolute percentage error, on the training data and the hold-out set.
metrics_eval_set = [(test_features, test_target)]
enricher.calculate_metrics(
    train_features,
    train_target,
    eval_set=metrics_eval_set,
    estimator=model,
    scoring="mean_absolute_percentage_error",
)

In [None]:
## Build enriched feature sets
# Run the fitted enricher over the train and test features (in that order),
# keeping the input columns in the output via keep_input=True.
enriched_train_features, enriched_test_features = (
    enricher.transform(feature_frame, y=target_series, keep_input=True)
    for feature_frame, target_series in (
        (train_features, train_target),
        (test_features, test_target),
    )
)

# Preview the enriched training features.
enriched_train_features.head()

In [None]:
## Model without enrichment
# Fit on the original (non-enriched) features and predict the hold-out set;
# CatBoost's fit() returns the model itself, so the calls can be chained.
preds = model.fit(train_features, train_target).predict(test_features)

# SMAPE of the baseline predictions against the true hold-out sales.
eval_metric(test_target.values, preds, "SMAPE")

In [None]:
## Model with enrichment
# Refit the same regressor, this time on the enriched training features.
model.fit(X=enriched_train_features, y=train_target)

# Predict the hold-out set from its enriched features.
enriched_preds = model.predict(enriched_test_features)

# SMAPE of the enriched-model predictions against the true hold-out sales.
actual_sales = test_target.values
eval_metric(actual_sales, enriched_preds, "SMAPE")