# Predictions

In [1]:
import sys

sys.path.append('..')

In [2]:
import polars as pl
from typing import Dict, Any
from src.dataset import DatasetGenerator
from src.model import RandomForestPredictor, LgbmPredictor
from tqdm import tqdm

In [3]:
generator = DatasetGenerator('../data/df.csv')

### RandomForest model

In [4]:
(X_train, y_train), (X_test, y_test) = generator.split_train_test()
predictor = RandomForestPredictor('../checkpoints/rf.pkl')

Firstly, let's train RandomForestPredictor

In [65]:
predictor.train(X_train, y_train)

Model already trained. Skipping...


In [7]:
predictor.save('../checkpoints/rf.pkl')

Overriding /Users/lezhocheck/Documents/projects/sales-prediction/checkpoints/rf.pkl


In [8]:
predictor.eval(X_train, y_train, apply_cv=True)

KeyboardInterrupt: 

In [12]:
predictor.eval(X_test, y_test, apply_cv=False)

{'test_neg_root_mean_squared_error': -39.12015415039393,
 'test_neg_mean_absolute_error': -3.873999831861759,
 'test_explained_variance': 0.9559878080197036}

In [11]:
predicted = predictor.predict(X_test)

X_test.with_columns(predicted)

category_id,sku_id,sales_price,year,month,day,weekday,is_ukrainian_holiday,sma_5_sales_price,sma_5_sales_quantity,cluster_id,prediction
i64,i64,f64,i32,i8,i8,i8,bool,f64,f64,i32,f64
7,484587,167.7,2019,1,6,7,false,290.545836,90.8,2,2.141637
7,459599,323.7,2020,10,25,7,false,339.837083,137.0,2,1.190591
17,318834,21.84,2017,1,12,4,false,28.95534,462.8,4,2060.10042
17,8222,41.47,2018,10,10,3,false,44.480303,618.8,4,2014.037846
17,531313,30.55,2018,1,5,5,false,37.332686,492.6,1,5.438952
…,…,…,…,…,…,…,…,…,…,…,…
17,702116,31.395,2018,5,23,3,false,41.662624,561.0,4,1998.869389
17,8219,47.06,2018,4,27,5,false,40.670178,541.8,1,4.072234
17,605321,33.8,2019,2,10,7,false,48.657164,562.6,0,3.08384
17,8228,77.922,2020,8,11,2,false,66.049975,578.0,0,2.726241


In [18]:
test = DatasetGenerator(
    '../data/df_future.csv', 
    generate_lags=False,
    remove_nulls=False,
    generate_clusters=False,
    generate_sma=False
)
x, _ = test.get_full_train_data()
x = x.with_columns(pl.date(pl.col('year'), pl.col('month'), pl.col('day')))
x

category_id,sku_id,sales_price,year,month,day,weekday,is_ukrainian_holiday,date
i64,i64,f64,i32,i8,i8,i8,bool,date
7,695424,1493.7,2020,11,7,6,false,2020-11-07
7,746872,187.2,2020,11,11,3,false,2020-11-11
7,440748,258.7,2020,11,9,1,false,2020-11-09
7,752709,388.7,2020,11,4,3,false,2020-11-04
7,630972,973.7,2020,11,10,2,false,2020-11-10
…,…,…,…,…,…,…,…,…
7,621633,707.2,2020,11,3,2,false,2020-11-03
7,674270,362.7,2020,11,2,1,false,2020-11-02
7,484594,116.22,2020,11,7,6,false,2020-11-07
17,801559,62.322,2020,11,7,6,false,2020-11-07


- We need somehow to calculate lags and other rolling features for the test dataset. 
- Idea is to simmulate realtime stream and get those values one by one. Also use model prediction as real quantity (it is not the best solution, but for the purpose of this research would be ok).
- This current solution implementation is too slow and need to be improved
- In production environment real time data are always available, so the calculation of lags and other similar features would be easy and fast

In [22]:
testing_set = X_test.select('category_id', 'sku_id', 'sales_price', 'year', 'month', 'day', 'weekday', 'is_ukrainian_holiday').with_columns(
    pl.date(pl.col('year'), pl.col('month'), pl.col('day'))
)

In [23]:
t_pool, y = generator.get_full_train_data()
t_pool = t_pool.with_columns(y)
t_pool = t_pool.with_columns(pl.date(pl.col('year'), pl.col('month'), pl.col('day')))

def augment_raw_sample(sample: Dict[str, Any]) -> Dict[str, Any]:
    closest_rows = t_pool.group_by(['sku_id', 'date']).agg(
        pl.col('sales_price').mean(),
        pl.col('sales_quantity').sum()
    ).filter(
        pl.col('sku_id') == sample['sku_id'],
        pl.col('date') <= sample['date']
    ).sort('date', descending=True).head(DatasetGenerator.NUM_LAGS).select(
        'sales_price', 'sales_quantity'
    )
    for idx in range(1, 4):
        sample[f'lag_{idx}_sales_price'] = None
        sample[f'lag_{idx}_sales_quantity'] = None

    for idx, (sales_price, sales_quantity) in enumerate(closest_rows.iter_rows()):
        sample[f'lag_{idx+1}_sales_price'] = sales_price
        sample[f'lag_{idx+1}_sales_quantity'] = sales_quantity
    
    # same for category for sma5 calculation
    closest_rows = t_pool.group_by(['category_id', 'date']).agg(
        pl.col('sales_price').mean(),
        pl.col('sales_quantity').sum()
    ).filter(
        pl.col('category_id') == sample['category_id'],
        pl.col('date') <= sample['date']
    ).sort('date', descending=True).head(DatasetGenerator.WINDOW_SIZE).select(
        'sales_price', 'sales_quantity'
    )
    sample['sma_5_sales_price'] = closest_rows['sales_price'].mean()
    sample['sma_5_sales_quantity'] = closest_rows['sales_quantity'].mean()
    if None in sample.values():
        sample['cluster_id'] = None
    else:
        features = pl.DataFrame(sample).select(generator.cluster_generator.feature_names_in_)
        cluster = generator.cluster_generator.predict(features)
        sample['cluster_id'] = cluster
    return sample


In [24]:
for sku_id, group in tqdm(testing_set.group_by(['sku_id'])):
    for sample in group.sort(by='date').iter_rows():
        sample = dict(zip(group.columns, sample))
        sample = augment_raw_sample(sample)
        quantity = predictor.predict_single_sample(sample)
        sample['sales_quantity'] = quantity
        row = pl.DataFrame([sample]).with_columns(
            pl.col('cluster_id').cast(pl.Int32),
            pl.col('year').cast(pl.Int32),
            pl.col('month').cast(pl.Int8),
            pl.col('day').cast(pl.Int8),
            pl.col('weekday').cast(pl.Int8)
        ).select(t_pool.columns)
        t_pool = t_pool.vstack(row)

906it [51:52,  3.44s/it]


In [51]:
testing_set

category_id,sku_id,sales_price,year,month,day,weekday,is_ukrainian_holiday,date
i64,i64,f64,i32,i8,i8,i8,bool,date
17,8221,46.8,2019,3,5,2,false,2019-03-05
7,484581,123.487,2019,10,30,3,false,2019-10-30
17,750483,60.45,2019,12,13,5,false,2019-12-13
17,24119,49.335,2018,11,8,4,false,2018-11-08
17,32542,41.6,2017,11,23,4,false,2017-11-23
…,…,…,…,…,…,…,…,…
7,654449,180.7,2020,5,16,6,false,2020-05-16
17,589895,34.775,2018,2,13,2,false,2018-02-13
17,824499,53.3,2020,9,8,2,false,2020-09-08
17,798076,44.85,2020,3,5,4,false,2020-03-05


In [56]:
values = X_test.with_columns(y_test).select(
    'category_id', 'sku_id', 'sales_price', 'year', 'month', 'day', 'weekday', 'is_ukrainian_holiday',
    'sales_quantity'
)

In [60]:
tx, ty = generator.get_full_train_data()
tx = tx.with_columns(ty).with_columns(pl.date(pl.col('year'), pl.col('month'), pl.col('day')))

result = t_pool.join(tx, on=t_pool.columns, how="anti").rename({
    'sales_quantity': 'predicted'
}).join(values, on=['category_id', 'sku_id', 'sales_price', 'year', 'month', 'day', 'weekday', 'is_ukrainian_holiday'])

In [64]:
from sklearn.metrics import (
    mean_absolute_error,
    explained_variance_score,
    root_mean_squared_error
)


mean_absolute_error(result['sales_quantity'], result['predicted'])
explained_variance_score(result['sales_quantity'], result['predicted'])

0.9238967099981493

### Lgbm model

In [14]:
predictor = LgbmPredictor()
predictor.train(X_train, y_train)

Model already trained. Skipping...
