# 1. Import libraries

In [1]:
# data manipulation
import pandas as pd

# pipeline
from sklearn.pipeline import Pipeline

# transformers
from sklearn.impute import SimpleImputer

# model
from sklearn.tree import DecisionTreeRegressor

# save model
from joblib import dump

# Silence every warning for the rest of the session (no category filter is given).
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# 2. Load data

In [2]:
# Read the prepared CSV, then derive both columns in a single, non-mutating step:
#   * 'Date' is parsed from 'YYYY-MM-DD' strings into timestamps;
#   * 'Volume_ma_7' is the mean of the 7 preceding rows of 'Volume'
#     (shift(1) excludes the current row, so the feature never sees its own target).
dataset = pd.read_csv('../data/prepared_data.csv')
dataset = dataset.assign(
    Date=pd.to_datetime(dataset['Date'], format='%Y-%m-%d'),
    Volume_ma_7=dataset['Volume'].shift(1).rolling(7).mean(),
)

# Drop every row that contains a missing value in any column (this includes the
# first rows, whose rolling window is incomplete) and renumber the index from 0.
dataset = dataset.dropna().reset_index(drop=True)

dataset

Unnamed: 0,Date,Volume,Rain,Temp,Volume_ma_7
0,2016-11-24,1962.0,0.0,1.1,1239.000000
1,2016-11-25,1638.0,1.0,3.1,1386.000000
2,2016-11-26,384.0,0.0,5.4,1449.000000
3,2016-11-27,396.0,0.0,2.6,1469.142857
4,2016-11-28,2175.0,0.0,-1.3,1492.285714
...,...,...,...,...,...
1261,2020-05-16,2484.0,0.0,6.6,4507.285714
1262,2020-05-17,1509.0,0.0,9.1,4336.285714
1263,2020-05-18,5775.0,0.0,9.3,4363.285714
1264,2020-05-19,6741.0,0.0,9.7,4430.571429


In [3]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         1266 non-null   datetime64[ns]
 1   Volume       1266 non-null   float64       
 2   Rain         1266 non-null   float64       
 3   Temp         1266 non-null   float64       
 4   Volume_ma_7  1266 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 49.6 KB


# 3. Train Test Split

In [4]:
# Chronological hold-out split: rows dated before `first_test_date` are used for
# training, rows on or after it are kept for testing.
# pd.Timestamp is used instead of the `pd.datetime` alias, which was deprecated in
# pandas 1.0 and removed in pandas 2.0 (where it raises AttributeError).
first_test_date = pd.Timestamp('2020-01-01')

# Boolean mask computed once so the two partitions are guaranteed to be complementary.
is_train_row = dataset['Date'] < first_test_date

# The only predictor is the 'Volume_ma_7' column; the target is 'Volume'.
train_dataset = dataset.loc[is_train_row, :].copy()
X_train, y_train = train_dataset[['Volume_ma_7']].copy(), train_dataset['Volume'].copy()

test_dataset = dataset.loc[~is_train_row, :].copy()
X_test, y_test = test_dataset[['Volume_ma_7']].copy(), test_dataset['Volume'].copy()

# 4. Feature Engineering

In [5]:
# Preprocessing stage: a one-step pipeline that fills missing feature values
# using SimpleImputer with its default settings.
feature_engineering = Pipeline(steps=[('missing_imputer', SimpleImputer())])

# 5. Modelling

In [6]:
# Regression tree limited to depth 7, with at least 20 training samples per leaf.
tree_regressor = DecisionTreeRegressor(max_depth=7, min_samples_leaf=20)

# Full estimator: the preprocessing pipeline followed by the tree.
model_pipeline = Pipeline(steps=[
    ('feature_engineering', feature_engineering),
    ('model', tree_regressor),
])

In [7]:
# Fit the whole pipeline (preprocessing + tree) on the training partition only.
model_pipeline.fit(X_train, y_train)

In [8]:
# Forecast: run the fitted pipeline on the held-out features and display the predictions.
model_pipeline.predict(X_test)

array([ 497.04      ,  497.04      ,  808.02985075,  808.02985075,
        808.02985075,  808.02985075, 1066.78125   , 1336.96875   ,
       2054.35714286, 2564.86363636, 2564.86363636, 2564.86363636,
       2564.86363636, 2564.86363636, 2564.86363636, 2054.35714286,
       2054.35714286, 2054.35714286, 2054.35714286, 2054.35714286,
       1626.96774194, 2054.35714286, 2054.35714286, 2054.35714286,
       2564.86363636, 2564.86363636, 2564.86363636, 2564.86363636,
       2054.35714286, 2054.35714286, 2054.35714286, 1626.96774194,
       1626.96774194, 1626.96774194, 1626.96774194, 1626.96774194,
       1626.96774194, 1626.96774194, 1626.96774194, 1626.96774194,
       1626.96774194, 1626.96774194, 1626.96774194, 2054.35714286,
       2054.35714286, 2564.86363636, 2564.86363636, 2564.86363636,
       2564.86363636, 2054.35714286, 2054.35714286, 1626.96774194,
       1336.96875   , 1336.96875   , 1626.96774194, 2054.35714286,
       2054.35714286, 2054.35714286, 2564.86363636, 2564.86363

# 6. Save Output

In [9]:
# Write the held-out features and target as CSV files without the index column.
for frame, target_path in (
    (X_test, "../Dash_app/assets/X_test.csv"),
    (y_test, "../Dash_app/assets/y_test.csv"),
):
    frame.to_csv(target_path, index=False)

# Serialize the fitted pipeline with joblib; kept as the last expression so the
# list of written file names is shown as the cell output.
dump(model_pipeline, filename='../Dash_app/assets/fitted_model.pkl')

['../Dash_app/assets/fitted_model.pkl']