In [None]:
import configparser
import pandas as pd
import numpy as np
from numpy import array

import vertica_python
from vertica_python.errors import MissingRelation
import verticapy as vp
from verticapy import pandas_to_vertica, insert_into, drop

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.graphics.tsaplots import plot_pacf
import catboost as ctb
from prophet import Prophet
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

## Helper functions

In [None]:
# Default location of the INI file holding the Vertica credentials.
config_path = 'config.ini'
def get_VRT_cursor(path=config_path):
    """Open a Vertica connection described by an INI file.

    The file must contain a ``[VERTICA]`` section with the keys
    ``database``, ``user``, ``password``, ``host`` and ``port``; their values
    are passed unchanged as keyword arguments to ``vertica_python.connect``.

    Parameters
    ----------
    path : str
        Path of the INI file. Defaults to the module-level ``config_path``.

    Returns
    -------
    tuple
        ``(cursor, conn)``: a cursor created from the new connection, and the
        connection itself.

    Raises
    ------
    FileNotFoundError
        If the configuration file could not be read.
    KeyError
        If the ``[VERTICA]`` section or one of its keys is missing.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() skips unreadable files silently and returns the list
    # of files it actually parsed; without this check a wrong path would only
    # surface later as a confusing KeyError on 'VERTICA'.
    if not config.read(path):
        raise FileNotFoundError(
            f"Vertica config file not found or unreadable: {path!r}"
        )

    section = config['VERTICA']
    credentials = {
        key: section[key]
        for key in ('database', 'user', 'password', 'host', 'port')
    }

    conn = vertica_python.connect(**credentials)
    cursor = conn.cursor()
    return cursor, conn

def get_df_from_sql(cursor, sql='SELECT now(), version()'):
    """Execute ``sql`` on ``cursor`` and return the full result as a DataFrame.

    Column names are taken from the first element of every entry in
    ``cursor.description``; rows come from ``cursor.fetchall()``.
    """
    cursor.execute(sql)
    header = [column_info[0] for column_info in cursor.description]
    rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=header)

# Open the Vertica connection once (using the default config path); the cursor
# and connection are reused by the cells below.
VRT_cursor, conn = get_VRT_cursor()
# Hand the same connection object to VerticaPy.
vp.set_connection(conn)

In [None]:
def plot_periodogram(ts, detrend='linear', ax=None):
    """Draw the periodogram of ``ts`` as a step plot on a log-scaled x axis.

    The spectrum is computed with ``scipy.signal.periodogram`` using a boxcar
    window, ``scaling='spectrum'`` and a sampling frequency of 365, so the
    frequency axis is in cycles per 365 samples (the tick labels read it as
    cycles per year).

    Parameters
    ----------
    ts : array-like
        The series to analyse.
    detrend : str
        Forwarded to ``periodogram`` (default ``'linear'``).
    ax : matplotlib axes, optional
        Axes to draw on; a new figure/axes pair is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram

    samples_per_year = 365
    freqs, power = periodogram(
        ts,
        fs=samples_per_year,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )

    if ax is None:
        ax = plt.subplots()[1]

    ax.step(freqs, power, color="purple")
    ax.set_xscale("log")

    # Tick position -> label, in ascending frequency order.
    season_ticks = {
        12: "Monthly (12)",
        24: "Semimonthly (24)",
        52: "Weekly (52)",
        104: "Semiweekly (104)",
    }
    ax.set_xticks(list(season_ticks.keys()))
    ax.set_xticklabels(list(season_ticks.values()), rotation=30)

    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

## Get data

In [None]:
# Query text for the raw series: the `date` and `target_var` columns of table1.
# There is no ORDER BY, so the row order is whatever the database returns.
sql = """
select date,
     target_var  
from table1 
"""

In [None]:
# Run the query in `sql` on the shared cursor and load the result into `df`.
df = get_df_from_sql(cursor=VRT_cursor, sql=sql)

## Exploratory data analysis

### Seasonality

In [None]:
# Average target_var per date, replace missing daily means with the overall
# mean, and inspect the spectrum for seasonal peaks.
# NOTE(review): plot_periodogram uses fs=365, i.e. it presumably expects one
# observation per calendar day with no gaps - confirm against the data.
p = df[["date", "target_var"]].groupby('date').mean()
p = p.fillna(p.mean())
ax = plot_periodogram(p["target_var"])
# Fixed: the original line had an unbalanced ")" (SyntaxError) and misspelled
# "Periodogram". The trailing ";" suppresses the Text repr in the cell output.
ax.set_title("Periodogram target_var");

In [None]:
# Calendar features derived from `date`.
# errors='coerce' turns unparseable dates into NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['dayofmonth'] = df['date'].dt.day
# Days of month are 1-based, so subtract 1 before the integer division:
# days 1-7 -> week 0, 8-14 -> week 1, ..., 29-31 -> week 4.
# (Plain `day // 7` made week 0 only six days long and put day 7 in week 1.)
df['weekofmonth'] = (df['dayofmonth'] - 1) // 7
# pandas convention: 0 = Monday ... 6 = Sunday.
df['dayofweek'] = df['date'].dt.dayofweek
# Saturday (5) and Sunday (6).
df["is_weekend"] = df['dayofweek'] > 4

In [None]:
# Mean target_var per day of week as a bar chart.
# 0 - Monday, 6 - Sunday
# Fixed: the original expression had unbalanced parentheses and called
# `.plot` on the string literal "target_var" instead of on the grouped mean.
ax = (
    df.groupby("dayofweek")["target_var"]
    .mean()
    .plot.bar(rot=0, title="Seasonality by days of week")
)

### Cycles

In [None]:
def lagplot(x, y=None, lag=1, standardize=False, ax=None, **kwargs):
    """Scatter a series against a lagged series with a LOWESS fit.

    ``x`` is shifted by ``lag`` and plotted on the horizontal axis; the
    vertical axis shows ``y`` or, when ``y`` is None, the unshifted ``x``.
    The correlation between the two plotted series is printed in the upper
    left corner of the axes.

    Parameters
    ----------
    x : pandas.Series
        Series to be lagged.
    y : pandas.Series, optional
        Series to compare against the lagged ``x``.
    lag : int
        Number of periods passed to ``Series.shift``.
    standardize : bool
        When True, the lagged ``x`` (and ``y`` if given) is z-scored first.
    ax : matplotlib axes, optional
        Axes to draw on; created when omitted.
    **kwargs
        Forwarded to ``seaborn.regplot``.

    Returns
    -------
    The axes returned by ``seaborn.regplot``.
    """
    from matplotlib.offsetbox import AnchoredText

    def _zscore(series):
        return (series - series.mean()) / series.std()

    lagged = x.shift(lag)
    if standardize:
        lagged = _zscore(lagged)

    if y is None:
        response = x
    elif standardize:
        response = _zscore(y)
    else:
        response = y

    correlation = response.corr(lagged)

    if ax is None:
        _, ax = plt.subplots()

    ax = sns.regplot(
        x=lagged,
        y=response,
        scatter_kws={"alpha": 0.75, "s": 3},
        line_kws={"color": "C3"},
        lowess=True,
        ax=ax,
        **kwargs,
    )

    # Correlation badge in the corner of the panel.
    badge = AnchoredText(
        f"{correlation:.2f}",
        prop={"size": "large"},
        frameon=True,
        loc="upper left",
    )
    badge.patch.set_boxstyle("square, pad=0.0")
    ax.add_artist(badge)

    ax.set(title=f"Lag {lag}", xlabel=lagged.name, ylabel=response.name)
    return ax

def plot_lags(x, y=None, lags=6, nrows=1, lagplot_kwargs=None, **kwargs):
    """Draw a grid of lag plots for lags ``1..lags``.

    Parameters
    ----------
    x : pandas.Series
        Series to be lagged.
    y : pandas.Series, optional
        Series compared against the lagged ``x``; ``x`` itself when omitted.
    lags : int
        Number of lags to plot.
    nrows : int
        Number of rows in the grid; the column count defaults to
        ``ceil(lags / nrows)``.
    lagplot_kwargs : dict, optional
        Extra keyword arguments forwarded to every ``lagplot`` call.
        ``None`` (the default) means no extra arguments.
    **kwargs
        Forwarded to ``plt.subplots`` (``ncols`` and ``figsize`` get defaults
        when not supplied).

    Returns
    -------
    The matplotlib figure holding the grid.
    """
    import math

    # A None sentinel replaces the former mutable `{}` default.
    if lagplot_kwargs is None:
        lagplot_kwargs = {}

    kwargs.setdefault('nrows', nrows)
    kwargs.setdefault('ncols', math.ceil(lags / nrows))
    kwargs.setdefault('figsize', (kwargs['ncols'] * 2, nrows * 2 + 0.5))
    fig, axs = plt.subplots(sharex=True, sharey=True, squeeze=False, **kwargs)

    for lag, ax in enumerate(fig.get_axes(), start=1):
        if lag <= lags:
            ax = lagplot(x, y, lag=lag, ax=ax, **lagplot_kwargs)
            ax.set_title(f"Lag {lag}", fontdict=dict(fontsize=14))
            # Per-panel labels are cleared; shared labels are set below.
            ax.set(xlabel="", ylabel="")
        else:
            # Grid cells beyond the requested number of lags stay empty.
            ax.axis('off')

    plt.setp(axs[-1, :], xlabel=x.name)
    plt.setp(axs[:, 0], ylabel=y.name if y is not None else x.name)
    fig.tight_layout(w_pad=0.1, h_pad=0.1)
    return fig

In [None]:
# Partial autocorrelation of target_var for the first 12 lags.
# PACF depends on row order, and at this point `df` is still in the order the
# query returned it (no ORDER BY), so sort by date first. Missing values are
# filled with the series mean, the same imputation the data-preparation cell
# applies later; NaNs would otherwise propagate into the estimate.
# For the matching lag scatter plots use: plot_lags(<series>, lags=12, nrows=2)
plot_pacf(
    df.sort_values('date')['target_var'].pipe(lambda s: s.fillna(s.mean())),
    lags=12,
);

## Baseline modelling

### Data preparation

In [None]:
# Put the rows in chronological order (the rolling mean and the lag features
# below are order-dependent) and replace missing target values with the mean
# of the observed ones.
df = (
    df.sort_values("date")
    .assign(
        target_var=lambda frame: frame["target_var"].fillna(
            frame["target_var"].mean()
        )
    )
)

#### Define trend as moving average

In [None]:
# Trend estimate: centred 7-row moving average of target_var.
# center=True places each average at the middle of its window, so it also
# uses the rows that follow the current one; min_periods=3 lets the
# edges of the series be averaged over a shorter window instead of giving NaN.
moving_average = (
    df[["target_var"]]
    .rolling(window=7, center=True, min_periods=3)
    .mean()
    .rename(columns={"target_var": "target_var_trend"})
)

# assign() overwrites an existing `target_var_trend` column, so re-running this
# cell is safe; the previous pd.concat(axis=1) appended a duplicate column on
# every re-run.
df = df.assign(target_var_trend=moving_average["target_var_trend"])

In [None]:
def make_lags(df, target, lags):
    """Add lagged copies of column ``target`` to ``df`` and return ``df``.

    For every ``k`` in ``1..lags`` a column ``"<target>_lag_<k>"`` is written
    holding ``df[target]`` shifted down by ``k`` rows; the positions left empty
    by the shift are filled with 0. The frame is modified in place.
    """
    for step in range(1, lags + 1):
        lag_column = f"{target}_lag_{step}"
        df[lag_column] = df[target].shift(step).fillna(0)
    return df

# Lag features to build: {column name: number of lags}.
lags = {"target_var": 6}

for lag_target, lag_count in lags.items():
    df = make_lags(df, lag_target, lag_count)

### Trend

In [None]:
def compare_accuracy(test: pd.DataFrame,
                     prediction_label1: str,
                     prediction_label2: str,
                     target: str = "target_var"):
    """Print MAE and MSE of two forecasts against the observed target.

    Forecast columns are looked up as ``"<target>_<label>"`` (for example
    ``"target_var_catboost"``) and compared with the ``target`` column of
    ``test``. Output order: MAE for label 1, MAE for label 2, MSE for label 1,
    MSE for label 2, then a separator line. Values are rounded to 6 decimals.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame holding the observed target and both forecast columns.
    prediction_label1, prediction_label2 : str
        Suffixes identifying the two forecast columns.
    target : str
        Name of the observed column (default ``"target_var"``).

    Raises
    ------
    KeyError
        If the target or one of the forecast columns is missing from ``test``.
    """
    # The previous body did not parse (unbalanced print call) and still
    # referenced the undefined `payout_type` / `payout{}` columns left over
    # from an earlier naming scheme.
    metrics = (("MAE", mean_absolute_error), ("MSE", mean_squared_error))
    for metric_name, metric in metrics:
        for label in (prediction_label1, prediction_label2):
            score = metric(test[target], test["{}_{}".format(target, label)])
            print("{} {} {}:".format(metric_name, label, target), round(score, 6))
    print('*********************************')

In [None]:
def plot_trend(df: pd.DataFrame):
    """Plot the observed target, its moving-average trend and the production
    forecast against ``date`` on one large figure and show it.

    ``df`` must contain the columns ``date``, ``target_var``,
    ``target_var_trend`` and ``target_var_prod_predict``.
    """
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=(25, 15))

    # (column, line colour) in drawing order; the legend below relies on it.
    series_styles = (
        ("target_var", 'g'),
        ("target_var_trend", 'b'),
        ("target_var_prod_predict", 'r'),
    )
    for column, colour in series_styles:
        df.plot.line(x="date", y=column, color=colour, ax=axes, rot=0)

    axes.legend(
        [
            "target_var fact",
            "target_var moving average",
            "target_var prod forecast",
        ],
        fontsize=20,
    )
    plt.show()

## Advanced models

### CatBoost

In [None]:
def train_catboost_regression(train: pd.DataFrame,
                              test: pd.DataFrame,
                              features: list,
                              cat_features: list):
    """Fit CatBoost on the detrended target and forecast the test period.

    The model learns ``target_var - target_var_trend`` from the ``features``
    columns of ``train``. Its prediction for ``test`` is added back onto
    ``test['target_var_trend']`` to give the final forecast.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``target_var``, ``target_var_trend`` and ``features``.
    test : pandas.DataFrame
        Must contain ``target_var_trend`` and ``features``.
    features : list
        Names of the model input columns.
    cat_features : list
        Categorical feature specification passed to ``catboost.Pool``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test`` with a fresh 0..n-1 index and an added
        ``target_var_catboost`` column. The inputs are not modified.
    """
    feature_columns = list(features)

    # Build the design matrices directly instead of assigning into sliced
    # frames (which triggered chained-assignment warnings).
    X_train = train[feature_columns]
    y_train = train["target_var"] - train["target_var_trend"]
    X_test = test[feature_columns]

    pool_train = ctb.Pool(X_train, y_train, cat_features=cat_features)
    pool_test = ctb.Pool(X_test, cat_features=cat_features)

    model = ctb.CatBoostRegressor(
        n_estimators=200,
        loss_function='RMSE',
        learning_rate=0.05,
        depth=3,
        task_type='CPU',
        random_state=1,
        verbose=False)
    model.fit(pool_train)

    y_predict = model.predict(pool_test)

    # reset_index returns a new frame, so the caller's `test` stays untouched;
    # the prediction array is then added to the trend row by row.
    test = test.reset_index(drop=True)
    # Fixed: the original called .format(payout_type) on these column names,
    # raising NameError because `payout_type` is not defined.
    test['target_var_catboost'] = test['target_var_trend'] + y_predict
    return test

### Prophet

In [None]:
def train_prophet_regression(train: pd.DataFrame, test: pd.DataFrame):
    """Fit Prophet on the detrended target and forecast the test period.

    The model is trained on ``target_var - target_var_trend`` of ``train``
    (with ``date`` as Prophet's ``ds`` column). Its ``yhat`` for ``test`` is
    added back onto ``test['target_var_trend']``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``date``, ``target_var`` and ``target_var_trend``.
    test : pandas.DataFrame
        Must contain ``date``, ``target_var`` and ``target_var_trend``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test`` with a fresh 0..n-1 index and the added columns
        ``target_var_detrend``, ``ds`` and ``target_var_prophet``.
        Unlike the previous version, the caller's ``train`` and ``test``
        frames are left unmodified (consistent with
        ``train_catboost_regression``).
    """
    # Training frame in the column naming Prophet expects, built without
    # writing into `train`.
    history = pd.DataFrame({
        'y': train["target_var"] - train["target_var_trend"],
        'ds': train['date'],
    })

    # reset_index returns a new frame, so the assignments below do not leak
    # into the caller's `test`.
    test = test.reset_index(drop=True)
    test["target_var_detrend"] = test["target_var"] - test["target_var_trend"]
    test['ds'] = test['date']

    model = Prophet(interval_width=0.95)
    model.fit(history)

    y_predict = model.predict(test)
    # NOTE(review): this addition aligns on the 0..n-1 index, which presumes
    # the forecast rows come back in the same order as `test` (i.e. `test`
    # is already sorted by date) - confirm.
    test['target_var_prophet'] = test["target_var_trend"] + y_predict['yhat']

    return test

### LSTM

In [None]:
def train_lstm(train: pd.DataFrame, test: pd.DataFrame):
    """Fit a single-layer LSTM on the detrended target and forecast ``test``.

    The detrended series (``target_var - target_var_trend``) of ``train``
    followed by ``test`` is cut into sliding windows of ``len(train)``
    consecutive values, each paired with the value that follows it. This
    gives exactly ``len(test)`` windows whose targets are the detrended test
    values. The network's output for those windows is added back onto
    ``test['target_var_trend']``.

    NOTE(review): because the window length equals ``len(train)``, every
    training target is a test-period value, and the model is then asked to
    predict the same windows it was fitted on. The resulting forecast is
    therefore in-sample; confirm whether that is intended before comparing it
    with the other models. No random seed is set, so results may differ
    between runs.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``target_var`` and ``target_var_trend``.
    test : pandas.DataFrame
        Must contain ``target_var`` and ``target_var_trend``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test`` with a fresh 0..n-1 index and the added columns
        ``target_var_detrend`` and ``target_var_lstm``. Unlike the previous
        version, the caller's ``train`` and ``test`` frames are left
        unmodified (consistent with ``train_catboost_regression``).

    Raises
    ------
    ValueError
        If ``train`` or ``test`` has no rows.
    """
    if train.shape[0] == 0 or test.shape[0] == 0:
        # Previously this surfaced as an opaque IndexError/shape error.
        raise ValueError("train and test must each contain at least one row")

    train_detrended = train["target_var"] - train["target_var_trend"]

    # reset_index returns a new frame, so the assignments below do not leak
    # into the caller's `test`.
    test = test.reset_index(drop=True)
    test["target_var_detrend"] = test["target_var"] - test["target_var_trend"]

    sequence = train_detrended.tolist() + test["target_var_detrend"].tolist()
    n_steps = train.shape[0]

    # Window i covers sequence[i : i + n_steps]; its target is the next value.
    n_windows = len(sequence) - n_steps
    X = array([sequence[start:start + n_steps] for start in range(n_windows)])
    y = array(sequence[n_steps:])

    # Keras LSTM layers expect input shaped (samples, timesteps, features).
    n_features = 1
    X = X.reshape((X.shape[0], X.shape[1], n_features))

    model = Sequential()
    model.add(layers.LSTM(50, activation='relu', input_shape=(n_steps, n_features)))
    model.add(layers.Dense(1))
    model.compile(optimizer='adam', loss='mse')
    model.fit(X, y, epochs=1000, verbose=0)

    yhat = model.predict(X, verbose=0)
    yhat = yhat.reshape(yhat.shape[0])

    test['target_var_lstm'] = test["target_var_trend"] + yhat

    return test